1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0 2cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 3cc1dc7a3Sopenharmony_ci// Copyright 2019-2024 Arm Limited 4cc1dc7a3Sopenharmony_ci// 5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy 7cc1dc7a3Sopenharmony_ci// of the License at: 8cc1dc7a3Sopenharmony_ci// 9cc1dc7a3Sopenharmony_ci// http://www.apache.org/licenses/LICENSE-2.0 10cc1dc7a3Sopenharmony_ci// 11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software 12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations 15cc1dc7a3Sopenharmony_ci// under the License. 16cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 17cc1dc7a3Sopenharmony_ci 18cc1dc7a3Sopenharmony_ci/** 19cc1dc7a3Sopenharmony_ci * @brief 8x32-bit vectors, implemented using AVX2. 20cc1dc7a3Sopenharmony_ci * 21cc1dc7a3Sopenharmony_ci * This module implements 8-wide 32-bit float, int, and mask vectors for x86 22cc1dc7a3Sopenharmony_ci * AVX2. 23cc1dc7a3Sopenharmony_ci * 24cc1dc7a3Sopenharmony_ci * There is a baseline level of functionality provided by all vector widths and 25cc1dc7a3Sopenharmony_ci * implementations. This is implemented using identical function signatures, 26cc1dc7a3Sopenharmony_ci * modulo data type, so we can use them as substitutable implementations in VLA 27cc1dc7a3Sopenharmony_ci * code. 
28cc1dc7a3Sopenharmony_ci */ 29cc1dc7a3Sopenharmony_ci 30cc1dc7a3Sopenharmony_ci#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED 31cc1dc7a3Sopenharmony_ci#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED 32cc1dc7a3Sopenharmony_ci 33cc1dc7a3Sopenharmony_ci#ifndef ASTCENC_SIMD_INLINE 34cc1dc7a3Sopenharmony_ci #error "Include astcenc_vecmathlib.h, do not include directly" 35cc1dc7a3Sopenharmony_ci#endif 36cc1dc7a3Sopenharmony_ci 37cc1dc7a3Sopenharmony_ci#include <cstdio> 38cc1dc7a3Sopenharmony_ci 39cc1dc7a3Sopenharmony_ci// Define convenience intrinsics that are missing on older compilers 40cc1dc7a3Sopenharmony_ci#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1) 41cc1dc7a3Sopenharmony_ci 42cc1dc7a3Sopenharmony_ci// ============================================================================ 43cc1dc7a3Sopenharmony_ci// vfloat8 data type 44cc1dc7a3Sopenharmony_ci// ============================================================================ 45cc1dc7a3Sopenharmony_ci 46cc1dc7a3Sopenharmony_ci/** 47cc1dc7a3Sopenharmony_ci * @brief Data type for 8-wide floats. 48cc1dc7a3Sopenharmony_ci */ 49cc1dc7a3Sopenharmony_cistruct vfloat8 50cc1dc7a3Sopenharmony_ci{ 51cc1dc7a3Sopenharmony_ci /** 52cc1dc7a3Sopenharmony_ci * @brief Construct from zero-initialized value. 53cc1dc7a3Sopenharmony_ci */ 54cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE vfloat8() = default; 55cc1dc7a3Sopenharmony_ci 56cc1dc7a3Sopenharmony_ci /** 57cc1dc7a3Sopenharmony_ci * @brief Construct from 4 values loaded from an unaligned address. 58cc1dc7a3Sopenharmony_ci * 59cc1dc7a3Sopenharmony_ci * Consider using loada() which is better with vectors if data is aligned 60cc1dc7a3Sopenharmony_ci * to vector length. 
61cc1dc7a3Sopenharmony_ci */ 62cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vfloat8(const float *p) 63cc1dc7a3Sopenharmony_ci { 64cc1dc7a3Sopenharmony_ci m = _mm256_loadu_ps(p); 65cc1dc7a3Sopenharmony_ci } 66cc1dc7a3Sopenharmony_ci 67cc1dc7a3Sopenharmony_ci /** 68cc1dc7a3Sopenharmony_ci * @brief Construct from 1 scalar value replicated across all lanes. 69cc1dc7a3Sopenharmony_ci * 70cc1dc7a3Sopenharmony_ci * Consider using zero() for constexpr zeros. 71cc1dc7a3Sopenharmony_ci */ 72cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vfloat8(float a) 73cc1dc7a3Sopenharmony_ci { 74cc1dc7a3Sopenharmony_ci m = _mm256_set1_ps(a); 75cc1dc7a3Sopenharmony_ci } 76cc1dc7a3Sopenharmony_ci 77cc1dc7a3Sopenharmony_ci /** 78cc1dc7a3Sopenharmony_ci * @brief Construct from 8 scalar values. 79cc1dc7a3Sopenharmony_ci * 80cc1dc7a3Sopenharmony_ci * The value of @c a is stored to lane 0 (LSB) in the SIMD register. 81cc1dc7a3Sopenharmony_ci */ 82cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vfloat8( 83cc1dc7a3Sopenharmony_ci float a, float b, float c, float d, 84cc1dc7a3Sopenharmony_ci float e, float f, float g, float h) 85cc1dc7a3Sopenharmony_ci { 86cc1dc7a3Sopenharmony_ci m = _mm256_set_ps(h, g, f, e, d, c, b, a); 87cc1dc7a3Sopenharmony_ci } 88cc1dc7a3Sopenharmony_ci 89cc1dc7a3Sopenharmony_ci /** 90cc1dc7a3Sopenharmony_ci * @brief Construct from an existing SIMD register. 91cc1dc7a3Sopenharmony_ci */ 92cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a) 93cc1dc7a3Sopenharmony_ci { 94cc1dc7a3Sopenharmony_ci m = a; 95cc1dc7a3Sopenharmony_ci } 96cc1dc7a3Sopenharmony_ci 97cc1dc7a3Sopenharmony_ci /** 98cc1dc7a3Sopenharmony_ci * @brief Get the scalar value of a single lane. 
99cc1dc7a3Sopenharmony_ci */ 100cc1dc7a3Sopenharmony_ci template <int l> ASTCENC_SIMD_INLINE float lane() const 101cc1dc7a3Sopenharmony_ci { 102cc1dc7a3Sopenharmony_ci #if !defined(__clang__) && defined(_MSC_VER) 103cc1dc7a3Sopenharmony_ci return m.m256_f32[l]; 104cc1dc7a3Sopenharmony_ci #else 105cc1dc7a3Sopenharmony_ci union { __m256 m; float f[8]; } cvt; 106cc1dc7a3Sopenharmony_ci cvt.m = m; 107cc1dc7a3Sopenharmony_ci return cvt.f[l]; 108cc1dc7a3Sopenharmony_ci #endif 109cc1dc7a3Sopenharmony_ci } 110cc1dc7a3Sopenharmony_ci 111cc1dc7a3Sopenharmony_ci /** 112cc1dc7a3Sopenharmony_ci * @brief Factory that returns a vector of zeros. 113cc1dc7a3Sopenharmony_ci */ 114cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vfloat8 zero() 115cc1dc7a3Sopenharmony_ci { 116cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_setzero_ps()); 117cc1dc7a3Sopenharmony_ci } 118cc1dc7a3Sopenharmony_ci 119cc1dc7a3Sopenharmony_ci /** 120cc1dc7a3Sopenharmony_ci * @brief Factory that returns a replicated scalar loaded from memory. 121cc1dc7a3Sopenharmony_ci */ 122cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p) 123cc1dc7a3Sopenharmony_ci { 124cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_broadcast_ss(p)); 125cc1dc7a3Sopenharmony_ci } 126cc1dc7a3Sopenharmony_ci 127cc1dc7a3Sopenharmony_ci /** 128cc1dc7a3Sopenharmony_ci * @brief Factory that returns a vector loaded from 32B aligned memory. 129cc1dc7a3Sopenharmony_ci */ 130cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p) 131cc1dc7a3Sopenharmony_ci { 132cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_load_ps(p)); 133cc1dc7a3Sopenharmony_ci } 134cc1dc7a3Sopenharmony_ci 135cc1dc7a3Sopenharmony_ci /** 136cc1dc7a3Sopenharmony_ci * @brief Factory that returns a vector containing the lane IDs. 
137cc1dc7a3Sopenharmony_ci */ 138cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vfloat8 lane_id() 139cc1dc7a3Sopenharmony_ci { 140cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0)); 141cc1dc7a3Sopenharmony_ci } 142cc1dc7a3Sopenharmony_ci 143cc1dc7a3Sopenharmony_ci /** 144cc1dc7a3Sopenharmony_ci * @brief The vector ... 145cc1dc7a3Sopenharmony_ci */ 146cc1dc7a3Sopenharmony_ci __m256 m; 147cc1dc7a3Sopenharmony_ci}; 148cc1dc7a3Sopenharmony_ci 149cc1dc7a3Sopenharmony_ci// ============================================================================ 150cc1dc7a3Sopenharmony_ci// vint8 data type 151cc1dc7a3Sopenharmony_ci// ============================================================================ 152cc1dc7a3Sopenharmony_ci 153cc1dc7a3Sopenharmony_ci/** 154cc1dc7a3Sopenharmony_ci * @brief Data type for 8-wide ints. 155cc1dc7a3Sopenharmony_ci */ 156cc1dc7a3Sopenharmony_cistruct vint8 157cc1dc7a3Sopenharmony_ci{ 158cc1dc7a3Sopenharmony_ci /** 159cc1dc7a3Sopenharmony_ci * @brief Construct from zero-initialized value. 160cc1dc7a3Sopenharmony_ci */ 161cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE vint8() = default; 162cc1dc7a3Sopenharmony_ci 163cc1dc7a3Sopenharmony_ci /** 164cc1dc7a3Sopenharmony_ci * @brief Construct from 8 values loaded from an unaligned address. 165cc1dc7a3Sopenharmony_ci * 166cc1dc7a3Sopenharmony_ci * Consider using loada() which is better with vectors if data is aligned 167cc1dc7a3Sopenharmony_ci * to vector length. 168cc1dc7a3Sopenharmony_ci */ 169cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vint8(const int *p) 170cc1dc7a3Sopenharmony_ci { 171cc1dc7a3Sopenharmony_ci m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p)); 172cc1dc7a3Sopenharmony_ci } 173cc1dc7a3Sopenharmony_ci 174cc1dc7a3Sopenharmony_ci /** 175cc1dc7a3Sopenharmony_ci * @brief Construct from 8 uint8_t loaded from an unaligned address. 
176cc1dc7a3Sopenharmony_ci */ 177cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p) 178cc1dc7a3Sopenharmony_ci { 179cc1dc7a3Sopenharmony_ci // _mm_loadu_si64 would be nicer syntax, but missing on older GCC 180cc1dc7a3Sopenharmony_ci m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p))); 181cc1dc7a3Sopenharmony_ci } 182cc1dc7a3Sopenharmony_ci 183cc1dc7a3Sopenharmony_ci /** 184cc1dc7a3Sopenharmony_ci * @brief Construct from 1 scalar value replicated across all lanes. 185cc1dc7a3Sopenharmony_ci * 186cc1dc7a3Sopenharmony_ci * Consider using vfloat4::zero() for constexpr zeros. 187cc1dc7a3Sopenharmony_ci */ 188cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vint8(int a) 189cc1dc7a3Sopenharmony_ci { 190cc1dc7a3Sopenharmony_ci m = _mm256_set1_epi32(a); 191cc1dc7a3Sopenharmony_ci } 192cc1dc7a3Sopenharmony_ci 193cc1dc7a3Sopenharmony_ci /** 194cc1dc7a3Sopenharmony_ci * @brief Construct from 8 scalar values. 195cc1dc7a3Sopenharmony_ci * 196cc1dc7a3Sopenharmony_ci * The value of @c a is stored to lane 0 (LSB) in the SIMD register. 197cc1dc7a3Sopenharmony_ci */ 198cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vint8( 199cc1dc7a3Sopenharmony_ci int a, int b, int c, int d, 200cc1dc7a3Sopenharmony_ci int e, int f, int g, int h) 201cc1dc7a3Sopenharmony_ci { 202cc1dc7a3Sopenharmony_ci m = _mm256_set_epi32(h, g, f, e, d, c, b, a); 203cc1dc7a3Sopenharmony_ci } 204cc1dc7a3Sopenharmony_ci 205cc1dc7a3Sopenharmony_ci /** 206cc1dc7a3Sopenharmony_ci * @brief Construct from an existing SIMD register. 207cc1dc7a3Sopenharmony_ci */ 208cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vint8(__m256i a) 209cc1dc7a3Sopenharmony_ci { 210cc1dc7a3Sopenharmony_ci m = a; 211cc1dc7a3Sopenharmony_ci } 212cc1dc7a3Sopenharmony_ci 213cc1dc7a3Sopenharmony_ci /** 214cc1dc7a3Sopenharmony_ci * @brief Get the scalar from a single lane. 
215cc1dc7a3Sopenharmony_ci */ 216cc1dc7a3Sopenharmony_ci template <int l> ASTCENC_SIMD_INLINE int lane() const 217cc1dc7a3Sopenharmony_ci { 218cc1dc7a3Sopenharmony_ci #if !defined(__clang__) && defined(_MSC_VER) 219cc1dc7a3Sopenharmony_ci return m.m256i_i32[l]; 220cc1dc7a3Sopenharmony_ci #else 221cc1dc7a3Sopenharmony_ci union { __m256i m; int f[8]; } cvt; 222cc1dc7a3Sopenharmony_ci cvt.m = m; 223cc1dc7a3Sopenharmony_ci return cvt.f[l]; 224cc1dc7a3Sopenharmony_ci #endif 225cc1dc7a3Sopenharmony_ci } 226cc1dc7a3Sopenharmony_ci 227cc1dc7a3Sopenharmony_ci /** 228cc1dc7a3Sopenharmony_ci * @brief Factory that returns a vector of zeros. 229cc1dc7a3Sopenharmony_ci */ 230cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vint8 zero() 231cc1dc7a3Sopenharmony_ci { 232cc1dc7a3Sopenharmony_ci return vint8(_mm256_setzero_si256()); 233cc1dc7a3Sopenharmony_ci } 234cc1dc7a3Sopenharmony_ci 235cc1dc7a3Sopenharmony_ci /** 236cc1dc7a3Sopenharmony_ci * @brief Factory that returns a replicated scalar loaded from memory. 237cc1dc7a3Sopenharmony_ci */ 238cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vint8 load1(const int* p) 239cc1dc7a3Sopenharmony_ci { 240cc1dc7a3Sopenharmony_ci __m128i a = _mm_set1_epi32(*p); 241cc1dc7a3Sopenharmony_ci return vint8(_mm256_broadcastd_epi32(a)); 242cc1dc7a3Sopenharmony_ci } 243cc1dc7a3Sopenharmony_ci 244cc1dc7a3Sopenharmony_ci /** 245cc1dc7a3Sopenharmony_ci * @brief Factory that returns a vector loaded from unaligned memory. 246cc1dc7a3Sopenharmony_ci */ 247cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p) 248cc1dc7a3Sopenharmony_ci { 249cc1dc7a3Sopenharmony_ci return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p))); 250cc1dc7a3Sopenharmony_ci } 251cc1dc7a3Sopenharmony_ci 252cc1dc7a3Sopenharmony_ci /** 253cc1dc7a3Sopenharmony_ci * @brief Factory that returns a vector loaded from 32B aligned memory. 
254cc1dc7a3Sopenharmony_ci */ 255cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vint8 loada(const int* p) 256cc1dc7a3Sopenharmony_ci { 257cc1dc7a3Sopenharmony_ci return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p))); 258cc1dc7a3Sopenharmony_ci } 259cc1dc7a3Sopenharmony_ci 260cc1dc7a3Sopenharmony_ci /** 261cc1dc7a3Sopenharmony_ci * @brief Factory that returns a vector containing the lane IDs. 262cc1dc7a3Sopenharmony_ci */ 263cc1dc7a3Sopenharmony_ci static ASTCENC_SIMD_INLINE vint8 lane_id() 264cc1dc7a3Sopenharmony_ci { 265cc1dc7a3Sopenharmony_ci return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); 266cc1dc7a3Sopenharmony_ci } 267cc1dc7a3Sopenharmony_ci 268cc1dc7a3Sopenharmony_ci /** 269cc1dc7a3Sopenharmony_ci * @brief The vector ... 270cc1dc7a3Sopenharmony_ci */ 271cc1dc7a3Sopenharmony_ci __m256i m; 272cc1dc7a3Sopenharmony_ci}; 273cc1dc7a3Sopenharmony_ci 274cc1dc7a3Sopenharmony_ci// ============================================================================ 275cc1dc7a3Sopenharmony_ci// vmask8 data type 276cc1dc7a3Sopenharmony_ci// ============================================================================ 277cc1dc7a3Sopenharmony_ci 278cc1dc7a3Sopenharmony_ci/** 279cc1dc7a3Sopenharmony_ci * @brief Data type for 8-wide control plane masks. 280cc1dc7a3Sopenharmony_ci */ 281cc1dc7a3Sopenharmony_cistruct vmask8 282cc1dc7a3Sopenharmony_ci{ 283cc1dc7a3Sopenharmony_ci /** 284cc1dc7a3Sopenharmony_ci * @brief Construct from an existing SIMD register. 285cc1dc7a3Sopenharmony_ci */ 286cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vmask8(__m256 a) 287cc1dc7a3Sopenharmony_ci { 288cc1dc7a3Sopenharmony_ci m = a; 289cc1dc7a3Sopenharmony_ci } 290cc1dc7a3Sopenharmony_ci 291cc1dc7a3Sopenharmony_ci /** 292cc1dc7a3Sopenharmony_ci * @brief Construct from an existing SIMD register. 
293cc1dc7a3Sopenharmony_ci */ 294cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vmask8(__m256i a) 295cc1dc7a3Sopenharmony_ci { 296cc1dc7a3Sopenharmony_ci m = _mm256_castsi256_ps(a); 297cc1dc7a3Sopenharmony_ci } 298cc1dc7a3Sopenharmony_ci 299cc1dc7a3Sopenharmony_ci /** 300cc1dc7a3Sopenharmony_ci * @brief Construct from 1 scalar value. 301cc1dc7a3Sopenharmony_ci */ 302cc1dc7a3Sopenharmony_ci ASTCENC_SIMD_INLINE explicit vmask8(bool a) 303cc1dc7a3Sopenharmony_ci { 304cc1dc7a3Sopenharmony_ci vint8 mask(a == false ? 0 : -1); 305cc1dc7a3Sopenharmony_ci m = _mm256_castsi256_ps(mask.m); 306cc1dc7a3Sopenharmony_ci } 307cc1dc7a3Sopenharmony_ci 308cc1dc7a3Sopenharmony_ci /** 309cc1dc7a3Sopenharmony_ci * @brief The vector ... 310cc1dc7a3Sopenharmony_ci */ 311cc1dc7a3Sopenharmony_ci __m256 m; 312cc1dc7a3Sopenharmony_ci}; 313cc1dc7a3Sopenharmony_ci 314cc1dc7a3Sopenharmony_ci// ============================================================================ 315cc1dc7a3Sopenharmony_ci// vmask8 operators and functions 316cc1dc7a3Sopenharmony_ci// ============================================================================ 317cc1dc7a3Sopenharmony_ci 318cc1dc7a3Sopenharmony_ci/** 319cc1dc7a3Sopenharmony_ci * @brief Overload: mask union (or). 320cc1dc7a3Sopenharmony_ci */ 321cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b) 322cc1dc7a3Sopenharmony_ci{ 323cc1dc7a3Sopenharmony_ci return vmask8(_mm256_or_ps(a.m, b.m)); 324cc1dc7a3Sopenharmony_ci} 325cc1dc7a3Sopenharmony_ci 326cc1dc7a3Sopenharmony_ci/** 327cc1dc7a3Sopenharmony_ci * @brief Overload: mask intersect (and). 328cc1dc7a3Sopenharmony_ci */ 329cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b) 330cc1dc7a3Sopenharmony_ci{ 331cc1dc7a3Sopenharmony_ci return vmask8(_mm256_and_ps(a.m, b.m)); 332cc1dc7a3Sopenharmony_ci} 333cc1dc7a3Sopenharmony_ci 334cc1dc7a3Sopenharmony_ci/** 335cc1dc7a3Sopenharmony_ci * @brief Overload: mask difference (xor). 
336cc1dc7a3Sopenharmony_ci */ 337cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b) 338cc1dc7a3Sopenharmony_ci{ 339cc1dc7a3Sopenharmony_ci return vmask8(_mm256_xor_ps(a.m, b.m)); 340cc1dc7a3Sopenharmony_ci} 341cc1dc7a3Sopenharmony_ci 342cc1dc7a3Sopenharmony_ci/** 343cc1dc7a3Sopenharmony_ci * @brief Overload: mask invert (not). 344cc1dc7a3Sopenharmony_ci */ 345cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a) 346cc1dc7a3Sopenharmony_ci{ 347cc1dc7a3Sopenharmony_ci return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1))); 348cc1dc7a3Sopenharmony_ci} 349cc1dc7a3Sopenharmony_ci 350cc1dc7a3Sopenharmony_ci/** 351cc1dc7a3Sopenharmony_ci * @brief Return a 8-bit mask code indicating mask status. 352cc1dc7a3Sopenharmony_ci * 353cc1dc7a3Sopenharmony_ci * bit0 = lane 0 354cc1dc7a3Sopenharmony_ci */ 355cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int mask(vmask8 a) 356cc1dc7a3Sopenharmony_ci{ 357cc1dc7a3Sopenharmony_ci return static_cast<unsigned int>(_mm256_movemask_ps(a.m)); 358cc1dc7a3Sopenharmony_ci} 359cc1dc7a3Sopenharmony_ci 360cc1dc7a3Sopenharmony_ci/** 361cc1dc7a3Sopenharmony_ci * @brief True if any lanes are enabled, false otherwise. 362cc1dc7a3Sopenharmony_ci */ 363cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE bool any(vmask8 a) 364cc1dc7a3Sopenharmony_ci{ 365cc1dc7a3Sopenharmony_ci return mask(a) != 0; 366cc1dc7a3Sopenharmony_ci} 367cc1dc7a3Sopenharmony_ci 368cc1dc7a3Sopenharmony_ci/** 369cc1dc7a3Sopenharmony_ci * @brief True if all lanes are enabled, false otherwise. 
370cc1dc7a3Sopenharmony_ci */ 371cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE bool all(vmask8 a) 372cc1dc7a3Sopenharmony_ci{ 373cc1dc7a3Sopenharmony_ci return mask(a) == 0xFF; 374cc1dc7a3Sopenharmony_ci} 375cc1dc7a3Sopenharmony_ci 376cc1dc7a3Sopenharmony_ci// ============================================================================ 377cc1dc7a3Sopenharmony_ci// vint8 operators and functions 378cc1dc7a3Sopenharmony_ci// ============================================================================ 379cc1dc7a3Sopenharmony_ci/** 380cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector addition. 381cc1dc7a3Sopenharmony_ci */ 382cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b) 383cc1dc7a3Sopenharmony_ci{ 384cc1dc7a3Sopenharmony_ci return vint8(_mm256_add_epi32(a.m, b.m)); 385cc1dc7a3Sopenharmony_ci} 386cc1dc7a3Sopenharmony_ci 387cc1dc7a3Sopenharmony_ci/** 388cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector incremental addition. 389cc1dc7a3Sopenharmony_ci */ 390cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b) 391cc1dc7a3Sopenharmony_ci{ 392cc1dc7a3Sopenharmony_ci a = a + b; 393cc1dc7a3Sopenharmony_ci return a; 394cc1dc7a3Sopenharmony_ci} 395cc1dc7a3Sopenharmony_ci 396cc1dc7a3Sopenharmony_ci/** 397cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector subtraction. 398cc1dc7a3Sopenharmony_ci */ 399cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b) 400cc1dc7a3Sopenharmony_ci{ 401cc1dc7a3Sopenharmony_ci return vint8(_mm256_sub_epi32(a.m, b.m)); 402cc1dc7a3Sopenharmony_ci} 403cc1dc7a3Sopenharmony_ci 404cc1dc7a3Sopenharmony_ci/** 405cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector multiplication. 
406cc1dc7a3Sopenharmony_ci */ 407cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b) 408cc1dc7a3Sopenharmony_ci{ 409cc1dc7a3Sopenharmony_ci return vint8(_mm256_mullo_epi32(a.m, b.m)); 410cc1dc7a3Sopenharmony_ci} 411cc1dc7a3Sopenharmony_ci 412cc1dc7a3Sopenharmony_ci/** 413cc1dc7a3Sopenharmony_ci * @brief Overload: vector bit invert. 414cc1dc7a3Sopenharmony_ci */ 415cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 operator~(vint8 a) 416cc1dc7a3Sopenharmony_ci{ 417cc1dc7a3Sopenharmony_ci return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1))); 418cc1dc7a3Sopenharmony_ci} 419cc1dc7a3Sopenharmony_ci 420cc1dc7a3Sopenharmony_ci/** 421cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector bitwise or. 422cc1dc7a3Sopenharmony_ci */ 423cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b) 424cc1dc7a3Sopenharmony_ci{ 425cc1dc7a3Sopenharmony_ci return vint8(_mm256_or_si256(a.m, b.m)); 426cc1dc7a3Sopenharmony_ci} 427cc1dc7a3Sopenharmony_ci 428cc1dc7a3Sopenharmony_ci/** 429cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector bitwise and. 430cc1dc7a3Sopenharmony_ci */ 431cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b) 432cc1dc7a3Sopenharmony_ci{ 433cc1dc7a3Sopenharmony_ci return vint8(_mm256_and_si256(a.m, b.m)); 434cc1dc7a3Sopenharmony_ci} 435cc1dc7a3Sopenharmony_ci 436cc1dc7a3Sopenharmony_ci/** 437cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector bitwise xor. 438cc1dc7a3Sopenharmony_ci */ 439cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b) 440cc1dc7a3Sopenharmony_ci{ 441cc1dc7a3Sopenharmony_ci return vint8(_mm256_xor_si256(a.m, b.m)); 442cc1dc7a3Sopenharmony_ci} 443cc1dc7a3Sopenharmony_ci 444cc1dc7a3Sopenharmony_ci/** 445cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector equality. 
446cc1dc7a3Sopenharmony_ci */ 447cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b) 448cc1dc7a3Sopenharmony_ci{ 449cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmpeq_epi32(a.m, b.m)); 450cc1dc7a3Sopenharmony_ci} 451cc1dc7a3Sopenharmony_ci 452cc1dc7a3Sopenharmony_ci/** 453cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector inequality. 454cc1dc7a3Sopenharmony_ci */ 455cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b) 456cc1dc7a3Sopenharmony_ci{ 457cc1dc7a3Sopenharmony_ci return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m)); 458cc1dc7a3Sopenharmony_ci} 459cc1dc7a3Sopenharmony_ci 460cc1dc7a3Sopenharmony_ci/** 461cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector less than. 462cc1dc7a3Sopenharmony_ci */ 463cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b) 464cc1dc7a3Sopenharmony_ci{ 465cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmpgt_epi32(b.m, a.m)); 466cc1dc7a3Sopenharmony_ci} 467cc1dc7a3Sopenharmony_ci 468cc1dc7a3Sopenharmony_ci/** 469cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector greater than. 470cc1dc7a3Sopenharmony_ci */ 471cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b) 472cc1dc7a3Sopenharmony_ci{ 473cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmpgt_epi32(a.m, b.m)); 474cc1dc7a3Sopenharmony_ci} 475cc1dc7a3Sopenharmony_ci 476cc1dc7a3Sopenharmony_ci/** 477cc1dc7a3Sopenharmony_ci * @brief Logical shift left. 478cc1dc7a3Sopenharmony_ci */ 479cc1dc7a3Sopenharmony_citemplate <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a) 480cc1dc7a3Sopenharmony_ci{ 481cc1dc7a3Sopenharmony_ci return vint8(_mm256_slli_epi32(a.m, s)); 482cc1dc7a3Sopenharmony_ci} 483cc1dc7a3Sopenharmony_ci 484cc1dc7a3Sopenharmony_ci/** 485cc1dc7a3Sopenharmony_ci * @brief Arithmetic shift right. 
486cc1dc7a3Sopenharmony_ci */ 487cc1dc7a3Sopenharmony_citemplate <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a) 488cc1dc7a3Sopenharmony_ci{ 489cc1dc7a3Sopenharmony_ci return vint8(_mm256_srai_epi32(a.m, s)); 490cc1dc7a3Sopenharmony_ci} 491cc1dc7a3Sopenharmony_ci 492cc1dc7a3Sopenharmony_ci/** 493cc1dc7a3Sopenharmony_ci * @brief Logical shift right. 494cc1dc7a3Sopenharmony_ci */ 495cc1dc7a3Sopenharmony_citemplate <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a) 496cc1dc7a3Sopenharmony_ci{ 497cc1dc7a3Sopenharmony_ci return vint8(_mm256_srli_epi32(a.m, s)); 498cc1dc7a3Sopenharmony_ci} 499cc1dc7a3Sopenharmony_ci 500cc1dc7a3Sopenharmony_ci/** 501cc1dc7a3Sopenharmony_ci * @brief Return the min vector of two vectors. 502cc1dc7a3Sopenharmony_ci */ 503cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b) 504cc1dc7a3Sopenharmony_ci{ 505cc1dc7a3Sopenharmony_ci return vint8(_mm256_min_epi32(a.m, b.m)); 506cc1dc7a3Sopenharmony_ci} 507cc1dc7a3Sopenharmony_ci 508cc1dc7a3Sopenharmony_ci/** 509cc1dc7a3Sopenharmony_ci * @brief Return the max vector of two vectors. 510cc1dc7a3Sopenharmony_ci */ 511cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b) 512cc1dc7a3Sopenharmony_ci{ 513cc1dc7a3Sopenharmony_ci return vint8(_mm256_max_epi32(a.m, b.m)); 514cc1dc7a3Sopenharmony_ci} 515cc1dc7a3Sopenharmony_ci 516cc1dc7a3Sopenharmony_ci/** 517cc1dc7a3Sopenharmony_ci * @brief Return the horizontal minimum of a vector. 
518cc1dc7a3Sopenharmony_ci */ 519cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 hmin(vint8 a) 520cc1dc7a3Sopenharmony_ci{ 521cc1dc7a3Sopenharmony_ci __m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1)); 522cc1dc7a3Sopenharmony_ci m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2))); 523cc1dc7a3Sopenharmony_ci m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1))); 524cc1dc7a3Sopenharmony_ci m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0)); 525cc1dc7a3Sopenharmony_ci 526cc1dc7a3Sopenharmony_ci __m256i r = astcenc_mm256_set_m128i(m, m); 527cc1dc7a3Sopenharmony_ci vint8 vmin(r); 528cc1dc7a3Sopenharmony_ci return vmin; 529cc1dc7a3Sopenharmony_ci} 530cc1dc7a3Sopenharmony_ci 531cc1dc7a3Sopenharmony_ci/** 532cc1dc7a3Sopenharmony_ci * @brief Return the horizontal maximum of a vector. 533cc1dc7a3Sopenharmony_ci */ 534cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 hmax(vint8 a) 535cc1dc7a3Sopenharmony_ci{ 536cc1dc7a3Sopenharmony_ci __m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1)); 537cc1dc7a3Sopenharmony_ci m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2))); 538cc1dc7a3Sopenharmony_ci m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1))); 539cc1dc7a3Sopenharmony_ci m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0)); 540cc1dc7a3Sopenharmony_ci 541cc1dc7a3Sopenharmony_ci __m256i r = astcenc_mm256_set_m128i(m, m); 542cc1dc7a3Sopenharmony_ci vint8 vmax(r); 543cc1dc7a3Sopenharmony_ci return vmax; 544cc1dc7a3Sopenharmony_ci} 545cc1dc7a3Sopenharmony_ci 546cc1dc7a3Sopenharmony_ci/** 547cc1dc7a3Sopenharmony_ci * @brief Store a vector to a 16B aligned memory address. 
548cc1dc7a3Sopenharmony_ci */ 549cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void storea(vint8 a, int* p) 550cc1dc7a3Sopenharmony_ci{ 551cc1dc7a3Sopenharmony_ci _mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m); 552cc1dc7a3Sopenharmony_ci} 553cc1dc7a3Sopenharmony_ci 554cc1dc7a3Sopenharmony_ci/** 555cc1dc7a3Sopenharmony_ci * @brief Store a vector to an unaligned memory address. 556cc1dc7a3Sopenharmony_ci */ 557cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void store(vint8 a, int* p) 558cc1dc7a3Sopenharmony_ci{ 559cc1dc7a3Sopenharmony_ci _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m); 560cc1dc7a3Sopenharmony_ci} 561cc1dc7a3Sopenharmony_ci 562cc1dc7a3Sopenharmony_ci/** 563cc1dc7a3Sopenharmony_ci * @brief Store lowest N (vector width) bytes into an unaligned address. 564cc1dc7a3Sopenharmony_ci */ 565cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p) 566cc1dc7a3Sopenharmony_ci{ 567cc1dc7a3Sopenharmony_ci // This is the most logical implementation, but the convenience intrinsic 568cc1dc7a3Sopenharmony_ci // is missing on older compilers (supported in g++ 9 and clang++ 9). 569cc1dc7a3Sopenharmony_ci // _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0)) 570cc1dc7a3Sopenharmony_ci _mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0)); 571cc1dc7a3Sopenharmony_ci} 572cc1dc7a3Sopenharmony_ci 573cc1dc7a3Sopenharmony_ci/** 574cc1dc7a3Sopenharmony_ci * @brief Gather N (vector width) indices from the array. 575cc1dc7a3Sopenharmony_ci */ 576cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices) 577cc1dc7a3Sopenharmony_ci{ 578cc1dc7a3Sopenharmony_ci return vint8(_mm256_i32gather_epi32(base, indices.m, 4)); 579cc1dc7a3Sopenharmony_ci} 580cc1dc7a3Sopenharmony_ci 581cc1dc7a3Sopenharmony_ci/** 582cc1dc7a3Sopenharmony_ci * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. 
583cc1dc7a3Sopenharmony_ci */ 584cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v) 585cc1dc7a3Sopenharmony_ci{ 586cc1dc7a3Sopenharmony_ci __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 587cc1dc7a3Sopenharmony_ci 0, 0, 0, 0, 28, 24, 20, 16, 588cc1dc7a3Sopenharmony_ci 0, 0, 0, 0, 0, 0, 0, 0, 589cc1dc7a3Sopenharmony_ci 0, 0, 0, 0, 12, 8, 4, 0); 590cc1dc7a3Sopenharmony_ci __m256i a = _mm256_shuffle_epi8(v.m, shuf); 591cc1dc7a3Sopenharmony_ci __m128i a0 = _mm256_extracti128_si256(a, 0); 592cc1dc7a3Sopenharmony_ci __m128i a1 = _mm256_extracti128_si256(a, 1); 593cc1dc7a3Sopenharmony_ci __m128i b = _mm_unpacklo_epi32(a0, a1); 594cc1dc7a3Sopenharmony_ci 595cc1dc7a3Sopenharmony_ci __m256i r = astcenc_mm256_set_m128i(b, b); 596cc1dc7a3Sopenharmony_ci return vint8(r); 597cc1dc7a3Sopenharmony_ci} 598cc1dc7a3Sopenharmony_ci 599cc1dc7a3Sopenharmony_ci/** 600cc1dc7a3Sopenharmony_ci * @brief Return lanes from @c b if @c cond is set, else @c a. 601cc1dc7a3Sopenharmony_ci */ 602cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond) 603cc1dc7a3Sopenharmony_ci{ 604cc1dc7a3Sopenharmony_ci __m256i condi = _mm256_castps_si256(cond.m); 605cc1dc7a3Sopenharmony_ci return vint8(_mm256_blendv_epi8(a.m, b.m, condi)); 606cc1dc7a3Sopenharmony_ci} 607cc1dc7a3Sopenharmony_ci 608cc1dc7a3Sopenharmony_ci// ============================================================================ 609cc1dc7a3Sopenharmony_ci// vfloat4 operators and functions 610cc1dc7a3Sopenharmony_ci// ============================================================================ 611cc1dc7a3Sopenharmony_ci 612cc1dc7a3Sopenharmony_ci/** 613cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector addition. 
614cc1dc7a3Sopenharmony_ci */ 615cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b) 616cc1dc7a3Sopenharmony_ci{ 617cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_add_ps(a.m, b.m)); 618cc1dc7a3Sopenharmony_ci} 619cc1dc7a3Sopenharmony_ci 620cc1dc7a3Sopenharmony_ci/** 621cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector incremental addition. 622cc1dc7a3Sopenharmony_ci */ 623cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b) 624cc1dc7a3Sopenharmony_ci{ 625cc1dc7a3Sopenharmony_ci a = a + b; 626cc1dc7a3Sopenharmony_ci return a; 627cc1dc7a3Sopenharmony_ci} 628cc1dc7a3Sopenharmony_ci 629cc1dc7a3Sopenharmony_ci/** 630cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector subtraction. 631cc1dc7a3Sopenharmony_ci */ 632cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b) 633cc1dc7a3Sopenharmony_ci{ 634cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_sub_ps(a.m, b.m)); 635cc1dc7a3Sopenharmony_ci} 636cc1dc7a3Sopenharmony_ci 637cc1dc7a3Sopenharmony_ci/** 638cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector multiplication. 639cc1dc7a3Sopenharmony_ci */ 640cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b) 641cc1dc7a3Sopenharmony_ci{ 642cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_mul_ps(a.m, b.m)); 643cc1dc7a3Sopenharmony_ci} 644cc1dc7a3Sopenharmony_ci 645cc1dc7a3Sopenharmony_ci/** 646cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar multiplication. 647cc1dc7a3Sopenharmony_ci */ 648cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b) 649cc1dc7a3Sopenharmony_ci{ 650cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b))); 651cc1dc7a3Sopenharmony_ci} 652cc1dc7a3Sopenharmony_ci 653cc1dc7a3Sopenharmony_ci/** 654cc1dc7a3Sopenharmony_ci * @brief Overload: scalar by vector multiplication. 
655cc1dc7a3Sopenharmony_ci */ 656cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b) 657cc1dc7a3Sopenharmony_ci{ 658cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m)); 659cc1dc7a3Sopenharmony_ci} 660cc1dc7a3Sopenharmony_ci 661cc1dc7a3Sopenharmony_ci/** 662cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector division. 663cc1dc7a3Sopenharmony_ci */ 664cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b) 665cc1dc7a3Sopenharmony_ci{ 666cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_div_ps(a.m, b.m)); 667cc1dc7a3Sopenharmony_ci} 668cc1dc7a3Sopenharmony_ci 669cc1dc7a3Sopenharmony_ci/** 670cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar division. 671cc1dc7a3Sopenharmony_ci */ 672cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b) 673cc1dc7a3Sopenharmony_ci{ 674cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b))); 675cc1dc7a3Sopenharmony_ci} 676cc1dc7a3Sopenharmony_ci 677cc1dc7a3Sopenharmony_ci 678cc1dc7a3Sopenharmony_ci/** 679cc1dc7a3Sopenharmony_ci * @brief Overload: scalar by vector division. 680cc1dc7a3Sopenharmony_ci */ 681cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b) 682cc1dc7a3Sopenharmony_ci{ 683cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m)); 684cc1dc7a3Sopenharmony_ci} 685cc1dc7a3Sopenharmony_ci 686cc1dc7a3Sopenharmony_ci 687cc1dc7a3Sopenharmony_ci/** 688cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector equality. 689cc1dc7a3Sopenharmony_ci */ 690cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b) 691cc1dc7a3Sopenharmony_ci{ 692cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ)); 693cc1dc7a3Sopenharmony_ci} 694cc1dc7a3Sopenharmony_ci 695cc1dc7a3Sopenharmony_ci/** 696cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector inequality. 
697cc1dc7a3Sopenharmony_ci */ 698cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b) 699cc1dc7a3Sopenharmony_ci{ 700cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ)); 701cc1dc7a3Sopenharmony_ci} 702cc1dc7a3Sopenharmony_ci 703cc1dc7a3Sopenharmony_ci/** 704cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector less than. 705cc1dc7a3Sopenharmony_ci */ 706cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b) 707cc1dc7a3Sopenharmony_ci{ 708cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ)); 709cc1dc7a3Sopenharmony_ci} 710cc1dc7a3Sopenharmony_ci 711cc1dc7a3Sopenharmony_ci/** 712cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector greater than. 713cc1dc7a3Sopenharmony_ci */ 714cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b) 715cc1dc7a3Sopenharmony_ci{ 716cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ)); 717cc1dc7a3Sopenharmony_ci} 718cc1dc7a3Sopenharmony_ci 719cc1dc7a3Sopenharmony_ci/** 720cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector less than or equal. 721cc1dc7a3Sopenharmony_ci */ 722cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b) 723cc1dc7a3Sopenharmony_ci{ 724cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ)); 725cc1dc7a3Sopenharmony_ci} 726cc1dc7a3Sopenharmony_ci 727cc1dc7a3Sopenharmony_ci/** 728cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector greater than or equal. 729cc1dc7a3Sopenharmony_ci */ 730cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b) 731cc1dc7a3Sopenharmony_ci{ 732cc1dc7a3Sopenharmony_ci return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ)); 733cc1dc7a3Sopenharmony_ci} 734cc1dc7a3Sopenharmony_ci 735cc1dc7a3Sopenharmony_ci/** 736cc1dc7a3Sopenharmony_ci * @brief Return the min vector of two vectors. 
737cc1dc7a3Sopenharmony_ci * 738cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane. 739cc1dc7a3Sopenharmony_ci */ 740cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b) 741cc1dc7a3Sopenharmony_ci{ 742cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_min_ps(a.m, b.m)); 743cc1dc7a3Sopenharmony_ci} 744cc1dc7a3Sopenharmony_ci 745cc1dc7a3Sopenharmony_ci/** 746cc1dc7a3Sopenharmony_ci * @brief Return the min vector of a vector and a scalar. 747cc1dc7a3Sopenharmony_ci * 748cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane. 749cc1dc7a3Sopenharmony_ci */ 750cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b) 751cc1dc7a3Sopenharmony_ci{ 752cc1dc7a3Sopenharmony_ci return min(a, vfloat8(b)); 753cc1dc7a3Sopenharmony_ci} 754cc1dc7a3Sopenharmony_ci 755cc1dc7a3Sopenharmony_ci/** 756cc1dc7a3Sopenharmony_ci * @brief Return the max vector of two vectors. 757cc1dc7a3Sopenharmony_ci * 758cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane. 759cc1dc7a3Sopenharmony_ci */ 760cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b) 761cc1dc7a3Sopenharmony_ci{ 762cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_max_ps(a.m, b.m)); 763cc1dc7a3Sopenharmony_ci} 764cc1dc7a3Sopenharmony_ci 765cc1dc7a3Sopenharmony_ci/** 766cc1dc7a3Sopenharmony_ci * @brief Return the max vector of a vector and a scalar. 767cc1dc7a3Sopenharmony_ci * 768cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane. 769cc1dc7a3Sopenharmony_ci */ 770cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b) 771cc1dc7a3Sopenharmony_ci{ 772cc1dc7a3Sopenharmony_ci return max(a, vfloat8(b)); 773cc1dc7a3Sopenharmony_ci} 774cc1dc7a3Sopenharmony_ci 775cc1dc7a3Sopenharmony_ci/** 776cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between min and max. 
/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
	// Do not reorder - maxps/minps return the second operand if either input
	// is NaN, so applying the lower bound first resolves NaN lanes to min
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and max.
 *
 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
 * be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
	// Do not reorder - NaN lanes resolve to zero in the first max operation
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}
/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
	// Do not reorder - NaN lanes resolve to zero in the first max operation
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
	return a;
}

/**
 * @brief Return the absolute value of the float vector.
 *
 * Implemented by masking off the sign bit, so -0.0 maps to +0.0 and NaN
 * payloads pass through unchanged.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
	return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 *
 * Uses the round-to-nearest-even rounding mode, without raising exceptions.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat8(_mm256_round_ps(a.m, flags));
}

/**
 * @brief Return the horizontal minimum of a vector.
 *
 * The minimum is broadcast across all lanes of the returned vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
	// Fold the upper 128-bit half onto the lower half
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vlow = _mm_min_ps(vlow, vhigh);

	// Then do a 4-wide horizontal reduction: pairwise min, then fold the
	// upper pair down onto lane 0
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 mins = _mm_min_ps(vlow, shuf);
	shuf = _mm_movehl_ps(shuf, mins);
	mins = _mm_min_ss(mins, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256 r = _mm256_set_m128(mins, mins)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

	// Broadcast lane 0 of each 128-bit half to all four of its lanes
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
	return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector.
 *
 * The maximum is broadcast across all lanes of the returned vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
	// Fold the upper 128-bit half onto the lower half
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vhigh = _mm_max_ps(vlow, vhigh);

	// Then do a 4-wide horizontal reduction: pairwise max, then fold the
	// upper pair down onto lane 0
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 maxs = _mm_max_ps(vhigh, shuf);
	shuf = _mm_movehl_ps(shuf, maxs);
	maxs = _mm_max_ss(maxs, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	//__m256 r = _mm256_set_m128(maxs, maxs)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);

	// Broadcast lane 0 of each 128-bit half to all four of its lanes
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
	return hmax(a).lane<0>();
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
	// Two sequential 4-wide adds gives invariance with 4-wide code
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	return hadd_s(lo) + hadd_s(hi);
}
906cc1dc7a3Sopenharmony_ci */ 907cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) 908cc1dc7a3Sopenharmony_ci{ 909cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); 910cc1dc7a3Sopenharmony_ci} 911cc1dc7a3Sopenharmony_ci 912cc1dc7a3Sopenharmony_ci/** 913cc1dc7a3Sopenharmony_ci * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. 914cc1dc7a3Sopenharmony_ci */ 915cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond) 916cc1dc7a3Sopenharmony_ci{ 917cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); 918cc1dc7a3Sopenharmony_ci} 919cc1dc7a3Sopenharmony_ci 920cc1dc7a3Sopenharmony_ci/** 921cc1dc7a3Sopenharmony_ci * @brief Accumulate lane-wise sums for a vector, folded 4-wide. 922cc1dc7a3Sopenharmony_ci * 923cc1dc7a3Sopenharmony_ci * This is invariant with 4-wide implementations. 924cc1dc7a3Sopenharmony_ci */ 925cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a) 926cc1dc7a3Sopenharmony_ci{ 927cc1dc7a3Sopenharmony_ci vfloat4 lo(_mm256_extractf128_ps(a.m, 0)); 928cc1dc7a3Sopenharmony_ci haccumulate(accum, lo); 929cc1dc7a3Sopenharmony_ci 930cc1dc7a3Sopenharmony_ci vfloat4 hi(_mm256_extractf128_ps(a.m, 1)); 931cc1dc7a3Sopenharmony_ci haccumulate(accum, hi); 932cc1dc7a3Sopenharmony_ci} 933cc1dc7a3Sopenharmony_ci 934cc1dc7a3Sopenharmony_ci/** 935cc1dc7a3Sopenharmony_ci * @brief Accumulate lane-wise sums for a vector. 936cc1dc7a3Sopenharmony_ci * 937cc1dc7a3Sopenharmony_ci * This is NOT invariant with 4-wide implementations. 938cc1dc7a3Sopenharmony_ci */ 939cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a) 940cc1dc7a3Sopenharmony_ci{ 941cc1dc7a3Sopenharmony_ci accum += a; 942cc1dc7a3Sopenharmony_ci} 943cc1dc7a3Sopenharmony_ci 944cc1dc7a3Sopenharmony_ci/** 945cc1dc7a3Sopenharmony_ci * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide. 
946cc1dc7a3Sopenharmony_ci * 947cc1dc7a3Sopenharmony_ci * This is invariant with 4-wide implementations. 948cc1dc7a3Sopenharmony_ci */ 949cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m) 950cc1dc7a3Sopenharmony_ci{ 951cc1dc7a3Sopenharmony_ci a = select(vfloat8::zero(), a, m); 952cc1dc7a3Sopenharmony_ci haccumulate(accum, a); 953cc1dc7a3Sopenharmony_ci} 954cc1dc7a3Sopenharmony_ci 955cc1dc7a3Sopenharmony_ci/** 956cc1dc7a3Sopenharmony_ci * @brief Accumulate masked lane-wise sums for a vector. 957cc1dc7a3Sopenharmony_ci * 958cc1dc7a3Sopenharmony_ci * This is NOT invariant with 4-wide implementations. 959cc1dc7a3Sopenharmony_ci */ 960cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m) 961cc1dc7a3Sopenharmony_ci{ 962cc1dc7a3Sopenharmony_ci a = select(vfloat8::zero(), a, m); 963cc1dc7a3Sopenharmony_ci haccumulate(accum, a); 964cc1dc7a3Sopenharmony_ci} 965cc1dc7a3Sopenharmony_ci 966cc1dc7a3Sopenharmony_ci/** 967cc1dc7a3Sopenharmony_ci * @brief Return the sqrt of the lanes in the vector. 968cc1dc7a3Sopenharmony_ci */ 969cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a) 970cc1dc7a3Sopenharmony_ci{ 971cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_sqrt_ps(a.m)); 972cc1dc7a3Sopenharmony_ci} 973cc1dc7a3Sopenharmony_ci 974cc1dc7a3Sopenharmony_ci/** 975cc1dc7a3Sopenharmony_ci * @brief Load a vector of gathered results from an array; 976cc1dc7a3Sopenharmony_ci */ 977cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices) 978cc1dc7a3Sopenharmony_ci{ 979cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_i32gather_ps(base, indices.m, 4)); 980cc1dc7a3Sopenharmony_ci} 981cc1dc7a3Sopenharmony_ci 982cc1dc7a3Sopenharmony_ci/** 983cc1dc7a3Sopenharmony_ci * @brief Store a vector to an unaligned memory address. 
/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
	_mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
	_mm256_store_ps(p, a.m);
}

/**
 * @brief Return a integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a integer value for a float vector, using round-to-nearest.
 *
 * Implemented as add-0.5-then-truncate, which rounds halfway cases upwards
 * for non-negative inputs. NOTE(review): this is not round-to-nearest for
 * negative inputs (e.g. -0.7 + 0.5 truncates to 0) — presumably callers only
 * pass non-negative values; confirm at call sites.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
	a = a + vfloat8(0.5f);
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
	return vfloat8(_mm256_cvtepi32_ps(a.m));
}
1026cc1dc7a3Sopenharmony_ci * 1027cc1dc7a3Sopenharmony_ci * It is a common trick to convert floats into integer bit patterns, perform 1028cc1dc7a3Sopenharmony_ci * some bit hackery based on knowledge they are IEEE 754 layout, and then 1029cc1dc7a3Sopenharmony_ci * convert them back again. This is the first half of that flip. 1030cc1dc7a3Sopenharmony_ci */ 1031cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a) 1032cc1dc7a3Sopenharmony_ci{ 1033cc1dc7a3Sopenharmony_ci return vint8(_mm256_castps_si256(a.m)); 1034cc1dc7a3Sopenharmony_ci} 1035cc1dc7a3Sopenharmony_ci 1036cc1dc7a3Sopenharmony_ci/** 1037cc1dc7a3Sopenharmony_ci * @brief Return a integer value as a float bit pattern (i.e. no conversion). 1038cc1dc7a3Sopenharmony_ci * 1039cc1dc7a3Sopenharmony_ci * It is a common trick to convert floats into integer bit patterns, perform 1040cc1dc7a3Sopenharmony_ci * some bit hackery based on knowledge they are IEEE 754 layout, and then 1041cc1dc7a3Sopenharmony_ci * convert them back again. This is the second half of that flip. 1042cc1dc7a3Sopenharmony_ci */ 1043cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) 1044cc1dc7a3Sopenharmony_ci{ 1045cc1dc7a3Sopenharmony_ci return vfloat8(_mm256_castsi256_ps(a.m)); 1046cc1dc7a3Sopenharmony_ci} 1047cc1dc7a3Sopenharmony_ci 1048cc1dc7a3Sopenharmony_ci/** 1049cc1dc7a3Sopenharmony_ci * @brief Prepare a vtable lookup table for use with the native SIMD size. 1050cc1dc7a3Sopenharmony_ci */ 1051cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) 1052cc1dc7a3Sopenharmony_ci{ 1053cc1dc7a3Sopenharmony_ci // AVX2 duplicates the table within each 128-bit lane 1054cc1dc7a3Sopenharmony_ci __m128i t0n = t0.m; 1055cc1dc7a3Sopenharmony_ci t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); 1056cc1dc7a3Sopenharmony_ci} 1057cc1dc7a3Sopenharmony_ci 1058cc1dc7a3Sopenharmony_ci/** 1059cc1dc7a3Sopenharmony_ci * @brief Prepare a vtable lookup table for use with the native SIMD size. 
/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 *
 * Each 16-byte table is duplicated into both 128-bit halves, as AVX2 byte
 * shuffles operate within each half independently. The second table is
 * stored as an XOR delta against the first; vtable_8bt_32bi() XORs the
 * per-table shuffle results together, and the delta encoding makes the
 * unwanted terms cancel out.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	// Store as an XOR delta relative to the previous table
	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 *
 * Each 16-byte table is duplicated into both 128-bit halves, as AVX2 byte
 * shuffles operate within each half independently. Tables after the first
 * are stored as XOR deltas against their predecessor; vtable_8bt_32bi()
 * XORs the per-table shuffle results together, and the delta encoding
 * telescopes so the unwanted terms cancel out.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	// Store each subsequent table as an XOR delta against its predecessor
	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));

	__m128i t2n = _mm_xor_si128(t1.m, t2.m);
	t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));

	__m128i t3n = _mm_xor_si128(t2.m, t3.m);
	t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
}
/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 *
 * Index values are assumed to be in the range [0, 15].
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero;
	// only the low byte of each 32-bit lane remains a live index
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	return vint8(result);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 *
 * Tables must have been processed by vtable_prepare(), which stores @c t1
 * as an XOR delta (t0 ^ t1). For an index below 16 the rebased index goes
 * negative (MSB set) so the second shuffle yields zero, leaving the t0
 * value. For an index of 16 or more the first shuffle reads t0 at
 * (index % 16) and XOR-ing in the delta cancels the t0 term, leaving the
 * t1 value.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	return vint8(result);
}
/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 *
 * Tables must have been processed by vtable_prepare(), which stores each
 * table after the first as an XOR delta against its predecessor. Each stage
 * rebases the index by 16 and XORs in the next delta table's shuffle
 * result; out-of-range indices (MSB set after rebasing) contribute zero,
 * and the in-range delta terms telescope so only the value from the table
 * actually addressed by the index survives.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t2.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t3.m, idxx);
	result = _mm256_xor_si256(result, result2);

	return vint8(result);
}
/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane, with R in
 * the least significant byte.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
	// Addition acts as bitwise OR here because the shifted byte fields
	// cannot overlap (high bits of the inputs are zero)
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * Lanes whose mask MSB is set are written; memory for the remaining lanes
 * is left untouched.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
{
	// Reinterpret the float-typed mask for the integer masked store
	_mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
}

/**
 * @brief Debug function to print a vector of ints, in decimal.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);
	printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}
1181cc1dc7a3Sopenharmony_ci */ 1182cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void printx(vint8 a) 1183cc1dc7a3Sopenharmony_ci{ 1184cc1dc7a3Sopenharmony_ci alignas(32) int v[8]; 1185cc1dc7a3Sopenharmony_ci storea(a, v); 1186cc1dc7a3Sopenharmony_ci printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n", 1187cc1dc7a3Sopenharmony_ci v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); 1188cc1dc7a3Sopenharmony_ci} 1189cc1dc7a3Sopenharmony_ci 1190cc1dc7a3Sopenharmony_ci/** 1191cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of floats. 1192cc1dc7a3Sopenharmony_ci */ 1193cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vfloat8 a) 1194cc1dc7a3Sopenharmony_ci{ 1195cc1dc7a3Sopenharmony_ci alignas(32) float v[8]; 1196cc1dc7a3Sopenharmony_ci storea(a, v); 1197cc1dc7a3Sopenharmony_ci printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n", 1198cc1dc7a3Sopenharmony_ci static_cast<double>(v[0]), static_cast<double>(v[1]), 1199cc1dc7a3Sopenharmony_ci static_cast<double>(v[2]), static_cast<double>(v[3]), 1200cc1dc7a3Sopenharmony_ci static_cast<double>(v[4]), static_cast<double>(v[5]), 1201cc1dc7a3Sopenharmony_ci static_cast<double>(v[6]), static_cast<double>(v[7])); 1202cc1dc7a3Sopenharmony_ci} 1203cc1dc7a3Sopenharmony_ci 1204cc1dc7a3Sopenharmony_ci/** 1205cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of masks. 1206cc1dc7a3Sopenharmony_ci */ 1207cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vmask8 a) 1208cc1dc7a3Sopenharmony_ci{ 1209cc1dc7a3Sopenharmony_ci print(select(vint8(0), vint8(1), a)); 1210cc1dc7a3Sopenharmony_ci} 1211cc1dc7a3Sopenharmony_ci 1212cc1dc7a3Sopenharmony_ci#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED 1213