// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// Define convenience intrinsics that are missing on older compilers
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)

// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
	{
		m = _mm256_loadu_ps(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(float a)
	{
		m = _mm256_set1_ps(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(
		float a, float b, float c, float d,
		float e, float f, float g, float h)
	{
		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256_f32[l];
	#else
		union { __m256 m; float f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}
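
	// Illustrative note (a sketch, not part of the upstream API docs): lane 0
	// is the least-significant lane, so it matches the first constructor
	// argument and the first element in memory. For example:
	//
	//     vfloat8 v(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
	//     float first = v.lane<0>();   // 1.0f
	//     float last  = v.lane<7>();   // 8.0f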

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 zero()
	{
		return vfloat8(_mm256_setzero_ps());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
	{
		return vfloat8(_mm256_broadcast_ss(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
	{
		return vfloat8(_mm256_load_ps(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
	{
		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The underlying SIMD register.
	 */
	__m256 m;
};

// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint8() = default;

	/**
	 * @brief Construct from 8 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const int *p)
	{
		m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
	}

	/**
	 * @brief Construct from 8 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
	{
		// _mm_loadu_si64 would be nicer syntax, but missing on older GCC
		m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
	}
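
	// Illustrative note: the constructor above is a widening load - it reads
	// 8 contiguous bytes and zero-extends each one into its own 32-bit lane.
	// A sketch, assuming an 8-byte source buffer:
	//
	//     const uint8_t bytes[8] { 1, 2, 3, 4, 5, 6, 7, 8 };
	//     vint8 v(bytes);              // lanes hold 1, 2, 3, ..., 8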

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(int a)
	{
		m = _mm256_set1_epi32(a);
	}

	/**
	 * @brief Construct from 8 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(
		int a, int b, int c, int d,
		int e, int f, int g, int h)
	{
		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
	#if !defined(__clang__) && defined(_MSC_VER)
		return m.m256i_i32[l];
	#else
		union { __m256i m; int f[8]; } cvt;
		cvt.m = m;
		return cvt.f[l];
	#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint8 zero()
	{
		return vint8(_mm256_setzero_si256());
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
	{
		__m128i a = _mm_set1_epi32(*p);
		return vint8(_mm256_broadcastd_epi32(a));
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
	{
		return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector loaded from 32B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
	{
		return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint8 lane_id()
	{
		return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
	}

	/**
	 * @brief The underlying SIMD register.
	 */
	__m256i m;
};

// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
	{
		m = a;
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
	{
		m = _mm256_castsi256_ps(a);
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask8(bool a)
	{
		vint8 mask(a == false ? 0 : -1);
		m = _mm256_castsi256_ps(mask.m);
	}

	/**
	 * @brief The underlying SIMD register.
	 */
	__m256 m;
};

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
	return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
	return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
{
	return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
	return mask(a) == 0xFF;
}
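
// Illustrative note: mask() packs one bit per lane, with lane 0 in bit 0, so
// any() and all() reduce to simple integer tests. A sketch using the vint8
// comparison operators defined later in this header:
//
//     vmask8 m = vint8::lane_id() < vint8(3);   // lanes 0..2 enabled
//     unsigned int bits = mask(m);              // 0b00000111
//     bool some = any(m);                       // true
//     bool every = all(m);                      // false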

// ============================================================================
// vint8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
	return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
	return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
	return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
	return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
	return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
	return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
	return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
	return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
	return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
{
	return vint8(_mm256_slli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
	return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
	return vint8(_mm256_srli_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
	return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
	return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
	__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	__m256i r = astcenc_mm256_set_m128i(m, m);
	vint8 vmin(r);
	return vmin;
}
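
// Illustrative note: the reduction above folds the two 128-bit halves
// together, reduces the surviving four lanes with two shuffle/min steps, and
// broadcasts the winner, so every lane of the returned vector holds the
// minimum. A sketch of the expected behavior:
//
//     vint8 v(3, 7, 1, 9, 5, 2, 8, 4);
//     vint8 m = hmin(v);               // all eight lanes contain 1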

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
	__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
	m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
	m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));

	__m256i r = astcenc_mm256_set_m128i(m, m);
	vint8 vmax(r);
	return vmax;
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
	_mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
	_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
{
	__m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
	                               0, 0, 0, 0, 28, 24, 20, 16,
	                               0, 0, 0, 0, 0, 0, 0, 0,
	                               0, 0, 0, 0, 12, 8, 4, 0);
	__m256i a = _mm256_shuffle_epi8(v.m, shuf);
	__m128i a0 = _mm256_extracti128_si256(a, 0);
	__m128i a1 = _mm256_extracti128_si256(a, 1);
	__m128i b = _mm_unpacklo_epi32(a0, a1);

	__m256i r = astcenc_mm256_set_m128i(b, b);
	return vint8(r);
}
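
// Illustrative note: pack_low_bytes() pairs naturally with store_nbytes() to
// narrow eight 32-bit lanes into eight contiguous bytes. A sketch, assuming
// each lane of @c v already holds a value in the 0-255 range:
//
//     uint8_t out[8];
//     store_nbytes(pack_low_bytes(v), out);   // out[i] = low byte of lane i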

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
	__m256i condi = _mm256_castps_si256(cond.m);
	return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}

// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
	a = a + b;
	return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
	return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
	return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
	return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
	return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
	return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
{
	return min(a, vfloat8(b));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
	return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
{
	return max(a, vfloat8(b));
}
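
// Illustrative note: _mm256_min_ps() and _mm256_max_ps() return their second
// operand when either input is NaN, which is what the NaN behavior documented
// above and the operand ordering in clamp() below rely on. A sketch:
//
//     vfloat8 n(std::numeric_limits<float>::quiet_NaN());
//     vfloat8 r = min(n, vfloat8(1.0f));   // every lane is 1.0f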

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
	// Do not reorder - second operand will return if either is NaN
	a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and max.
 *
 * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
 * be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
	return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
	return a;
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
	__m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
	return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat8(_mm256_round_ps(a.m, flags));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vlow = _mm_min_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 mins = _mm_min_ps(vlow, shuf);
	shuf = _mm_movehl_ps(shuf, mins);
	mins = _mm_min_ss(mins, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// __m256 r = _mm256_set_m128(mins, mins)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);

	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
	return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
	__m128 vlow = _mm256_castps256_ps128(a.m);
	__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
	vhigh = _mm_max_ps(vlow, vhigh);

	// First do a horizontal reduction.
	__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 maxs = _mm_max_ps(vhigh, shuf);
	shuf = _mm_movehl_ps(shuf, maxs);
	maxs = _mm_max_ss(maxs, shuf);

	// This is the most logical implementation, but the convenience intrinsic
	// is missing on older compilers (supported in g++ 9 and clang++ 9).
	// __m256 r = _mm256_set_m128(maxs, maxs)
	__m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
	return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
	return hmax(a).lane<0>();
}
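
// Illustrative note: hadd_s() below deliberately sums the two 128-bit halves
// with two sequential 4-wide adds rather than one 8-wide tree reduction.
// Float addition is not associative, so this ordering keeps the result
// bit-identical to what a 4-wide vfloat4 build computes for the same eight
// values, preserving invariance across SIMD widths.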

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
	// Two sequential 4-wide adds gives invariance with 4-wide code
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	return hadd_s(lo) + hadd_s(hi);
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond)
{
	return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
	vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
	haccumulate(accum, lo);

	vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
	haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
	accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
	a = select(vfloat8::zero(), a, m);
	haccumulate(accum, a);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
	return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
	return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
	_mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
	_mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
	a = a + vfloat8(0.5f);
	return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
	return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
	return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
	return vfloat8(_mm256_castsi256_ps(a.m));
}
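
// Illustrative note: float_as_int() and int_as_float() are pure bit casts, so
// they compose with the vint8 bitwise operators for IEEE 754 bit tricks. A
// sketch that clears the sign bit, equivalent to the abs() defined earlier:
//
//     vfloat8 absval = int_as_float(float_as_int(a) & vint8(0x7fffffff));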

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
{
	// AVX2 duplicates the table within each 128-bit lane
	__m128i t0n = t0.m;
	t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));

	__m128i t1n = _mm_xor_si128(t0.m, t1.m);
	t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));

	__m128i t2n = _mm_xor_si128(t1.m, t2.m);
	t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));

	__m128i t3n = _mm_xor_si128(t2.m, t3.m);
	t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	return vint8(result);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	return vint8(result);
}
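
// Illustrative note on the table encoding: vtable_prepare() stores each table
// after the first as an XOR delta against its predecessor, and the
// vtable_8bt_32bi() overloads then XOR every partial shuffle result together
// unconditionally. Because _mm256_shuffle_epi8() selects bytes using only the
// low four index bits (with the MSB acting as a zeroing flag), an index that
// targets a later table re-selects the same byte position from every chunk,
// and the XOR chain cancels everything except the intended table. For
// example, for a lane index of 20 in the two-table lookup above: the first
// shuffle returns t0[4], the second returns (t0 ^ t1)[4], and the XOR of the
// two leaves t1[4].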

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
{
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t2.m, idxx);
	result = _mm256_xor_si256(result, result2);
	idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

	result2 = _mm256_shuffle_epi8(t3.m, idxx);
	result = _mm256_xor_si256(result, result2);

	return vint8(result);
}

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
{
	_mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
}

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);
	printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of ints, in hex.
 */
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
	alignas(32) int v[8];
	storea(a, v);
	printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
	       v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
	alignas(32) float v[8];
	storea(a, v);
	printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
	       static_cast<double>(v[0]), static_cast<double>(v[1]),
	       static_cast<double>(v[2]), static_cast<double>(v[3]),
	       static_cast<double>(v[4]), static_cast<double>(v[5]),
	       static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
	print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED