// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using plain C++.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors. It
 * provides a scalar fallback for VLA code, primarily useful for debugging VLA
 * algorithms without the complexity of handling SIMD. Only the baseline level
 * of functionality needed to support VLA is provided.
 *
 * Note that the vector conditional operators implemented by this module are
 * designed to behave like SIMD conditional operators that generate lane masks.
 * Rather than returning 0/1 booleans like normal C++ code they will return
 * 0/-1 to give a full lane-width bitmask.
 *
 * Note that the documentation for this module still talks about "vectors" to
 * help developers think about the implied VLA behavior when writing optimized
 * paths.
 */

#ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
#define ASTC_VECMATHLIB_NONE_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cfenv>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with wider VLA vectors if data is
	 * aligned to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m[0] = a;
		m[1] = a;
		m[2] = a;
		m[3] = a;
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		m[0] = a;
		m[1] = b;
		m[2] = c;
		m[3] = d;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return m[l];
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
		m[l] = a;
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(0.0f);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
	{
		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
	}

	/**
	 * @brief Return a swizzled float 2.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
	}

	/**
	 * @brief Return a swizzled float 3.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
	}

	/**
	 * @brief Return a swizzled float 4.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
	}

	/**
	 * @brief The vector lane data.
	 */
	float m[4];
};
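
// Illustrative usage (added commentary, not part of the upstream API surface):
// lane indices are template parameters, so lane selection is resolved at
// compile time. The values shown follow from the definitions above.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     float x = v.lane<2>();        // 3.0f
//     v.set_lane<0>(9.0f);          // v is now (9, 2, 3, 4)
//     vfloat4 s = v.swz<1, 0>();    // (2, 9, 0, 0); unused lanes are zeroed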

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using vint4::loada() which is better with wider VLA vectors
	 * if data is aligned.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		m[0] = a;
		m[1] = b;
		m[2] = c;
		m[3] = d;
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using vint4::zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m[0] = a;
		m[1] = a;
		m[2] = a;
		m[3] = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return m[l];
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
		m[l] = a;
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(0);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
	{
		vint4 data;
		std::memcpy(&data.m, p, 4 * sizeof(int));
		return data;
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		return vint4(0, 1, 2, 3);
	}

	/**
	 * @brief The vector lane data.
	 */
	int m[4];
};

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing mask value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(int* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		m[0] = a == false ? 0 : -1;
		m[1] = a == false ? 0 : -1;
		m[2] = a == false ? 0 : -1;
		m[3] = a == false ? 0 : -1;
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		m[0] = a == false ? 0 : -1;
		m[1] = b == false ? 0 : -1;
		m[2] = c == false ? 0 : -1;
		m[3] = d == false ? 0 : -1;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return m[l] != 0;
	}

	/**
	 * @brief The vector lane data.
	 */
	int m[4];
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] | b.m[0],
	              a.m[1] | b.m[1],
	              a.m[2] | b.m[2],
	              a.m[3] | b.m[3]);
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] & b.m[0],
	              a.m[1] & b.m[1],
	              a.m[2] & b.m[2],
	              a.m[3] & b.m[3]);
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] ^ b.m[0],
	              a.m[1] ^ b.m[1],
	              a.m[2] ^ b.m[2],
	              a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(~a.m[0],
	              ~a.m[1],
	              ~a.m[2],
	              ~a.m[3]);
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
	return ((a.m[0] >> 31) & 0x1) |
	       ((a.m[1] >> 30) & 0x2) |
	       ((a.m[2] >> 29) & 0x4) |
	       ((a.m[3] >> 28) & 0x8);
}
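
// Illustrative note (added commentary): mask lanes are stored as 0 or -1, so
// bitwise operators compose masks directly, and the comparison operators on
// vint4 and vfloat4 defined below produce masks in the same form. The values
// shown follow from the definitions above.
//
//     vmask4 m1(true, false, false, true);
//     vmask4 m2(false, false, true, true);
//     unsigned int bits = mask(m1 | m2);   // 0b1101 (lanes 0, 2, and 3 set)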

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(a.m[0] + b.m[0],
	             a.m[1] + b.m[1],
	             a.m[2] + b.m[2],
	             a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(a.m[0] - b.m[0],
	             a.m[1] - b.m[1],
	             a.m[2] - b.m[2],
	             a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
	return vint4(a.m[0] * b.m[0],
	             a.m[1] * b.m[1],
	             a.m[2] * b.m[2],
	             a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(~a.m[0],
	             ~a.m[1],
	             ~a.m[2],
	             ~a.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(a.m[0] | b.m[0],
	             a.m[1] | b.m[1],
	             a.m[2] | b.m[2],
	             a.m[3] | b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(a.m[0] & b.m[0],
	             a.m[1] & b.m[1],
	             a.m[2] & b.m[2],
	             a.m[3] & b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(a.m[0] ^ b.m[0],
	             a.m[1] ^ b.m[1],
	             a.m[2] ^ b.m[2],
	             a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(a.m[0] == b.m[0],
	              a.m[1] == b.m[1],
	              a.m[2] == b.m[2],
	              a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return vmask4(a.m[0] != b.m[0],
	              a.m[1] != b.m[1],
	              a.m[2] != b.m[2],
	              a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(a.m[0] < b.m[0],
	              a.m[1] < b.m[1],
	              a.m[2] < b.m[2],
	              a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(a.m[0] > b.m[0],
	              a.m[1] > b.m[1],
	              a.m[2] > b.m[2],
	              a.m[3] > b.m[3]);
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	return vint4(a.m[0] << s,
	             a.m[1] << s,
	             a.m[2] << s,
	             a.m[3] << s);
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
	unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
	unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
	unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;

	return vint4(static_cast<int>(as0),
	             static_cast<int>(as1),
	             static_cast<int>(as2),
	             static_cast<int>(as3));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(a.m[0] >> s,
	             a.m[1] >> s,
	             a.m[2] >> s,
	             a.m[3] >> s);
}
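
// Illustrative note (added commentary): lsr() shifts in zero bits while asr()
// replicates the sign bit, so the two only differ for negative lanes. The
// values shown follow from the definitions above.
//
//     vint4 v(-8, 8, -1, 1);
//     vint4 l = lsr<1>(v);   // (0x7FFFFFFC, 4, 0x7FFFFFFF, 0)
//     vint4 s = asr<1>(v);   // (-4, 4, -1, 0)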

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
	return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
	             a.m[1] < b.m[1] ? a.m[1] : b.m[1],
	             a.m[2] < b.m[2] ? a.m[2] : b.m[2],
	             a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
	return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
	             a.m[1] > b.m[1] ? a.m[1] : b.m[1],
	             a.m[2] > b.m[2] ? a.m[2] : b.m[2],
	             a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the horizontal minimum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	int b = std::min(a.m[0], a.m[1]);
	int c = std::min(a.m[2], a.m[3]);
	return vint4(std::min(b, c));
}

/**
 * @brief Return the horizontal maximum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	int b = std::max(a.m[0], a.m[1]);
	int c = std::max(a.m[2], a.m[3]);
	return vint4(std::max(b, c));
}

/**
 * @brief Return the horizontal sum of vector lanes as a scalar.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
	return a.m[0] + a.m[1] + a.m[2] + a.m[3];
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	p[0] = a.m[0];
	p[1] = a.m[1];
	p[2] = a.m[2];
	p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	p[0] = a.m[0];
	p[1] = a.m[1];
	p[2] = a.m[2];
	p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
	return vint4(base[indices.m[0]],
	             base[indices.m[1]],
	             base[indices.m[2]],
	             base[indices.m[3]]);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
	int b0 = a.m[0] & 0xFF;
	int b1 = a.m[1] & 0xFF;
	int b2 = a.m[2] & 0xFF;
	int b3 = a.m[3] & 0xFF;

	int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
	return vint4(b, 0, 0, 0);
}
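
// Illustrative pattern (added commentary; one plausible way to combine the
// helpers above, not a usage taken from this file): packing the low byte of
// each lane and then storing the low four bytes writes one byte per lane.
// The byte order shown assumes a little-endian target.
//
//     vint4 v(0x11, 0x22, 0x33, 0x44);
//     uint8_t out[4];
//     store_nbytes(pack_low_bytes(v), out);   // out = { 0x11, 0x22, 0x33, 0x44 }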

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
	             (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
	             (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
	             (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] + b.m[0],
	               a.m[1] + b.m[1],
	               a.m[2] + b.m[2],
	               a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] - b.m[0],
	               a.m[1] - b.m[1],
	               a.m[2] - b.m[2],
	               a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] * b.m[0],
	               a.m[1] * b.m[1],
	               a.m[2] * b.m[2],
	               a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] / b.m[0],
	               a.m[1] / b.m[1],
	               a.m[2] / b.m[2],
	               a.m[3] / b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] == b.m[0],
	              a.m[1] == b.m[1],
	              a.m[2] == b.m[2],
	              a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] != b.m[0],
	              a.m[1] != b.m[1],
	              a.m[2] != b.m[2],
	              a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] < b.m[0],
	              a.m[1] < b.m[1],
	              a.m[2] < b.m[2],
	              a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] > b.m[0],
	              a.m[1] > b.m[1],
	              a.m[2] > b.m[2],
	              a.m[3] > b.m[3]);
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] <= b.m[0],
	              a.m[1] <= b.m[1],
	              a.m[2] <= b.m[2],
	              a.m[3] <= b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] >= b.m[0],
	              a.m[1] >= b.m[1],
	              a.m[2] >= b.m[2],
	              a.m[3] >= b.m[3]);
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane, because
 * the comparison is false for NaN.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
	               a.m[1] < b.m[1] ? a.m[1] : b.m[1],
	               a.m[2] < b.m[2] ? a.m[2] : b.m[2],
	               a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane, because
 * the comparison is false for NaN.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
	               a.m[1] > b.m[1] ? a.m[1] : b.m[1],
	               a.m[2] > b.m[2] ? a.m[2] : b.m[2],
	               a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	return vfloat4(std::abs(a.m[0]),
	               std::abs(a.m[1]),
	               std::abs(a.m[2]),
	               std::abs(a.m[3]));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
	assert(std::fegetround() == FE_TONEAREST);
	return vfloat4(std::nearbyint(a.m[0]),
	               std::nearbyint(a.m[1]),
	               std::nearbyint(a.m[2]),
	               std::nearbyint(a.m[3]));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	float tmp1 = std::min(a.m[0], a.m[1]);
	float tmp2 = std::min(a.m[2], a.m[3]);
	return vfloat4(std::min(tmp1, tmp2));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	float tmp1 = std::max(a.m[0], a.m[1]);
	float tmp2 = std::max(a.m[2], a.m[3]);
	return vfloat4(std::max(tmp1, tmp2));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Use halving add, gives invariance with SIMD versions
	return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(std::sqrt(a.m[0]),
	               std::sqrt(a.m[1]),
	               std::sqrt(a.m[2]),
	               std::sqrt(a.m[3]));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
	return vfloat4(base[indices.m[0]],
	               base[indices.m[1]],
	               base[indices.m[2]],
	               base[indices.m[3]]);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr)
{
	ptr[0] = a.m[0];
	ptr[1] = a.m[1];
	ptr[2] = a.m[2];
	ptr[3] = a.m[3];
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
{
	ptr[0] = a.m[0];
	ptr[1] = a.m[1];
	ptr[2] = a.m[2];
	ptr[3] = a.m[3];
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(static_cast<int>(a.m[0]),
	             static_cast<int>(a.m[1]),
	             static_cast<int>(a.m[2]),
	             static_cast<int>(a.m[3]));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
	a = a + vfloat4(0.5f);
	return vint4(static_cast<int>(a.m[0]),
	             static_cast<int>(a.m[1]),
	             static_cast<int>(a.m[2]),
	             static_cast<int>(a.m[3]));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(static_cast<float>(a.m[0]),
	               static_cast<float>(a.m[1]),
	               static_cast<float>(a.m[2]),
	               static_cast<float>(a.m[3]));
}
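
// Illustrative note (added commentary): float_to_int() truncates toward zero,
// while float_to_int_rtn() adds 0.5 and then truncates. The values shown
// follow from the definitions above.
//
//     float_to_int(vfloat4(1.7f));       // all lanes 1
//     float_to_int_rtn(vfloat4(1.7f));   // all lanes 2 (1.7 + 0.5 = 2.2, truncated)
//     float_to_int_rtn(vfloat4(-0.7f));  // all lanes 0; the +0.5 scheme only
//                                        // rounds to nearest for values >= 0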

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
	return vint4(
		float_to_sf16(a.lane<0>()),
		float_to_sf16(a.lane<1>()),
		float_to_sf16(a.lane<2>()),
		float_to_sf16(a.lane<3>()));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
	return float_to_sf16(a);
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
	return vfloat4(
		sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
	return sf16_to_float(a);
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	vint4 r;
	std::memcpy(r.m, a.m, 4 * 4);
	return r;
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
{
	vfloat4 r;
	std::memcpy(r.m, a.m, 4 * 4);
	return r;
}
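
// Illustrative example (added commentary): the bit-cast pair above enables
// IEEE 754 bit tricks without changing any bits. For example, clearing the
// sign bit of each lane yields the absolute value.
//
//     vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//     vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);   // clear sign bits
//     vfloat4 av = int_as_float(bits);                     // (1, 2, 3, 4)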

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
	t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
	t0p = t0;
	t1p = t1;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
	t0p = t0;
	t1p = t1;
	t2p = t2;
	t3p = t3;
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
	uint8_t table[16];

	std::memcpy(table + 0, t0.m, 4 * sizeof(int));

	return vint4(table[idx.lane<0>()],
	             table[idx.lane<1>()],
	             table[idx.lane<2>()],
	             table[idx.lane<3>()]);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
	uint8_t table[32];

	std::memcpy(table + 0, t0.m, 4 * sizeof(int));
	std::memcpy(table + 16, t1.m, 4 * sizeof(int));

	return vint4(table[idx.lane<0>()],
	             table[idx.lane<1>()],
	             table[idx.lane<2>()],
	             table[idx.lane<3>()]);
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
	uint8_t table[64];

	std::memcpy(table + 0, t0.m, 4 * sizeof(int));
	std::memcpy(table + 16, t1.m, 4 * sizeof(int));
	std::memcpy(table + 32, t2.m, 4 * sizeof(int));
	std::memcpy(table + 48, t3.m, 4 * sizeof(int));

	return vint4(table[idx.lane<0>()],
	             table[idx.lane<1>()],
	             table[idx.lane<2>()],
	             table[idx.lane<3>()]);
}

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * Input is a byte array of at least 4 bytes per unmasked entry.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
	if (mask.m[3])
	{
		store(data, base);
	}
	else if (mask.m[2])
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
		store_lane(base + 8, data.lane<2>());
	}
	else if (mask.m[1])
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
	}
	else if (mask.m[0])
	{
		store_lane(base + 0, data.lane<0>());
	}
}

#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED