// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for
 * Armv8-A NEON.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */

#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>
#include <cstring>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead, which can be faster if the data is
	 * aligned to the vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
	{
		m = vld1q_f32(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m = vdupq_n_f32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		float v[4] { a, b, c, d };
		m = vld1q_f32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return vgetq_lane_f32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
		m = vsetq_lane_f32(a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(vdupq_n_f32(0.0f));
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(vld1q_dup_f32(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(vld1q_f32(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
	{
		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
		return vfloat4(vld1q_f32(data));
	}

	/**
	 * @brief Return a swizzled float 2.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
	}

	/**
	 * @brief Return a swizzled float 3.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
	}

	/**
	 * @brief Return a swizzled float 4.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
	}

	/**
	 * @brief The vector value.
	 */
	float32x4_t m;
};
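
// Illustrative usage sketch (comment only, not part of the library API):
// constructing a vfloat4, reading a lane, and swizzling. The values in the
// trailing comments follow from the constructor arguments shown here.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f); // lane 0 = 1.0, lane 3 = 4.0
//     float x = v.lane<0>();             // 1.0
//     vfloat4 r = v.swz<3, 2, 1, 0>();   // 4.0, 3.0, 2.0, 1.0
//     vfloat4 rg = v.swz<0, 1>();        // 1.0, 2.0, 0.0, 0.0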

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead, which can be faster if the data is
	 * aligned to the vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int* p)
	{
		m = vld1q_s32(p);
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t* p)
	{
		// Cast is safe - NEON loads are allowed to be unaligned
		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
		m = vreinterpretq_s32_u32(vmovl_u16(t16));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using vint4::zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m = vdupq_n_s32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		int v[4] { a, b, c, d };
		m = vld1q_s32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return vgetq_lane_s32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
		m = vsetq_lane_s32(a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(0);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
	{
		vint4 data;
		std::memcpy(&data.m, p, 4 * sizeof(int));
		return data;
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		alignas(16) static const int data[4] { 0, 1, 2, 3 };
		return vint4(vld1q_s32(data));
	}

	/**
	 * @brief The vector value.
	 */
	int32x4_t m;
};
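
// Illustrative usage sketch (comment only, not part of the library API):
// loading four packed bytes and widening them to 32-bit lanes.
//
//     const uint8_t rgba[4] { 0x10, 0x20, 0x30, 0xFF };
//     vint4 v(rgba);            // lanes hold 16, 32, 48, 255
//     int alpha = v.lane<3>();  // 255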

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
	{
		m = a;
	}

#if !defined(_MSC_VER)
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
	{
		m = vreinterpretq_u32_s32(a);
	}
#endif

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		int v[4] {
			a == true ? -1 : 0,
			b == true ? -1 : 0,
			c == true ? -1 : 0,
			d == true ? -1 : 0
		};

		int32x4_t ms = vld1q_s32(v);
		m = vreinterpretq_u32_s32(ms);
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
	{
		return vgetq_lane_u32(m, l) != 0;
	}

	/**
	 * @brief The vector value.
	 */
	uint32x4_t m;
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(vorrq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(vandq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(veorq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(vmvnq_u32(a.m));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
	static const int shifta[4] { 0, 1, 2, 3 };
	static const int32x4_t shift = vld1q_s32(shifta);

	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
	return vaddvq_u32(vshlq_u32(tmp, shift));
}
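
// Illustrative worked example (comment only): the mask() bit encoding packs
// one bit per lane, with lane 0 in bit 0.
//
//     vmask4 m(true, false, true, false);
//     unsigned int bits = mask(m);  // 0b0101 == 5: lanes 0 and 2 are set
//     bool any = bits != 0;         // true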

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(vaddq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(vsubq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
	return vint4(vmulq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(vmvnq_s32(a.m));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(vorrq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(vandq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(veorq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return ~vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(vcltq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(vcgtq_s32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	uint32x4_t ua = vreinterpretq_u32_s32(a.m);
	ua = vshlq_u32(ua, vdupq_n_s32(-s));
	return vint4(vreinterpretq_s32_u32(ua));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
}
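
// Illustrative worked example (comment only): lsr() shifts in zero bits while
// asr() replicates the sign bit, which matters for negative lane values.
//
//     vint4 v(-256);        // lane bit pattern 0xFFFFFF00
//     vint4 l = lsr<8>(v);  // 0x00FFFFFF == 16777215 in each lane
//     vint4 a = asr<8>(v);  // 0xFFFFFFFF == -1 in each lane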

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
	return vint4(vminq_s32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
	return vint4(vmaxq_s32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	return vint4(vminvq_s32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	return vint4(vmaxvq_s32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
	int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
	return vget_lane_s32(vpadd_s32(t, t), 0);
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE uint32_t hadd_s(vmask4 a)
{
	// Use add with SIMD versions
	return vaddvq_u32(a.m);
}

#define ASTCENC_USE_NATIVE_ADDV
/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a)
{
	// Use add with SIMD versions
	return vaddvq_f32(a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, &a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
	alignas(16) int idx[4];
	storea(indices, idx);
	alignas(16) int vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];
	return vint4(vals);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
	uint8x16_t idx = {0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
	int8x16_t av = vreinterpretq_s8_s32(a.m);
	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	return vint4(vbslq_s32(cond.m, b.m, a.m));
}
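
// Illustrative usage sketch (comment only, not part of the library API):
// pack_low_bytes() and store_nbytes() can be combined to emit four 8-bit
// results from the low byte of each lane.
//
//     vint4 v(0x11, 0x22, 0x33, 0x44);       // one byte of payload per lane
//     uint8_t out[4];
//     store_nbytes(pack_low_bytes(v), out);  // out = { 0x11, 0x22, 0x33, 0x44 }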

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(vaddq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(vsubq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(vmulq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(vdivq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(vceqq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(vcltq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgtq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcleq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgeq_f32(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	// Do not reorder - second operand will return if either is NaN
	return vfloat4(vminnmq_f32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	// Do not reorder - second operand will return if either is NaN
	return vfloat4(vmaxnmq_f32(a.m, b.m));
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	float32x4_t zero = vdupq_n_f32(0.0f);
	float32x4_t inv = vsubq_f32(zero, a.m);
	return vfloat4(vmaxq_f32(a.m, inv));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
	return vfloat4(vrndnq_f32(a.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	return vfloat4(vminvq_f32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	return vfloat4(vmaxvq_f32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Perform halving add to ensure invariance; we cannot use vaddvq as this
	// does (0 + 1 + 2 + 3) which is not invariant with x86 (0 + 2) + (1 + 3).
	float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
	return vget_lane_f32(vpadd_f32(t, t), 0);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(vsqrtq_f32(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
}
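
// Illustrative usage sketch (comment only, not part of the library API): the
// compare operators produce a vmask4 that feeds select(), here clamping
// negative lanes to zero.
//
//     vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//     vfloat4 c = select(v, vfloat4::zero(), v < vfloat4::zero());
//     // c = 0.0, 2.0, 0.0, 4.0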

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
	uint32x4_t mask = vcgeq_u32(cond.m, msb);
	return vfloat4(vbslq_f32(mask, b.m, a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
	alignas(16) int idx[4];
	storea(indices, idx);
	alignas(16) float vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];
	return vfloat4(vals);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
	a = a + vfloat4(0.5f);
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(vcvtq_f32_s32(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
	// Generate float16 value
	float16x4_t f16 = vcvt_f16_f32(a.m);

	// Convert each 16-bit float pattern to a 32-bit pattern
	uint16x4_t u16 = vreinterpret_u16_f16(f16);
	uint32x4_t u32 = vmovl_u16(u16);
	return vint4(vreinterpretq_s32_u32(u32));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
	vfloat4 av(a);
	return static_cast<uint16_t>(float_to_float16(av).lane<0>());
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
	// Convert each 32-bit float pattern to a 16-bit pattern
	uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
	uint16x4_t u16 = vmovn_u32(u32);
	float16x4_t f16 = vreinterpret_f16_u16(u16);

	// Generate float16 value
	return vfloat4(vcvt_f32_f16(f16));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
	vint4 av(a);
	return float16_to_float(av).lane<0>();
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	return vint4(vreinterpretq_s32_f32(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
	return vfloat4(vreinterpretq_f32_s32(v.m));
}
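
// Illustrative worked example (comment only): the bit-pattern casts allow
// integer manipulation of IEEE 754 fields, e.g. clearing the sign bit to
// compute an absolute value without a float compare.
//
//     vfloat4 v(-2.5f);
//     vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);  // clear sign bits
//     vfloat4 av = int_as_float(bits);                   // 2.5 in each lane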

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
	t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
	t0p = t0;
	t1p = t1;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
	t0p = t0;
	t1p = t1;
	t2p = t2;
	t3p = t3;
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
	int8x16_t table {
		vreinterpretq_s8_s32(t0.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
	int8x16x2_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
	int8x16x4_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m),
		vreinterpretq_s8_s32(t2.m),
		vreinterpretq_s8_s32(t3.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
}
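
// Illustrative worked example (comment only): the OR with 0xFFFFFF00 keeps
// only the low byte of each 32-bit index and forces the other three bytes out
// of range, so the TBL instruction writes zero there and each result lane is
// a single zero-extended table byte.
//
//     vint4 table = ...;                      // 16 table bytes, 4 per lane
//     vint4 idx(0, 5, 10, 15);
//     vint4 r = vtable_8bt_32bi(table, idx);
//     // lane n of r holds table byte idx[n], zero-extended to 32 bits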

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
	if (mask.lane<3>())
	{
		store(data, base);
	}
	else if (mask.lane<2>())
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
		store_lane(base + 8, data.lane<2>());
	}
	else if (mask.lane<1>())
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
	}
	else if (mask.lane<0>())
	{
		store_lane(base + 0, data.lane<0>());
	}
}

#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
	return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
}

/**
 * @brief Population bit count.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64x2_t v)
{
	return static_cast<int>(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u64(v))));
}

/**
 * @brief Population bit count.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(vmask4 v)
{
	return static_cast<int>(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u32(v.m))));
}

#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED