// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/*
 * This module implements vector support for floats, ints, and vector lane
 * control masks. It provides access to both explicit vector width types, and
 * flexible N-wide types where N can be determined at compile time.
 *
 * The design of this module encourages use of vector length agnostic code,
 * via the vint, vfloat, and vmask types. These will take on the widest SIMD
 * vector width that is available at compile time. The current vector width is
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
 *
 * Explicit scalar types are accessible via the vint1, vfloat1, and vmask1
 * types. These are provided primarily for prototyping and algorithm debug of
 * VLA implementations.
 *
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
 * types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type, and are backed by a reference C++ fallback for
 * use on platforms without SIMD intrinsics.
 *
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
 * used as a fixed-width type in normal code. No reference C++ implementation
 * is provided on platforms without underlying SIMD intrinsics.
 *
 * With the current implementation ISA support is provided for:
 *
 *     * 1-wide for scalar reference.
 *     * 4-wide for Armv8-A NEON.
 *     * 4-wide for x86-64 SSE2.
 *     * 4-wide for x86-64 SSE4.1.
 *     * 8-wide for x86-64 AVX2.
 */

#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED

#include <cassert>

#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
	#include <immintrin.h>
#elif ASTCENC_NEON != 0
	#include <arm_neon.h>
#endif

#if !defined(__clang__) && defined(_MSC_VER)
	#define ASTCENC_SIMD_INLINE __forceinline
	#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
	#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
	#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif
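// Illustrative sketch, not part of the library API: the VLA types defined
// below are designed to support loops of this shape, where the stride adapts
// to whichever vector width was selected at compile time. This assumes the
// buffer is padded to a multiple of ASTCENC_SIMD_WIDTH and suitably aligned
// for aligned vector loads.
//
//     for (unsigned int i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
//     {
//         vfloat value = loada(data + i);
//         // ... lane-wise processing ...
//     }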
#if ASTCENC_AVX >= 2
	/* If we have AVX2 expose 8-wide VLA. */
	#include "astcenc_vecmathlib_sse_4.h"
	#include "astcenc_vecmathlib_common_4.h"
	#include "astcenc_vecmathlib_avx2_8.h"

	#define ASTCENC_SIMD_WIDTH 8

	using vfloat = vfloat8;

	#if defined(ASTCENC_NO_INVARIANCE)
		using vfloatacc = vfloat8;
	#else
		using vfloatacc = vfloat4;
	#endif

	using vint = vint8;
	using vmask = vmask8;

	constexpr auto loada = vfloat8::loada;
	constexpr auto load1 = vfloat8::load1;

#elif ASTCENC_SSE >= 20
	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
	#include "astcenc_vecmathlib_sse_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH 4

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;

#elif ASTCENC_NEON > 0
	/* If we have NEON expose 4-wide VLA. */
	#include "astcenc_vecmathlib_neon_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH 4

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;

#else
	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.

	// Note: We no longer expose the 1-wide scalar fallback because it is not
	// invariant with the 4-wide path due to algorithms that use horizontal
	// operations that accumulate a local vector sum before accumulating into
	// a running sum.
	//
	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
	//
	//     result = ((((sum + l0) + l1) + l2) + l3)
	//
	// ... whereas the accumulator for a 4-wide vector sum is:
	//
	//     result = sum + ((l0 + l2) + (l1 + l3))
	//
	// In "normal maths" this is the same, but the floating point reassociation
	// differences mean that these will not produce the same result.

	#include "astcenc_vecmathlib_none_4.h"
	#include "astcenc_vecmathlib_common_4.h"

	#define ASTCENC_SIMD_WIDTH 4

	using vfloat = vfloat4;
	using vfloatacc = vfloat4;
	using vint = vint4;
	using vmask = vmask4;

	constexpr auto loada = vfloat4::loada;
	constexpr auto load1 = vfloat4::load1;
#endif

/**
 * @brief Round a count down to the largest multiple of 8.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
{
	return count & static_cast<unsigned int>(~(8 - 1));
}

/**
 * @brief Round a count down to the largest multiple of 4.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
{
	return count & static_cast<unsigned int>(~(4 - 1));
}

/**
 * @brief Round a count down to the largest multiple of the SIMD width.
 *
 * This assumes that the vector width is a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
{
	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
}
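/**
 * @brief Illustrative sketch of the round-down helpers in use.
 *
 * This hypothetical example_sum_vla() helper is not part of the library API.
 * It shows the intended pattern: process whole vectors up to the rounded-down
 * count, then mop up the remaining items with a scalar tail loop. It assumes
 * @c data is suitably aligned for aligned vector loads, and uses the hadd_s()
 * horizontal-add helper provided by the per-ISA headers included above.
 */
static ASTCENC_SIMD_INLINE float example_sum_vla(const float* data, unsigned int count)
{
	vfloat vsum = vfloat::zero();

	// Process complete vectors using the VLA width
	unsigned int count_simd = round_down_to_simd_multiple_vla(count);
	for (unsigned int i = 0; i < count_simd; i += ASTCENC_SIMD_WIDTH)
	{
		vsum = vsum + loada(data + i);
	}

	// Fold the vector accumulator into a scalar, then handle the tail
	float sum = hadd_s(vsum);
	for (unsigned int i = count_simd; i < count; i++)
	{
		sum += data[i];
	}

	return sum;
}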
/**
 * @brief Round a count up to the smallest multiple of the SIMD width.
 *
 * This assumes that the vector width is a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
{
	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
	return multiples * ASTCENC_SIMD_WIDTH;
}

/**
 * @brief Return @c a with lanes negated if the @c b lane is negative.
 */
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
	vint ia = float_as_int(a);
	vint ib = float_as_int(b);
	vint sign_mask(static_cast<int>(0x80000000));
	vint r = ia ^ (ib & sign_mask);
	return int_as_float(r);
}

/**
 * @brief Return fast, but approximate, vector atan(x).
 *
 * Max error of this implementation is 0.004883.
 */
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
	vmask c = abs(x) > vfloat(1.0f);
	vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
	vfloat y = select(x, vfloat(1.0f) / x, c);
	y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
	return select(y, z - y, c);
}

/**
 * @brief Return fast, but approximate, vector atan2(y, x).
 */
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
	vfloat z = atan(abs(y / x));
	vmask xmask = vmask(float_as_int(x).m);
	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
}

/**
 * @brief Factory that returns a unit length 4 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
	return vfloat4(0.5f);
}

/**
 * @brief Factory that returns a unit length 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
	float val = 0.577350258827209473f;
	return vfloat4(val, val, val, 0.0f);
}

/**
 * @brief Factory that returns a unit length 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
	float val = 0.707106769084930420f;
	return vfloat4(val, val, 0.0f, 0.0f);
}

/**
 * @brief Factory that returns a 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
	return vfloat4(a, b, c, 0.0f);
}

/**
 * @brief Factory that returns a 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
	return vfloat4(a, b, 0.0f, 0.0f);
}

/**
 * @brief Normalize a non-zero length vector to unit length.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
	vfloat4 length = dot(a, a);
	return a / sqrt(length);
}

/**
 * @brief Normalize a vector, returning @c safe if the length is zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
	vfloat4 length = dot(a, a);
	if (length.lane<0>() != 0.0f)
	{
		return a / sqrt(length);
	}

	return safe;
}

#define POLY0(x, c0)                     (                                     c0)
#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * x)                 + c0)
#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * x)             + c0)
#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * x)         + c0)
#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * x)     + c0)
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
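/**
 * @brief Illustrative sketch, not part of the library API.
 *
 * The POLYn macros expand to a Horner-form evaluation with coefficients given
 * in ascending power order, so this evaluates 1 + 2x + 3x^2 per lane as
 * ((3 * x) + 2) * x + 1.
 */
static ASTCENC_SIMD_INLINE vfloat4 example_poly2(vfloat4 x)
{
	return POLY2(x, 1.0f, 2.0f, 3.0f);
}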
/**
 * @brief Compute an approximate exp2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
	x = clamp(-126.99999f, 129.0f, x);

	vint4 ipart = float_to_int(x - 0.5f);
	vfloat4 fpart = x - int_to_float(ipart);

	// Integer contrib, using 1 << ipart
	vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));

	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
	vfloat4 fexp = POLY5(fpart,
	                     9.9999994e-1f,
	                     6.9315308e-1f,
	                     2.4015361e-1f,
	                     5.5826318e-2f,
	                     8.9893397e-3f,
	                     1.8775767e-3f);

	return iexp * fexp;
}

/**
 * @brief Compute an approximate log2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
	vint4 exp(0x7F800000);
	vint4 mant(0x007FFFFF);
	vint4 one(0x3F800000);

	vint4 i = float_as_int(x);

	vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);

	vfloat4 m = int_as_float((i & mant) | one);

	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
	vfloat4 p = POLY4(m,
	                  2.8882704548164776201f,
	                  -2.52074962577807006663f,
	                  1.48116647521213171641f,
	                  -0.465725644288844778798f,
	                  0.0596515482674574969533f);

	// Increases the polynomial degree, but ensures that log2(1) == 0
	p = p * (m - 1.0f);

	return p + e;
}

/**
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
 *
 * Power function based on the exp2(log2(x) * y) transform.
 */
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
	vmask4 zero_mask = y == vfloat4(0.0f);
	vfloat4 estimate = exp2(log2(x) * y);

	// Guarantee that y == 0 returns exactly 1.0f
	return select(estimate, vfloat4(1.0f), zero_mask);
}

/**
 * @brief Count the leading zeros for each lane in @c a.
 *
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
 */
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
	// This function is a horrible abuse of floating point exponents to convert
	// the original integer value into a 2^N encoding we can recover easily.

	// Convert to float without risk of rounding up by keeping only top 8 bits.
	// This trick is guaranteed to keep top 8 bits and clear the 9th.
	a = (~lsr<8>(a)) & a;
	a = float_as_int(int_to_float(a));

	// Extract and unbias exponent
	a = vint4(127 + 31) - lsr<23>(a);

	// Clamp result to a valid 32-bit range
	return clamp(0, 32, a);
}
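/**
 * @brief Illustrative sketch, not part of the library API: a worked example
 * of the clz() exponent trick.
 *
 * For an input of 256 (bit 8 set) int_to_float() yields 256.0f, whose biased
 * exponent field is 127 + 8, so (127 + 31) - (127 + 8) gives the expected 23
 * leading zeros. The final clamp handles the zero and negative input cases.
 */
static ASTCENC_SIMD_INLINE void example_clz()
{
	assert(all(clz(vint4(256)) == vint4(23)));
	assert(all(clz(vint4(0)) == vint4(32)));
	assert(all(clz(vint4(-1)) == vint4(0)));
}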
/**
 * @brief Return lanewise 2^a for each lane in @c a.
 *
 * Use of signed int means that this is only valid for values in range [0, 30].
 */
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
	// 2^30 is the largest power of two that can be represented in a signed int
	assert(all(a < vint4(31)));

	// This function is a horrible abuse of floating point to use the exponent
	// and float conversion to generate a 2^N multiple.

	// Bias the exponent
	vint4 exp = a + 127;
	exp = lsl<23>(exp);

	// Reinterpret the bits as a float, and then convert to an int
	vfloat4 f = int_as_float(exp);
	return float_to_int(f);
}

/**
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
 */
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
	vint4 fp16_one = vint4(0x3C00);
	vint4 fp16_small = lsl<8>(p);

	vmask4 is_one = p == vint4(0xFFFF);
	vmask4 is_small = p < vint4(4);

	// Manually inline clz() on Visual Studio to avoid release build codegen bug
	// see https://github.com/ARM-software/astc-encoder/issues/259
#if !defined(__clang__) && defined(_MSC_VER)
	vint4 a = (~lsr<8>(p)) & p;
	a = float_as_int(int_to_float(a));
	a = vint4(127 + 31) - lsr<23>(a);
	vint4 lz = clamp(0, 32, a) - 16;
#else
	vint4 lz = clz(p) - 16;
#endif

	p = p * two_to_the_n(lz + 1);
	p = p & vint4(0xFFFF);

	p = lsr<6>(p);

	p = p | lsl<10>(vint4(14) - lz);

	vint4 r = select(p, fp16_one, is_one);
	r = select(r, fp16_small, is_small);
	return r;
}

/**
 * @brief Convert 16-bit LNS to float16.
 */
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
	vint4 mc = p & 0x7FF;
	vint4 ec = lsr<11>(p);

	vint4 mc_512 = mc * 3;
	vmask4 mask_512 = mc < vint4(512);

	vint4 mc_1536 = mc * 4 - 512;
	vmask4 mask_1536 = mc < vint4(1536);

	vint4 mc_else = mc * 5 - 2048;

	vint4 mt = mc_else;
	mt = select(mt, mc_1536, mask_1536);
	mt = select(mt, mc_512, mask_512);

	vint4 res = lsl<10>(ec) | lsr<3>(mt);
	return min(res, vint4(0x7BFF));
}

/**
 * @brief Extract the mantissa and exponent of a float value.
 *
 * @param      a     The input value.
 * @param[out] exp   The output exponent.
 *
 * @return The mantissa.
 */
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
	// Interpret the bits as an integer
	vint4 ai = float_as_int(a);

	// Extract and unbias the exponent
	exp = (lsr<23>(ai) & 0xFF) - 126;

	// Extract and unbias the mantissa
	vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
	return int_as_float(manti);
}
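/**
 * @brief Illustrative sketch, not part of the library API: a worked example
 * of the frexp() split.
 *
 * frexp() returns a mantissa in [0.5, 1) and a power-of-two exponent, so an
 * input of 12.0f yields a mantissa of 0.75f and an exponent of 4, because
 * 12 == 0.75 * 2^4.
 */
static ASTCENC_SIMD_INLINE void example_frexp()
{
	vint4 exp;
	vfloat4 mant = frexp(vfloat4(12.0f), exp);
	assert(all(mant == vfloat4(0.75f)));
	assert(all(exp == vint4(4)));
	// Silence unused variable warnings in release builds
	(void)mant;
}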
/**
 * @brief Convert float to 16-bit LNS.
 */
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
	vint4 exp;
	vfloat4 mant = frexp(a, exp);

	// Do these early before we start messing about ...
	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
	vmask4 mask_infinity = a >= vfloat4(65536.0f);

	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
	vmask4 exp_lt_m13 = exp < vint4(-13);

	vfloat4 a1a = a * 33554432.0f;
	vint4 expa = vint4::zero();

	vfloat4 a1b = (mant - 0.5f) * 4096;
	vint4 expb = exp + 14;

	a = select(a1b, a1a, exp_lt_m13);
	exp = select(expb, expa, exp_lt_m13);

	vmask4 a_lt_384 = a < vfloat4(384.0f);
	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);

	vfloat4 a2a = a * (4.0f / 3.0f);
	vfloat4 a2b = a + 128.0f;
	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);

	a = a2c;
	a = select(a, a2b, a_lt_1408);
	a = select(a, a2a, a_lt_384);

	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;

	a = select(a, vfloat4(65535.0f), mask_infinity);
	a = select(a, vfloat4::zero(), mask_underflow_nan);

	return a;
}

namespace astc
{

static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
	return pow(vfloat4(x), vfloat4(y)).lane<0>();
}

}

#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED