1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0 2cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 3cc1dc7a3Sopenharmony_ci// Copyright 2019-2022 Arm Limited 4cc1dc7a3Sopenharmony_ci// Copyright 2008 Jose Fonseca 5cc1dc7a3Sopenharmony_ci// 6cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not 7cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy 8cc1dc7a3Sopenharmony_ci// of the License at: 9cc1dc7a3Sopenharmony_ci// 10cc1dc7a3Sopenharmony_ci// http://www.apache.org/licenses/LICENSE-2.0 11cc1dc7a3Sopenharmony_ci// 12cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software 13cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 14cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 15cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations 16cc1dc7a3Sopenharmony_ci// under the License. 17cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 18cc1dc7a3Sopenharmony_ci 19cc1dc7a3Sopenharmony_ci/* 20cc1dc7a3Sopenharmony_ci * This module implements vector support for floats, ints, and vector lane 21cc1dc7a3Sopenharmony_ci * control masks. It provides access to both explicit vector width types, and 22cc1dc7a3Sopenharmony_ci * flexible N-wide types where N can be determined at compile time. 23cc1dc7a3Sopenharmony_ci * 24cc1dc7a3Sopenharmony_ci * The design of this module encourages use of vector length agnostic code, via 25cc1dc7a3Sopenharmony_ci * the vint, vfloat, and vmask types. These will take on the widest SIMD vector 26cc1dc7a3Sopenharmony_ci * with that is available at compile time. The current vector width is 27cc1dc7a3Sopenharmony_ci * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant. 28cc1dc7a3Sopenharmony_ci * 29cc1dc7a3Sopenharmony_ci * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types. 30cc1dc7a3Sopenharmony_ci * These are provided primarily for prototyping and algorithm debug of VLA 31cc1dc7a3Sopenharmony_ci * implementations. 32cc1dc7a3Sopenharmony_ci * 33cc1dc7a3Sopenharmony_ci * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4 34cc1dc7a3Sopenharmony_ci * types. These are provided for use by VLA code, but are also expected to be 35cc1dc7a3Sopenharmony_ci * used as a fixed-width type and will supported a reference C++ fallback for 36cc1dc7a3Sopenharmony_ci * use on platforms without SIMD intrinsics. 37cc1dc7a3Sopenharmony_ci * 38cc1dc7a3Sopenharmony_ci * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8 39cc1dc7a3Sopenharmony_ci * types. These are provide for use by VLA code, and are not expected to be 40cc1dc7a3Sopenharmony_ci * used as a fixed-width type in normal code. No reference C implementation is 41cc1dc7a3Sopenharmony_ci * provided on platforms without underlying SIMD intrinsics. 42cc1dc7a3Sopenharmony_ci * 43cc1dc7a3Sopenharmony_ci * With the current implementation ISA support is provided for: 44cc1dc7a3Sopenharmony_ci * 45cc1dc7a3Sopenharmony_ci * * 1-wide for scalar reference. 46cc1dc7a3Sopenharmony_ci * * 4-wide for Armv8-A NEON. 47cc1dc7a3Sopenharmony_ci * * 4-wide for x86-64 SSE2. 48cc1dc7a3Sopenharmony_ci * * 4-wide for x86-64 SSE4.1. 49cc1dc7a3Sopenharmony_ci * * 8-wide for x86-64 AVX2. 50cc1dc7a3Sopenharmony_ci */ 51cc1dc7a3Sopenharmony_ci 52cc1dc7a3Sopenharmony_ci#ifndef ASTC_VECMATHLIB_H_INCLUDED 53cc1dc7a3Sopenharmony_ci#define ASTC_VECMATHLIB_H_INCLUDED 54cc1dc7a3Sopenharmony_ci 55cc1dc7a3Sopenharmony_ci#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 56cc1dc7a3Sopenharmony_ci #include <immintrin.h> 57cc1dc7a3Sopenharmony_ci#elif ASTCENC_NEON != 0 58cc1dc7a3Sopenharmony_ci #include <arm_neon.h> 59cc1dc7a3Sopenharmony_ci#endif 60cc1dc7a3Sopenharmony_ci 61cc1dc7a3Sopenharmony_ci#if !defined(__clang__) && defined(_MSC_VER) 62cc1dc7a3Sopenharmony_ci #define ASTCENC_SIMD_INLINE __forceinline 63cc1dc7a3Sopenharmony_ci #define ASTCENC_NO_INLINE 64cc1dc7a3Sopenharmony_ci#elif defined(__GNUC__) && !defined(__clang__) 65cc1dc7a3Sopenharmony_ci #define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline 66cc1dc7a3Sopenharmony_ci #define ASTCENC_NO_INLINE __attribute__ ((noinline)) 67cc1dc7a3Sopenharmony_ci#else 68cc1dc7a3Sopenharmony_ci #define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline 69cc1dc7a3Sopenharmony_ci #define ASTCENC_NO_INLINE __attribute__ ((noinline)) 70cc1dc7a3Sopenharmony_ci#endif 71cc1dc7a3Sopenharmony_ci 72cc1dc7a3Sopenharmony_ci#if ASTCENC_AVX >= 2 73cc1dc7a3Sopenharmony_ci /* If we have AVX2 expose 8-wide VLA. */ 74cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_sse_4.h" 75cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_common_4.h" 76cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_avx2_8.h" 77cc1dc7a3Sopenharmony_ci 78cc1dc7a3Sopenharmony_ci #define ASTCENC_SIMD_WIDTH 8 79cc1dc7a3Sopenharmony_ci 80cc1dc7a3Sopenharmony_ci using vfloat = vfloat8; 81cc1dc7a3Sopenharmony_ci 82cc1dc7a3Sopenharmony_ci #if defined(ASTCENC_NO_INVARIANCE) 83cc1dc7a3Sopenharmony_ci using vfloatacc = vfloat8; 84cc1dc7a3Sopenharmony_ci #else 85cc1dc7a3Sopenharmony_ci using vfloatacc = vfloat4; 86cc1dc7a3Sopenharmony_ci #endif 87cc1dc7a3Sopenharmony_ci 88cc1dc7a3Sopenharmony_ci using vint = vint8; 89cc1dc7a3Sopenharmony_ci using vmask = vmask8; 90cc1dc7a3Sopenharmony_ci 91cc1dc7a3Sopenharmony_ci constexpr auto loada = vfloat8::loada; 92cc1dc7a3Sopenharmony_ci constexpr auto load1 = vfloat8::load1; 93cc1dc7a3Sopenharmony_ci 94cc1dc7a3Sopenharmony_ci#elif ASTCENC_SSE >= 20 95cc1dc7a3Sopenharmony_ci /* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */ 96cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_sse_4.h" 97cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_common_4.h" 98cc1dc7a3Sopenharmony_ci 99cc1dc7a3Sopenharmony_ci #define ASTCENC_SIMD_WIDTH 4 100cc1dc7a3Sopenharmony_ci 101cc1dc7a3Sopenharmony_ci using vfloat = vfloat4; 102cc1dc7a3Sopenharmony_ci using vfloatacc = vfloat4; 103cc1dc7a3Sopenharmony_ci using vint = vint4; 104cc1dc7a3Sopenharmony_ci using vmask = vmask4; 105cc1dc7a3Sopenharmony_ci 106cc1dc7a3Sopenharmony_ci constexpr auto loada = vfloat4::loada; 107cc1dc7a3Sopenharmony_ci constexpr auto load1 = vfloat4::load1; 108cc1dc7a3Sopenharmony_ci 109cc1dc7a3Sopenharmony_ci#elif ASTCENC_NEON > 0 110cc1dc7a3Sopenharmony_ci /* If we have NEON expose 4-wide VLA. */ 111cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_neon_4.h" 112cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_common_4.h" 113cc1dc7a3Sopenharmony_ci 114cc1dc7a3Sopenharmony_ci #define ASTCENC_SIMD_WIDTH 4 115cc1dc7a3Sopenharmony_ci 116cc1dc7a3Sopenharmony_ci using vfloat = vfloat4; 117cc1dc7a3Sopenharmony_ci using vfloatacc = vfloat4; 118cc1dc7a3Sopenharmony_ci using vint = vint4; 119cc1dc7a3Sopenharmony_ci using vmask = vmask4; 120cc1dc7a3Sopenharmony_ci 121cc1dc7a3Sopenharmony_ci constexpr auto loada = vfloat4::loada; 122cc1dc7a3Sopenharmony_ci constexpr auto load1 = vfloat4::load1; 123cc1dc7a3Sopenharmony_ci 124cc1dc7a3Sopenharmony_ci#else 125cc1dc7a3Sopenharmony_ci // If we have nothing expose 4-wide VLA, and 4-wide fixed width. 126cc1dc7a3Sopenharmony_ci 127cc1dc7a3Sopenharmony_ci // Note: We no longer expose the 1-wide scalar fallback because it is not 128cc1dc7a3Sopenharmony_ci // invariant with the 4-wide path due to algorithms that use horizontal 129cc1dc7a3Sopenharmony_ci // operations that accumulate a local vector sum before accumulating into 130cc1dc7a3Sopenharmony_ci // a running sum. 131cc1dc7a3Sopenharmony_ci // 132cc1dc7a3Sopenharmony_ci // For 4 items adding into an accumulator using 1-wide vectors the sum is: 133cc1dc7a3Sopenharmony_ci // 134cc1dc7a3Sopenharmony_ci // result = ((((sum + l0) + l1) + l2) + l3) 135cc1dc7a3Sopenharmony_ci // 136cc1dc7a3Sopenharmony_ci // ... whereas the accumulator for a 4-wide vector sum is: 137cc1dc7a3Sopenharmony_ci // 138cc1dc7a3Sopenharmony_ci // result = sum + ((l0 + l2) + (l1 + l3)) 139cc1dc7a3Sopenharmony_ci // 140cc1dc7a3Sopenharmony_ci // In "normal maths" this is the same, but the floating point reassociation 141cc1dc7a3Sopenharmony_ci // differences mean that these will not produce the same result. 142cc1dc7a3Sopenharmony_ci 143cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_none_4.h" 144cc1dc7a3Sopenharmony_ci #include "astcenc_vecmathlib_common_4.h" 145cc1dc7a3Sopenharmony_ci 146cc1dc7a3Sopenharmony_ci #define ASTCENC_SIMD_WIDTH 4 147cc1dc7a3Sopenharmony_ci 148cc1dc7a3Sopenharmony_ci using vfloat = vfloat4; 149cc1dc7a3Sopenharmony_ci using vfloatacc = vfloat4; 150cc1dc7a3Sopenharmony_ci using vint = vint4; 151cc1dc7a3Sopenharmony_ci using vmask = vmask4; 152cc1dc7a3Sopenharmony_ci 153cc1dc7a3Sopenharmony_ci constexpr auto loada = vfloat4::loada; 154cc1dc7a3Sopenharmony_ci constexpr auto load1 = vfloat4::load1; 155cc1dc7a3Sopenharmony_ci#endif 156cc1dc7a3Sopenharmony_ci 157cc1dc7a3Sopenharmony_ci/** 158cc1dc7a3Sopenharmony_ci * @brief Round a count down to the largest multiple of 8. 159cc1dc7a3Sopenharmony_ci * 160cc1dc7a3Sopenharmony_ci * @param count The unrounded value. 161cc1dc7a3Sopenharmony_ci * 162cc1dc7a3Sopenharmony_ci * @return The rounded value. 163cc1dc7a3Sopenharmony_ci */ 164cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count) 165cc1dc7a3Sopenharmony_ci{ 166cc1dc7a3Sopenharmony_ci return count & static_cast<unsigned int>(~(8 - 1)); 167cc1dc7a3Sopenharmony_ci} 168cc1dc7a3Sopenharmony_ci 169cc1dc7a3Sopenharmony_ci/** 170cc1dc7a3Sopenharmony_ci * @brief Round a count down to the largest multiple of 4. 171cc1dc7a3Sopenharmony_ci * 172cc1dc7a3Sopenharmony_ci * @param count The unrounded value. 173cc1dc7a3Sopenharmony_ci * 174cc1dc7a3Sopenharmony_ci * @return The rounded value. 175cc1dc7a3Sopenharmony_ci */ 176cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count) 177cc1dc7a3Sopenharmony_ci{ 178cc1dc7a3Sopenharmony_ci return count & static_cast<unsigned int>(~(4 - 1)); 179cc1dc7a3Sopenharmony_ci} 180cc1dc7a3Sopenharmony_ci 181cc1dc7a3Sopenharmony_ci/** 182cc1dc7a3Sopenharmony_ci * @brief Round a count down to the largest multiple of the SIMD width. 183cc1dc7a3Sopenharmony_ci * 184cc1dc7a3Sopenharmony_ci * Assumption that the vector width is a power of two ... 185cc1dc7a3Sopenharmony_ci * 186cc1dc7a3Sopenharmony_ci * @param count The unrounded value. 187cc1dc7a3Sopenharmony_ci * 188cc1dc7a3Sopenharmony_ci * @return The rounded value. 189cc1dc7a3Sopenharmony_ci */ 190cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count) 191cc1dc7a3Sopenharmony_ci{ 192cc1dc7a3Sopenharmony_ci return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1)); 193cc1dc7a3Sopenharmony_ci} 194cc1dc7a3Sopenharmony_ci 195cc1dc7a3Sopenharmony_ci/** 196cc1dc7a3Sopenharmony_ci * @brief Round a count up to the largest multiple of the SIMD width. 197cc1dc7a3Sopenharmony_ci * 198cc1dc7a3Sopenharmony_ci * Assumption that the vector width is a power of two ... 199cc1dc7a3Sopenharmony_ci * 200cc1dc7a3Sopenharmony_ci * @param count The unrounded value. 201cc1dc7a3Sopenharmony_ci * 202cc1dc7a3Sopenharmony_ci * @return The rounded value. 203cc1dc7a3Sopenharmony_ci */ 204cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count) 205cc1dc7a3Sopenharmony_ci{ 206cc1dc7a3Sopenharmony_ci unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH; 207cc1dc7a3Sopenharmony_ci return multiples * ASTCENC_SIMD_WIDTH; 208cc1dc7a3Sopenharmony_ci} 209cc1dc7a3Sopenharmony_ci 210cc1dc7a3Sopenharmony_ci/** 211cc1dc7a3Sopenharmony_ci * @brief Return @c a with lanes negated if the @c b lane is negative. 212cc1dc7a3Sopenharmony_ci */ 213cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b) 214cc1dc7a3Sopenharmony_ci{ 215cc1dc7a3Sopenharmony_ci vint ia = float_as_int(a); 216cc1dc7a3Sopenharmony_ci vint ib = float_as_int(b); 217cc1dc7a3Sopenharmony_ci vint sign_mask(static_cast<int>(0x80000000)); 218cc1dc7a3Sopenharmony_ci vint r = ia ^ (ib & sign_mask); 219cc1dc7a3Sopenharmony_ci return int_as_float(r); 220cc1dc7a3Sopenharmony_ci} 221cc1dc7a3Sopenharmony_ci 222cc1dc7a3Sopenharmony_ci/** 223cc1dc7a3Sopenharmony_ci * @brief Return fast, but approximate, vector atan(x). 224cc1dc7a3Sopenharmony_ci * 225cc1dc7a3Sopenharmony_ci * Max error of this implementation is 0.004883. 226cc1dc7a3Sopenharmony_ci */ 227cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat atan(vfloat x) 228cc1dc7a3Sopenharmony_ci{ 229cc1dc7a3Sopenharmony_ci vmask c = abs(x) > vfloat(1.0f); 230cc1dc7a3Sopenharmony_ci vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x); 231cc1dc7a3Sopenharmony_ci vfloat y = select(x, vfloat(1.0f) / x, c); 232cc1dc7a3Sopenharmony_ci y = y / (y * y * vfloat(0.28f) + vfloat(1.0f)); 233cc1dc7a3Sopenharmony_ci return select(y, z - y, c); 234cc1dc7a3Sopenharmony_ci} 235cc1dc7a3Sopenharmony_ci 236cc1dc7a3Sopenharmony_ci/** 237cc1dc7a3Sopenharmony_ci * @brief Return fast, but approximate, vector atan2(x, y). 238cc1dc7a3Sopenharmony_ci */ 239cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x) 240cc1dc7a3Sopenharmony_ci{ 241cc1dc7a3Sopenharmony_ci vfloat z = atan(abs(y / x)); 242cc1dc7a3Sopenharmony_ci vmask xmask = vmask(float_as_int(x).m); 243cc1dc7a3Sopenharmony_ci return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y); 244cc1dc7a3Sopenharmony_ci} 245cc1dc7a3Sopenharmony_ci 246cc1dc7a3Sopenharmony_ci/* 247cc1dc7a3Sopenharmony_ci * @brief Factory that returns a unit length 4 component vfloat4. 248cc1dc7a3Sopenharmony_ci */ 249cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 unit4() 250cc1dc7a3Sopenharmony_ci{ 251cc1dc7a3Sopenharmony_ci return vfloat4(0.5f); 252cc1dc7a3Sopenharmony_ci} 253cc1dc7a3Sopenharmony_ci 254cc1dc7a3Sopenharmony_ci/** 255cc1dc7a3Sopenharmony_ci * @brief Factory that returns a unit length 3 component vfloat4. 256cc1dc7a3Sopenharmony_ci */ 257cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 unit3() 258cc1dc7a3Sopenharmony_ci{ 259cc1dc7a3Sopenharmony_ci float val = 0.577350258827209473f; 260cc1dc7a3Sopenharmony_ci return vfloat4(val, val, val, 0.0f); 261cc1dc7a3Sopenharmony_ci} 262cc1dc7a3Sopenharmony_ci 263cc1dc7a3Sopenharmony_ci/** 264cc1dc7a3Sopenharmony_ci * @brief Factory that returns a unit length 2 component vfloat4. 265cc1dc7a3Sopenharmony_ci */ 266cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 unit2() 267cc1dc7a3Sopenharmony_ci{ 268cc1dc7a3Sopenharmony_ci float val = 0.707106769084930420f; 269cc1dc7a3Sopenharmony_ci return vfloat4(val, val, 0.0f, 0.0f); 270cc1dc7a3Sopenharmony_ci} 271cc1dc7a3Sopenharmony_ci 272cc1dc7a3Sopenharmony_ci/** 273cc1dc7a3Sopenharmony_ci * @brief Factory that returns a 3 component vfloat4. 274cc1dc7a3Sopenharmony_ci */ 275cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c) 276cc1dc7a3Sopenharmony_ci{ 277cc1dc7a3Sopenharmony_ci return vfloat4(a, b, c, 0.0f); 278cc1dc7a3Sopenharmony_ci} 279cc1dc7a3Sopenharmony_ci 280cc1dc7a3Sopenharmony_ci/** 281cc1dc7a3Sopenharmony_ci * @brief Factory that returns a 2 component vfloat4. 282cc1dc7a3Sopenharmony_ci */ 283cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b) 284cc1dc7a3Sopenharmony_ci{ 285cc1dc7a3Sopenharmony_ci return vfloat4(a, b, 0.0f, 0.0f); 286cc1dc7a3Sopenharmony_ci} 287cc1dc7a3Sopenharmony_ci 288cc1dc7a3Sopenharmony_ci/** 289cc1dc7a3Sopenharmony_ci * @brief Normalize a non-zero length vector to unit length. 290cc1dc7a3Sopenharmony_ci */ 291cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a) 292cc1dc7a3Sopenharmony_ci{ 293cc1dc7a3Sopenharmony_ci vfloat4 length = dot(a, a); 294cc1dc7a3Sopenharmony_ci return a / sqrt(length); 295cc1dc7a3Sopenharmony_ci} 296cc1dc7a3Sopenharmony_ci 297cc1dc7a3Sopenharmony_ci/** 298cc1dc7a3Sopenharmony_ci * @brief Normalize a vector, returning @c safe if len is zero. 299cc1dc7a3Sopenharmony_ci */ 300cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe) 301cc1dc7a3Sopenharmony_ci{ 302cc1dc7a3Sopenharmony_ci vfloat4 length = dot(a, a); 303cc1dc7a3Sopenharmony_ci if (length.lane<0>() != 0.0f) 304cc1dc7a3Sopenharmony_ci { 305cc1dc7a3Sopenharmony_ci return a / sqrt(length); 306cc1dc7a3Sopenharmony_ci } 307cc1dc7a3Sopenharmony_ci 308cc1dc7a3Sopenharmony_ci return safe; 309cc1dc7a3Sopenharmony_ci} 310cc1dc7a3Sopenharmony_ci 311cc1dc7a3Sopenharmony_ci 312cc1dc7a3Sopenharmony_ci 313cc1dc7a3Sopenharmony_ci#define POLY0(x, c0) ( c0) 314cc1dc7a3Sopenharmony_ci#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0) 315cc1dc7a3Sopenharmony_ci#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0) 316cc1dc7a3Sopenharmony_ci#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0) 317cc1dc7a3Sopenharmony_ci#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0) 318cc1dc7a3Sopenharmony_ci#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0) 319cc1dc7a3Sopenharmony_ci 320cc1dc7a3Sopenharmony_ci/** 321cc1dc7a3Sopenharmony_ci * @brief Compute an approximate exp2(x) for each lane in the vector. 322cc1dc7a3Sopenharmony_ci * 323cc1dc7a3Sopenharmony_ci * Based on 5th degree minimax polynomials, ported from this blog 324cc1dc7a3Sopenharmony_ci * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html 325cc1dc7a3Sopenharmony_ci */ 326cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x) 327cc1dc7a3Sopenharmony_ci{ 328cc1dc7a3Sopenharmony_ci x = clamp(-126.99999f, 129.0f, x); 329cc1dc7a3Sopenharmony_ci 330cc1dc7a3Sopenharmony_ci vint4 ipart = float_to_int(x - 0.5f); 331cc1dc7a3Sopenharmony_ci vfloat4 fpart = x - int_to_float(ipart); 332cc1dc7a3Sopenharmony_ci 333cc1dc7a3Sopenharmony_ci // Integer contrib, using 1 << ipart 334cc1dc7a3Sopenharmony_ci vfloat4 iexp = int_as_float(lsl<23>(ipart + 127)); 335cc1dc7a3Sopenharmony_ci 336cc1dc7a3Sopenharmony_ci // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5) 337cc1dc7a3Sopenharmony_ci vfloat4 fexp = POLY5(fpart, 338cc1dc7a3Sopenharmony_ci 9.9999994e-1f, 339cc1dc7a3Sopenharmony_ci 6.9315308e-1f, 340cc1dc7a3Sopenharmony_ci 2.4015361e-1f, 341cc1dc7a3Sopenharmony_ci 5.5826318e-2f, 342cc1dc7a3Sopenharmony_ci 8.9893397e-3f, 343cc1dc7a3Sopenharmony_ci 1.8775767e-3f); 344cc1dc7a3Sopenharmony_ci 345cc1dc7a3Sopenharmony_ci return iexp * fexp; 346cc1dc7a3Sopenharmony_ci} 347cc1dc7a3Sopenharmony_ci 348cc1dc7a3Sopenharmony_ci/** 349cc1dc7a3Sopenharmony_ci * @brief Compute an approximate log2(x) for each lane in the vector. 350cc1dc7a3Sopenharmony_ci * 351cc1dc7a3Sopenharmony_ci * Based on 5th degree minimax polynomials, ported from this blog 352cc1dc7a3Sopenharmony_ci * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html 353cc1dc7a3Sopenharmony_ci */ 354cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x) 355cc1dc7a3Sopenharmony_ci{ 356cc1dc7a3Sopenharmony_ci vint4 exp(0x7F800000); 357cc1dc7a3Sopenharmony_ci vint4 mant(0x007FFFFF); 358cc1dc7a3Sopenharmony_ci vint4 one(0x3F800000); 359cc1dc7a3Sopenharmony_ci 360cc1dc7a3Sopenharmony_ci vint4 i = float_as_int(x); 361cc1dc7a3Sopenharmony_ci 362cc1dc7a3Sopenharmony_ci vfloat4 e = int_to_float(lsr<23>(i & exp) - 127); 363cc1dc7a3Sopenharmony_ci 364cc1dc7a3Sopenharmony_ci vfloat4 m = int_as_float((i & mant) | one); 365cc1dc7a3Sopenharmony_ci 366cc1dc7a3Sopenharmony_ci // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2) 367cc1dc7a3Sopenharmony_ci vfloat4 p = POLY4(m, 368cc1dc7a3Sopenharmony_ci 2.8882704548164776201f, 369cc1dc7a3Sopenharmony_ci -2.52074962577807006663f, 370cc1dc7a3Sopenharmony_ci 1.48116647521213171641f, 371cc1dc7a3Sopenharmony_ci -0.465725644288844778798f, 372cc1dc7a3Sopenharmony_ci 0.0596515482674574969533f); 373cc1dc7a3Sopenharmony_ci 374cc1dc7a3Sopenharmony_ci // Increases the polynomial degree, but ensures that log2(1) == 0 375cc1dc7a3Sopenharmony_ci p = p * (m - 1.0f); 376cc1dc7a3Sopenharmony_ci 377cc1dc7a3Sopenharmony_ci return p + e; 378cc1dc7a3Sopenharmony_ci} 379cc1dc7a3Sopenharmony_ci 380cc1dc7a3Sopenharmony_ci/** 381cc1dc7a3Sopenharmony_ci * @brief Compute an approximate pow(x, y) for each lane in the vector. 382cc1dc7a3Sopenharmony_ci * 383cc1dc7a3Sopenharmony_ci * Power function based on the exp2(log2(x) * y) transform. 384cc1dc7a3Sopenharmony_ci */ 385cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y) 386cc1dc7a3Sopenharmony_ci{ 387cc1dc7a3Sopenharmony_ci vmask4 zero_mask = y == vfloat4(0.0f); 388cc1dc7a3Sopenharmony_ci vfloat4 estimate = exp2(log2(x) * y); 389cc1dc7a3Sopenharmony_ci 390cc1dc7a3Sopenharmony_ci // Guarantee that y == 0 returns exactly 1.0f 391cc1dc7a3Sopenharmony_ci return select(estimate, vfloat4(1.0f), zero_mask); 392cc1dc7a3Sopenharmony_ci} 393cc1dc7a3Sopenharmony_ci 394cc1dc7a3Sopenharmony_ci/** 395cc1dc7a3Sopenharmony_ci * @brief Count the leading zeros for each lane in @c a. 396cc1dc7a3Sopenharmony_ci * 397cc1dc7a3Sopenharmony_ci * Valid for all data values of @c a; will return a per-lane value [0, 32]. 398cc1dc7a3Sopenharmony_ci */ 399cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vint4 clz(vint4 a) 400cc1dc7a3Sopenharmony_ci{ 401cc1dc7a3Sopenharmony_ci // This function is a horrible abuse of floating point exponents to convert 402cc1dc7a3Sopenharmony_ci // the original integer value into a 2^N encoding we can recover easily. 403cc1dc7a3Sopenharmony_ci 404cc1dc7a3Sopenharmony_ci // Convert to float without risk of rounding up by keeping only top 8 bits. 405cc1dc7a3Sopenharmony_ci // This trick is is guaranteed to keep top 8 bits and clear the 9th. 406cc1dc7a3Sopenharmony_ci a = (~lsr<8>(a)) & a; 407cc1dc7a3Sopenharmony_ci a = float_as_int(int_to_float(a)); 408cc1dc7a3Sopenharmony_ci 409cc1dc7a3Sopenharmony_ci // Extract and unbias exponent 410cc1dc7a3Sopenharmony_ci a = vint4(127 + 31) - lsr<23>(a); 411cc1dc7a3Sopenharmony_ci 412cc1dc7a3Sopenharmony_ci // Clamp result to a valid 32-bit range 413cc1dc7a3Sopenharmony_ci return clamp(0, 32, a); 414cc1dc7a3Sopenharmony_ci} 415cc1dc7a3Sopenharmony_ci 416cc1dc7a3Sopenharmony_ci/** 417cc1dc7a3Sopenharmony_ci * @brief Return lanewise 2^a for each lane in @c a. 418cc1dc7a3Sopenharmony_ci * 419cc1dc7a3Sopenharmony_ci * Use of signed int means that this is only valid for values in range [0, 31]. 420cc1dc7a3Sopenharmony_ci */ 421cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a) 422cc1dc7a3Sopenharmony_ci{ 423cc1dc7a3Sopenharmony_ci // 2^30 is the largest signed number than can be represented 424cc1dc7a3Sopenharmony_ci assert(all(a < vint4(31))); 425cc1dc7a3Sopenharmony_ci 426cc1dc7a3Sopenharmony_ci // This function is a horrible abuse of floating point to use the exponent 427cc1dc7a3Sopenharmony_ci // and float conversion to generate a 2^N multiple. 428cc1dc7a3Sopenharmony_ci 429cc1dc7a3Sopenharmony_ci // Bias the exponent 430cc1dc7a3Sopenharmony_ci vint4 exp = a + 127; 431cc1dc7a3Sopenharmony_ci exp = lsl<23>(exp); 432cc1dc7a3Sopenharmony_ci 433cc1dc7a3Sopenharmony_ci // Reinterpret the bits as a float, and then convert to an int 434cc1dc7a3Sopenharmony_ci vfloat4 f = int_as_float(exp); 435cc1dc7a3Sopenharmony_ci return float_to_int(f); 436cc1dc7a3Sopenharmony_ci} 437cc1dc7a3Sopenharmony_ci 438cc1dc7a3Sopenharmony_ci/** 439cc1dc7a3Sopenharmony_ci * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1]. 440cc1dc7a3Sopenharmony_ci */ 441cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p) 442cc1dc7a3Sopenharmony_ci{ 443cc1dc7a3Sopenharmony_ci vint4 fp16_one = vint4(0x3C00); 444cc1dc7a3Sopenharmony_ci vint4 fp16_small = lsl<8>(p); 445cc1dc7a3Sopenharmony_ci 446cc1dc7a3Sopenharmony_ci vmask4 is_one = p == vint4(0xFFFF); 447cc1dc7a3Sopenharmony_ci vmask4 is_small = p < vint4(4); 448cc1dc7a3Sopenharmony_ci 449cc1dc7a3Sopenharmony_ci // Manually inline clz() on Visual Studio to avoid release build codegen bug 450cc1dc7a3Sopenharmony_ci // see https://github.com/ARM-software/astc-encoder/issues/259 451cc1dc7a3Sopenharmony_ci#if !defined(__clang__) && defined(_MSC_VER) 452cc1dc7a3Sopenharmony_ci vint4 a = (~lsr<8>(p)) & p; 453cc1dc7a3Sopenharmony_ci a = float_as_int(int_to_float(a)); 454cc1dc7a3Sopenharmony_ci a = vint4(127 + 31) - lsr<23>(a); 455cc1dc7a3Sopenharmony_ci vint4 lz = clamp(0, 32, a) - 16; 456cc1dc7a3Sopenharmony_ci#else 457cc1dc7a3Sopenharmony_ci vint4 lz = clz(p) - 16; 458cc1dc7a3Sopenharmony_ci#endif 459cc1dc7a3Sopenharmony_ci 460cc1dc7a3Sopenharmony_ci p = p * two_to_the_n(lz + 1); 461cc1dc7a3Sopenharmony_ci p = p & vint4(0xFFFF); 462cc1dc7a3Sopenharmony_ci 463cc1dc7a3Sopenharmony_ci p = lsr<6>(p); 464cc1dc7a3Sopenharmony_ci 465cc1dc7a3Sopenharmony_ci p = p | lsl<10>(vint4(14) - lz); 466cc1dc7a3Sopenharmony_ci 467cc1dc7a3Sopenharmony_ci vint4 r = select(p, fp16_one, is_one); 468cc1dc7a3Sopenharmony_ci r = select(r, fp16_small, is_small); 469cc1dc7a3Sopenharmony_ci return r; 470cc1dc7a3Sopenharmony_ci} 471cc1dc7a3Sopenharmony_ci 472cc1dc7a3Sopenharmony_ci/** 473cc1dc7a3Sopenharmony_ci * @brief Convert 16-bit LNS to float16. 474cc1dc7a3Sopenharmony_ci */ 475cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p) 476cc1dc7a3Sopenharmony_ci{ 477cc1dc7a3Sopenharmony_ci vint4 mc = p & 0x7FF; 478cc1dc7a3Sopenharmony_ci vint4 ec = lsr<11>(p); 479cc1dc7a3Sopenharmony_ci 480cc1dc7a3Sopenharmony_ci vint4 mc_512 = mc * 3; 481cc1dc7a3Sopenharmony_ci vmask4 mask_512 = mc < vint4(512); 482cc1dc7a3Sopenharmony_ci 483cc1dc7a3Sopenharmony_ci vint4 mc_1536 = mc * 4 - 512; 484cc1dc7a3Sopenharmony_ci vmask4 mask_1536 = mc < vint4(1536); 485cc1dc7a3Sopenharmony_ci 486cc1dc7a3Sopenharmony_ci vint4 mc_else = mc * 5 - 2048; 487cc1dc7a3Sopenharmony_ci 488cc1dc7a3Sopenharmony_ci vint4 mt = mc_else; 489cc1dc7a3Sopenharmony_ci mt = select(mt, mc_1536, mask_1536); 490cc1dc7a3Sopenharmony_ci mt = select(mt, mc_512, mask_512); 491cc1dc7a3Sopenharmony_ci 492cc1dc7a3Sopenharmony_ci vint4 res = lsl<10>(ec) | lsr<3>(mt); 493cc1dc7a3Sopenharmony_ci return min(res, vint4(0x7BFF)); 494cc1dc7a3Sopenharmony_ci} 495cc1dc7a3Sopenharmony_ci 496cc1dc7a3Sopenharmony_ci/** 497cc1dc7a3Sopenharmony_ci * @brief Extract mantissa and exponent of a float value. 498cc1dc7a3Sopenharmony_ci * 499cc1dc7a3Sopenharmony_ci * @param a The input value. 500cc1dc7a3Sopenharmony_ci * @param[out] exp The output exponent. 501cc1dc7a3Sopenharmony_ci * 502cc1dc7a3Sopenharmony_ci * @return The mantissa. 503cc1dc7a3Sopenharmony_ci */ 504cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp) 505cc1dc7a3Sopenharmony_ci{ 506cc1dc7a3Sopenharmony_ci // Interpret the bits as an integer 507cc1dc7a3Sopenharmony_ci vint4 ai = float_as_int(a); 508cc1dc7a3Sopenharmony_ci 509cc1dc7a3Sopenharmony_ci // Extract and unbias the exponent 510cc1dc7a3Sopenharmony_ci exp = (lsr<23>(ai) & 0xFF) - 126; 511cc1dc7a3Sopenharmony_ci 512cc1dc7a3Sopenharmony_ci // Extract and unbias the mantissa 513cc1dc7a3Sopenharmony_ci vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000; 514cc1dc7a3Sopenharmony_ci return int_as_float(manti); 515cc1dc7a3Sopenharmony_ci} 516cc1dc7a3Sopenharmony_ci 517cc1dc7a3Sopenharmony_ci/** 518cc1dc7a3Sopenharmony_ci * @brief Convert float to 16-bit LNS. 519cc1dc7a3Sopenharmony_ci */ 520cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a) 521cc1dc7a3Sopenharmony_ci{ 522cc1dc7a3Sopenharmony_ci vint4 exp; 523cc1dc7a3Sopenharmony_ci vfloat4 mant = frexp(a, exp); 524cc1dc7a3Sopenharmony_ci 525cc1dc7a3Sopenharmony_ci // Do these early before we start messing about ... 526cc1dc7a3Sopenharmony_ci vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f)); 527cc1dc7a3Sopenharmony_ci vmask4 mask_infinity = a >= vfloat4(65536.0f); 528cc1dc7a3Sopenharmony_ci 529cc1dc7a3Sopenharmony_ci // If input is smaller than 2^-14, multiply by 2^25 and don't bias. 530cc1dc7a3Sopenharmony_ci vmask4 exp_lt_m13 = exp < vint4(-13); 531cc1dc7a3Sopenharmony_ci 532cc1dc7a3Sopenharmony_ci vfloat4 a1a = a * 33554432.0f; 533cc1dc7a3Sopenharmony_ci vint4 expa = vint4::zero(); 534cc1dc7a3Sopenharmony_ci 535cc1dc7a3Sopenharmony_ci vfloat4 a1b = (mant - 0.5f) * 4096; 536cc1dc7a3Sopenharmony_ci vint4 expb = exp + 14; 537cc1dc7a3Sopenharmony_ci 538cc1dc7a3Sopenharmony_ci a = select(a1b, a1a, exp_lt_m13); 539cc1dc7a3Sopenharmony_ci exp = select(expb, expa, exp_lt_m13); 540cc1dc7a3Sopenharmony_ci 541cc1dc7a3Sopenharmony_ci vmask4 a_lt_384 = a < vfloat4(384.0f); 542cc1dc7a3Sopenharmony_ci vmask4 a_lt_1408 = a <= vfloat4(1408.0f); 543cc1dc7a3Sopenharmony_ci 544cc1dc7a3Sopenharmony_ci vfloat4 a2a = a * (4.0f / 3.0f); 545cc1dc7a3Sopenharmony_ci vfloat4 a2b = a + 128.0f; 546cc1dc7a3Sopenharmony_ci vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f); 547cc1dc7a3Sopenharmony_ci 548cc1dc7a3Sopenharmony_ci a = a2c; 549cc1dc7a3Sopenharmony_ci a = select(a, a2b, a_lt_1408); 550cc1dc7a3Sopenharmony_ci a = select(a, a2a, a_lt_384); 551cc1dc7a3Sopenharmony_ci 552cc1dc7a3Sopenharmony_ci a = a + (int_to_float(exp) * 2048.0f) + 1.0f; 553cc1dc7a3Sopenharmony_ci 554cc1dc7a3Sopenharmony_ci a = select(a, vfloat4(65535.0f), mask_infinity); 555cc1dc7a3Sopenharmony_ci a = select(a, vfloat4::zero(), mask_underflow_nan); 556cc1dc7a3Sopenharmony_ci 557cc1dc7a3Sopenharmony_ci return a; 558cc1dc7a3Sopenharmony_ci} 559cc1dc7a3Sopenharmony_ci 560cc1dc7a3Sopenharmony_cinamespace astc 561cc1dc7a3Sopenharmony_ci{ 562cc1dc7a3Sopenharmony_ci 563cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE float pow(float x, float y) 564cc1dc7a3Sopenharmony_ci{ 565cc1dc7a3Sopenharmony_ci return pow(vfloat4(x), vfloat4(y)).lane<0>(); 566cc1dc7a3Sopenharmony_ci} 567cc1dc7a3Sopenharmony_ci 568cc1dc7a3Sopenharmony_ci} 569cc1dc7a3Sopenharmony_ci 570cc1dc7a3Sopenharmony_ci#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED 571