// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief Generic 4x32-bit vector functions.
 *
 * This module implements generic 4-wide vector functions that are valid for
 * all instruction sets, typically implemented using lower level 4-wide
 * operations that are ISA-specific.
24cc1dc7a3Sopenharmony_ci */ 25cc1dc7a3Sopenharmony_ci 26cc1dc7a3Sopenharmony_ci#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED 27cc1dc7a3Sopenharmony_ci#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED 28cc1dc7a3Sopenharmony_ci 29cc1dc7a3Sopenharmony_ci#ifndef ASTCENC_SIMD_INLINE 30cc1dc7a3Sopenharmony_ci #error "Include astcenc_vecmathlib.h, do not include directly" 31cc1dc7a3Sopenharmony_ci#endif 32cc1dc7a3Sopenharmony_ci 33cc1dc7a3Sopenharmony_ci#include <cstdio> 34cc1dc7a3Sopenharmony_ci 35cc1dc7a3Sopenharmony_ci// ============================================================================ 36cc1dc7a3Sopenharmony_ci// vmask4 operators and functions 37cc1dc7a3Sopenharmony_ci// ============================================================================ 38cc1dc7a3Sopenharmony_ci 39cc1dc7a3Sopenharmony_ci/** 40cc1dc7a3Sopenharmony_ci * @brief True if any lanes are enabled, false otherwise. 41cc1dc7a3Sopenharmony_ci */ 42cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE bool any(vmask4 a) 43cc1dc7a3Sopenharmony_ci{ 44cc1dc7a3Sopenharmony_ci return mask(a) != 0; 45cc1dc7a3Sopenharmony_ci} 46cc1dc7a3Sopenharmony_ci 47cc1dc7a3Sopenharmony_ci/** 48cc1dc7a3Sopenharmony_ci * @brief True if all lanes are enabled, false otherwise. 49cc1dc7a3Sopenharmony_ci */ 50cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE bool all(vmask4 a) 51cc1dc7a3Sopenharmony_ci{ 52cc1dc7a3Sopenharmony_ci return mask(a) == 0xF; 53cc1dc7a3Sopenharmony_ci} 54cc1dc7a3Sopenharmony_ci 55cc1dc7a3Sopenharmony_ci// ============================================================================ 56cc1dc7a3Sopenharmony_ci// vint4 operators and functions 57cc1dc7a3Sopenharmony_ci// ============================================================================ 58cc1dc7a3Sopenharmony_ci 59cc1dc7a3Sopenharmony_ci/** 60cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar addition. 
61cc1dc7a3Sopenharmony_ci */ 62cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b) 63cc1dc7a3Sopenharmony_ci{ 64cc1dc7a3Sopenharmony_ci return a + vint4(b); 65cc1dc7a3Sopenharmony_ci} 66cc1dc7a3Sopenharmony_ci 67cc1dc7a3Sopenharmony_ci/** 68cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector incremental addition. 69cc1dc7a3Sopenharmony_ci */ 70cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b) 71cc1dc7a3Sopenharmony_ci{ 72cc1dc7a3Sopenharmony_ci a = a + b; 73cc1dc7a3Sopenharmony_ci return a; 74cc1dc7a3Sopenharmony_ci} 75cc1dc7a3Sopenharmony_ci 76cc1dc7a3Sopenharmony_ci/** 77cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar subtraction. 78cc1dc7a3Sopenharmony_ci */ 79cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b) 80cc1dc7a3Sopenharmony_ci{ 81cc1dc7a3Sopenharmony_ci return a - vint4(b); 82cc1dc7a3Sopenharmony_ci} 83cc1dc7a3Sopenharmony_ci 84cc1dc7a3Sopenharmony_ci/** 85cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar multiplication. 86cc1dc7a3Sopenharmony_ci */ 87cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b) 88cc1dc7a3Sopenharmony_ci{ 89cc1dc7a3Sopenharmony_ci return a * vint4(b); 90cc1dc7a3Sopenharmony_ci} 91cc1dc7a3Sopenharmony_ci 92cc1dc7a3Sopenharmony_ci/** 93cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise or. 94cc1dc7a3Sopenharmony_ci */ 95cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b) 96cc1dc7a3Sopenharmony_ci{ 97cc1dc7a3Sopenharmony_ci return a | vint4(b); 98cc1dc7a3Sopenharmony_ci} 99cc1dc7a3Sopenharmony_ci 100cc1dc7a3Sopenharmony_ci/** 101cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise and. 
102cc1dc7a3Sopenharmony_ci */ 103cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b) 104cc1dc7a3Sopenharmony_ci{ 105cc1dc7a3Sopenharmony_ci return a & vint4(b); 106cc1dc7a3Sopenharmony_ci} 107cc1dc7a3Sopenharmony_ci 108cc1dc7a3Sopenharmony_ci/** 109cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise xor. 110cc1dc7a3Sopenharmony_ci */ 111cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b) 112cc1dc7a3Sopenharmony_ci{ 113cc1dc7a3Sopenharmony_ci return a ^ vint4(b); 114cc1dc7a3Sopenharmony_ci} 115cc1dc7a3Sopenharmony_ci 116cc1dc7a3Sopenharmony_ci/** 117cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between min and max. 118cc1dc7a3Sopenharmony_ci */ 119cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a) 120cc1dc7a3Sopenharmony_ci{ 121cc1dc7a3Sopenharmony_ci return min(max(a, vint4(minv)), vint4(maxv)); 122cc1dc7a3Sopenharmony_ci} 123cc1dc7a3Sopenharmony_ci 124cc1dc7a3Sopenharmony_ci/** 125cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of RGB vector lanes as a scalar. 126cc1dc7a3Sopenharmony_ci */ 127cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a) 128cc1dc7a3Sopenharmony_ci{ 129cc1dc7a3Sopenharmony_ci return a.lane<0>() + a.lane<1>() + a.lane<2>(); 130cc1dc7a3Sopenharmony_ci} 131cc1dc7a3Sopenharmony_ci 132cc1dc7a3Sopenharmony_ci// ============================================================================ 133cc1dc7a3Sopenharmony_ci// vfloat4 operators and functions 134cc1dc7a3Sopenharmony_ci// ============================================================================ 135cc1dc7a3Sopenharmony_ci 136cc1dc7a3Sopenharmony_ci/** 137cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector incremental addition. 
138cc1dc7a3Sopenharmony_ci */ 139cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b) 140cc1dc7a3Sopenharmony_ci{ 141cc1dc7a3Sopenharmony_ci a = a + b; 142cc1dc7a3Sopenharmony_ci return a; 143cc1dc7a3Sopenharmony_ci} 144cc1dc7a3Sopenharmony_ci 145cc1dc7a3Sopenharmony_ci/** 146cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar addition. 147cc1dc7a3Sopenharmony_ci */ 148cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b) 149cc1dc7a3Sopenharmony_ci{ 150cc1dc7a3Sopenharmony_ci return a + vfloat4(b); 151cc1dc7a3Sopenharmony_ci} 152cc1dc7a3Sopenharmony_ci 153cc1dc7a3Sopenharmony_ci/** 154cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar subtraction. 155cc1dc7a3Sopenharmony_ci */ 156cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b) 157cc1dc7a3Sopenharmony_ci{ 158cc1dc7a3Sopenharmony_ci return a - vfloat4(b); 159cc1dc7a3Sopenharmony_ci} 160cc1dc7a3Sopenharmony_ci 161cc1dc7a3Sopenharmony_ci/** 162cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar multiplication. 163cc1dc7a3Sopenharmony_ci */ 164cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b) 165cc1dc7a3Sopenharmony_ci{ 166cc1dc7a3Sopenharmony_ci return a * vfloat4(b); 167cc1dc7a3Sopenharmony_ci} 168cc1dc7a3Sopenharmony_ci 169cc1dc7a3Sopenharmony_ci/** 170cc1dc7a3Sopenharmony_ci * @brief Overload: scalar by vector multiplication. 171cc1dc7a3Sopenharmony_ci */ 172cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b) 173cc1dc7a3Sopenharmony_ci{ 174cc1dc7a3Sopenharmony_ci return vfloat4(a) * b; 175cc1dc7a3Sopenharmony_ci} 176cc1dc7a3Sopenharmony_ci 177cc1dc7a3Sopenharmony_ci/** 178cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar division. 
179cc1dc7a3Sopenharmony_ci */ 180cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b) 181cc1dc7a3Sopenharmony_ci{ 182cc1dc7a3Sopenharmony_ci return a / vfloat4(b); 183cc1dc7a3Sopenharmony_ci} 184cc1dc7a3Sopenharmony_ci 185cc1dc7a3Sopenharmony_ci/** 186cc1dc7a3Sopenharmony_ci * @brief Overload: scalar by vector division. 187cc1dc7a3Sopenharmony_ci */ 188cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b) 189cc1dc7a3Sopenharmony_ci{ 190cc1dc7a3Sopenharmony_ci return vfloat4(a) / b; 191cc1dc7a3Sopenharmony_ci} 192cc1dc7a3Sopenharmony_ci 193cc1dc7a3Sopenharmony_ci/** 194cc1dc7a3Sopenharmony_ci * @brief Return the min vector of a vector and a scalar. 195cc1dc7a3Sopenharmony_ci * 196cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane. 197cc1dc7a3Sopenharmony_ci */ 198cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b) 199cc1dc7a3Sopenharmony_ci{ 200cc1dc7a3Sopenharmony_ci return min(a, vfloat4(b)); 201cc1dc7a3Sopenharmony_ci} 202cc1dc7a3Sopenharmony_ci 203cc1dc7a3Sopenharmony_ci/** 204cc1dc7a3Sopenharmony_ci * @brief Return the max vector of a vector and a scalar. 205cc1dc7a3Sopenharmony_ci * 206cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane. 207cc1dc7a3Sopenharmony_ci */ 208cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b) 209cc1dc7a3Sopenharmony_ci{ 210cc1dc7a3Sopenharmony_ci return max(a, vfloat4(b)); 211cc1dc7a3Sopenharmony_ci} 212cc1dc7a3Sopenharmony_ci 213cc1dc7a3Sopenharmony_ci/** 214cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between min and max. 215cc1dc7a3Sopenharmony_ci * 216cc1dc7a3Sopenharmony_ci * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN 217cc1dc7a3Sopenharmony_ci * then @c min will be returned for that lane. 
218cc1dc7a3Sopenharmony_ci */ 219cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a) 220cc1dc7a3Sopenharmony_ci{ 221cc1dc7a3Sopenharmony_ci // Do not reorder - second operand will return if either is NaN 222cc1dc7a3Sopenharmony_ci return min(max(a, minv), maxv); 223cc1dc7a3Sopenharmony_ci} 224cc1dc7a3Sopenharmony_ci 225cc1dc7a3Sopenharmony_ci/** 226cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between 0.0f and max. 227cc1dc7a3Sopenharmony_ci * 228cc1dc7a3Sopenharmony_ci * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will 229cc1dc7a3Sopenharmony_ci * be returned for that lane. 230cc1dc7a3Sopenharmony_ci */ 231cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a) 232cc1dc7a3Sopenharmony_ci{ 233cc1dc7a3Sopenharmony_ci // Do not reorder - second operand will return if either is NaN 234cc1dc7a3Sopenharmony_ci return min(max(a, vfloat4::zero()), maxv); 235cc1dc7a3Sopenharmony_ci} 236cc1dc7a3Sopenharmony_ci 237cc1dc7a3Sopenharmony_ci/** 238cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between 0.0f and 1.0f. 239cc1dc7a3Sopenharmony_ci * 240cc1dc7a3Sopenharmony_ci * If @c a is NaN then zero will be returned for that lane. 241cc1dc7a3Sopenharmony_ci */ 242cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a) 243cc1dc7a3Sopenharmony_ci{ 244cc1dc7a3Sopenharmony_ci // Do not reorder - second operand will return if either is NaN 245cc1dc7a3Sopenharmony_ci return min(max(a, vfloat4::zero()), 1.0f); 246cc1dc7a3Sopenharmony_ci} 247cc1dc7a3Sopenharmony_ci 248cc1dc7a3Sopenharmony_ci/** 249cc1dc7a3Sopenharmony_ci * @brief Return the horizontal minimum of a vector. 
250cc1dc7a3Sopenharmony_ci */ 251cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmin_s(vfloat4 a) 252cc1dc7a3Sopenharmony_ci{ 253cc1dc7a3Sopenharmony_ci return hmin(a).lane<0>(); 254cc1dc7a3Sopenharmony_ci} 255cc1dc7a3Sopenharmony_ci 256cc1dc7a3Sopenharmony_ci/** 257cc1dc7a3Sopenharmony_ci * @brief Return the horizontal min of RGB vector lanes as a scalar. 258cc1dc7a3Sopenharmony_ci */ 259cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a) 260cc1dc7a3Sopenharmony_ci{ 261cc1dc7a3Sopenharmony_ci a.set_lane<3>(a.lane<0>()); 262cc1dc7a3Sopenharmony_ci return hmin_s(a); 263cc1dc7a3Sopenharmony_ci} 264cc1dc7a3Sopenharmony_ci 265cc1dc7a3Sopenharmony_ci/** 266cc1dc7a3Sopenharmony_ci * @brief Return the horizontal maximum of a vector. 267cc1dc7a3Sopenharmony_ci */ 268cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmax_s(vfloat4 a) 269cc1dc7a3Sopenharmony_ci{ 270cc1dc7a3Sopenharmony_ci return hmax(a).lane<0>(); 271cc1dc7a3Sopenharmony_ci} 272cc1dc7a3Sopenharmony_ci 273cc1dc7a3Sopenharmony_ci/** 274cc1dc7a3Sopenharmony_ci * @brief Accumulate lane-wise sums for a vector. 275cc1dc7a3Sopenharmony_ci */ 276cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a) 277cc1dc7a3Sopenharmony_ci{ 278cc1dc7a3Sopenharmony_ci accum = accum + a; 279cc1dc7a3Sopenharmony_ci} 280cc1dc7a3Sopenharmony_ci 281cc1dc7a3Sopenharmony_ci/** 282cc1dc7a3Sopenharmony_ci * @brief Accumulate lane-wise sums for a masked vector. 
283cc1dc7a3Sopenharmony_ci */ 284cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m) 285cc1dc7a3Sopenharmony_ci{ 286cc1dc7a3Sopenharmony_ci a = select(vfloat4::zero(), a, m); 287cc1dc7a3Sopenharmony_ci haccumulate(accum, a); 288cc1dc7a3Sopenharmony_ci} 289cc1dc7a3Sopenharmony_ci 290cc1dc7a3Sopenharmony_ci#define ASTCENC_USE_COMMON_GATHERF 291cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, const uint8_t* idx) 292cc1dc7a3Sopenharmony_ci{ 293cc1dc7a3Sopenharmony_ci return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]); // index 0,1,2,3 294cc1dc7a3Sopenharmony_ci} 295cc1dc7a3Sopenharmony_ci 296cc1dc7a3Sopenharmony_ci/** 297cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of RGB vector lanes as a scalar. 298cc1dc7a3Sopenharmony_ci */ 299cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) 300cc1dc7a3Sopenharmony_ci{ 301cc1dc7a3Sopenharmony_ci return a.lane<0>() + a.lane<1>() + a.lane<2>(); 302cc1dc7a3Sopenharmony_ci} 303cc1dc7a3Sopenharmony_ci 304cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_ADDV) 305cc1dc7a3Sopenharmony_ci/** 306cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of a vector. 307cc1dc7a3Sopenharmony_ci */ 308cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a) 309cc1dc7a3Sopenharmony_ci{ 310cc1dc7a3Sopenharmony_ci return a.lane<0>() + a.lane<1>() + a.lane<2>() + a.lane<3>(); // channel 0,1,2,3 311cc1dc7a3Sopenharmony_ci} 312cc1dc7a3Sopenharmony_ci#endif 313cc1dc7a3Sopenharmony_ci 314cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT) 315cc1dc7a3Sopenharmony_ci 316cc1dc7a3Sopenharmony_ci/** 317cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the full 4 lanes, returning scalar. 
318cc1dc7a3Sopenharmony_ci */ 319cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b) 320cc1dc7a3Sopenharmony_ci{ 321cc1dc7a3Sopenharmony_ci vfloat4 m = a * b; 322cc1dc7a3Sopenharmony_ci return hadd_s(m); 323cc1dc7a3Sopenharmony_ci} 324cc1dc7a3Sopenharmony_ci 325cc1dc7a3Sopenharmony_ci/** 326cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the full 4 lanes, returning vector. 327cc1dc7a3Sopenharmony_ci */ 328cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b) 329cc1dc7a3Sopenharmony_ci{ 330cc1dc7a3Sopenharmony_ci vfloat4 m = a * b; 331cc1dc7a3Sopenharmony_ci return vfloat4(hadd_s(m)); 332cc1dc7a3Sopenharmony_ci} 333cc1dc7a3Sopenharmony_ci 334cc1dc7a3Sopenharmony_ci/** 335cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the bottom 3 lanes, returning scalar. 336cc1dc7a3Sopenharmony_ci */ 337cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b) 338cc1dc7a3Sopenharmony_ci{ 339cc1dc7a3Sopenharmony_ci vfloat4 m = a * b; 340cc1dc7a3Sopenharmony_ci return hadd_rgb_s(m); 341cc1dc7a3Sopenharmony_ci} 342cc1dc7a3Sopenharmony_ci 343cc1dc7a3Sopenharmony_ci/** 344cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the bottom 3 lanes, returning vector. 345cc1dc7a3Sopenharmony_ci */ 346cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) 347cc1dc7a3Sopenharmony_ci{ 348cc1dc7a3Sopenharmony_ci vfloat4 m = a * b; 349cc1dc7a3Sopenharmony_ci float d3 = hadd_rgb_s(m); 350cc1dc7a3Sopenharmony_ci return vfloat4(d3, d3, d3, 0.0f); 351cc1dc7a3Sopenharmony_ci} 352cc1dc7a3Sopenharmony_ci 353cc1dc7a3Sopenharmony_ci#endif 354cc1dc7a3Sopenharmony_ci 355cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_POPCOUNT) 356cc1dc7a3Sopenharmony_ci 357cc1dc7a3Sopenharmony_ci/** 358cc1dc7a3Sopenharmony_ci * @brief Population bit count. 359cc1dc7a3Sopenharmony_ci * 360cc1dc7a3Sopenharmony_ci * @param v The value to population count. 
361cc1dc7a3Sopenharmony_ci * 362cc1dc7a3Sopenharmony_ci * @return The number of 1 bits. 363cc1dc7a3Sopenharmony_ci */ 364cc1dc7a3Sopenharmony_cistatic inline int popcount(uint64_t v) 365cc1dc7a3Sopenharmony_ci{ 366cc1dc7a3Sopenharmony_ci uint64_t mask1 = 0x5555555555555555ULL; 367cc1dc7a3Sopenharmony_ci uint64_t mask2 = 0x3333333333333333ULL; 368cc1dc7a3Sopenharmony_ci uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL; 369cc1dc7a3Sopenharmony_ci v -= (v >> 1) & mask1; 370cc1dc7a3Sopenharmony_ci v = (v & mask2) + ((v >> 2) & mask2); 371cc1dc7a3Sopenharmony_ci v += v >> 4; 372cc1dc7a3Sopenharmony_ci v &= mask3; 373cc1dc7a3Sopenharmony_ci v *= 0x0101010101010101ULL; 374cc1dc7a3Sopenharmony_ci v >>= 56; 375cc1dc7a3Sopenharmony_ci return static_cast<int>(v); 376cc1dc7a3Sopenharmony_ci} 377cc1dc7a3Sopenharmony_ci 378cc1dc7a3Sopenharmony_ci#endif 379cc1dc7a3Sopenharmony_ci 380cc1dc7a3Sopenharmony_ci/** 381cc1dc7a3Sopenharmony_ci * @brief Apply signed bit transfer. 382cc1dc7a3Sopenharmony_ci * 383cc1dc7a3Sopenharmony_ci * @param input0 The first encoded endpoint. 384cc1dc7a3Sopenharmony_ci * @param input1 The second encoded endpoint. 385cc1dc7a3Sopenharmony_ci */ 386cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE void bit_transfer_signed( 387cc1dc7a3Sopenharmony_ci vint4& input0, 388cc1dc7a3Sopenharmony_ci vint4& input1 389cc1dc7a3Sopenharmony_ci) { 390cc1dc7a3Sopenharmony_ci input1 = lsr<1>(input1) | (input0 & 0x80); 391cc1dc7a3Sopenharmony_ci input0 = lsr<1>(input0) & 0x3F; 392cc1dc7a3Sopenharmony_ci 393cc1dc7a3Sopenharmony_ci vmask4 mask = (input0 & 0x20) != vint4::zero(); 394cc1dc7a3Sopenharmony_ci input0 = select(input0, input0 - 0x40, mask); 395cc1dc7a3Sopenharmony_ci} 396cc1dc7a3Sopenharmony_ci 397cc1dc7a3Sopenharmony_ci/** 398cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of ints. 
399cc1dc7a3Sopenharmony_ci */ 400cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vint4 a) 401cc1dc7a3Sopenharmony_ci{ 402cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS int v[4]; 403cc1dc7a3Sopenharmony_ci storea(a, v); 404cc1dc7a3Sopenharmony_ci printf("v4_i32:\n %8d %8d %8d %8d\n", 405cc1dc7a3Sopenharmony_ci v[0], v[1], v[2], v[3]); 406cc1dc7a3Sopenharmony_ci} 407cc1dc7a3Sopenharmony_ci 408cc1dc7a3Sopenharmony_ci/** 409cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of ints. 410cc1dc7a3Sopenharmony_ci */ 411cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void printx(vint4 a) 412cc1dc7a3Sopenharmony_ci{ 413cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS int v[4]; 414cc1dc7a3Sopenharmony_ci storea(a, v); 415cc1dc7a3Sopenharmony_ci printf("v4_i32:\n %08x %08x %08x %08x\n", 416cc1dc7a3Sopenharmony_ci v[0], v[1], v[2], v[3]); 417cc1dc7a3Sopenharmony_ci} 418cc1dc7a3Sopenharmony_ci 419cc1dc7a3Sopenharmony_ci/** 420cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of floats. 421cc1dc7a3Sopenharmony_ci */ 422cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vfloat4 a) 423cc1dc7a3Sopenharmony_ci{ 424cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float v[4]; 425cc1dc7a3Sopenharmony_ci storea(a, v); 426cc1dc7a3Sopenharmony_ci printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", 427cc1dc7a3Sopenharmony_ci static_cast<double>(v[0]), static_cast<double>(v[1]), 428cc1dc7a3Sopenharmony_ci static_cast<double>(v[2]), static_cast<double>(v[3])); 429cc1dc7a3Sopenharmony_ci} 430cc1dc7a3Sopenharmony_ci 431cc1dc7a3Sopenharmony_ci/** 432cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of masks. 433cc1dc7a3Sopenharmony_ci */ 434cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vmask4 a) 435cc1dc7a3Sopenharmony_ci{ 436cc1dc7a3Sopenharmony_ci print(select(vint4(0), vint4(1), a)); 437cc1dc7a3Sopenharmony_ci} 438cc1dc7a3Sopenharmony_ci 439cc1dc7a3Sopenharmony_ci#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED 440