astc-encoder/Source/astcenc_vecmathlib_common_4.h

cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0
cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
cc1dc7a3Sopenharmony_ci// Copyright 2020-2024 Arm Limited
cc1dc7a3Sopenharmony_ci//
cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not
cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy
cc1dc7a3Sopenharmony_ci// of the License at:
cc1dc7a3Sopenharmony_ci//
cc1dc7a3Sopenharmony_ci//     http://www.apache.org/licenses/LICENSE-2.0
cc1dc7a3Sopenharmony_ci//
cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software
cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations
cc1dc7a3Sopenharmony_ci// under the License.
cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Generic 4x32-bit vector functions.
cc1dc7a3Sopenharmony_ci *
cc1dc7a3Sopenharmony_ci * This module implements generic 4-wide vector functions that are valid for
cc1dc7a3Sopenharmony_ci * all instruction sets, typically implemented using lower level 4-wide
cc1dc7a3Sopenharmony_ci * operations that are ISA-specific.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
cc1dc7a3Sopenharmony_ci#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#ifndef ASTCENC_SIMD_INLINE
cc1dc7a3Sopenharmony_ci	#error "Include astcenc_vecmathlib.h, do not include directly"
cc1dc7a3Sopenharmony_ci#endif
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#include <cstdio>
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci// ============================================================================
cc1dc7a3Sopenharmony_ci// vmask4 operators and functions
cc1dc7a3Sopenharmony_ci// ============================================================================
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief True if any lanes are enabled, false otherwise.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE bool any(vmask4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return mask(a) != 0;
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief True if all lanes are enabled, false otherwise.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE bool all(vmask4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return mask(a) == 0xF;
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci// ============================================================================
cc1dc7a3Sopenharmony_ci// vint4 operators and functions
cc1dc7a3Sopenharmony_ci// ============================================================================
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar addition.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a + vint4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector incremental addition.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	a = a + b;
cc1dc7a3Sopenharmony_ci	return a;
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar subtraction.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a - vint4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar multiplication.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a * vint4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise or.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a | vint4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise and.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a & vint4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise xor.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a ^ vint4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between min and max.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return min(max(a, vint4(minv)), vint4(maxv));
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of RGB vector lanes as a scalar.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a.lane<0>() + a.lane<1>() + a.lane<2>();
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci// ============================================================================
cc1dc7a3Sopenharmony_ci// vfloat4 operators and functions
cc1dc7a3Sopenharmony_ci// ============================================================================
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector incremental addition.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	a = a + b;
cc1dc7a3Sopenharmony_ci	return a;
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar addition.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a + vfloat4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar subtraction.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a - vfloat4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar multiplication.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a * vfloat4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: scalar by vector multiplication.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return vfloat4(a) * b;
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar division.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a / vfloat4(b);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Overload: scalar by vector division.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return vfloat4(a) / b;
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the min vector of a vector and a scalar.
cc1dc7a3Sopenharmony_ci *
cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return min(a, vfloat4(b));
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the max vector of a vector and a scalar.
cc1dc7a3Sopenharmony_ci *
cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return max(a, vfloat4(b));
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between min and max.
cc1dc7a3Sopenharmony_ci *
cc1dc7a3Sopenharmony_ci * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
cc1dc7a3Sopenharmony_ci * then @c min will be returned for that lane.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	// Do not reorder - second operand will return if either is NaN
cc1dc7a3Sopenharmony_ci	return min(max(a, minv), maxv);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between 0.0f and max.
cc1dc7a3Sopenharmony_ci *
cc1dc7a3Sopenharmony_ci * It is assumed that  @c max is not a NaN value. If @c a is NaN then zero will
cc1dc7a3Sopenharmony_ci * be returned for that lane.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	// Do not reorder - second operand will return if either is NaN
cc1dc7a3Sopenharmony_ci	return min(max(a, vfloat4::zero()), maxv);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between 0.0f and 1.0f.
cc1dc7a3Sopenharmony_ci *
cc1dc7a3Sopenharmony_ci * If @c a is NaN then zero will be returned for that lane.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	// Do not reorder - second operand will return if either is NaN
cc1dc7a3Sopenharmony_ci	return min(max(a, vfloat4::zero()), 1.0f);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the horizontal minimum of a vector.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return hmin(a).lane<0>();
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the horizontal min of RGB vector lanes as a scalar.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	a.set_lane<3>(a.lane<0>());
cc1dc7a3Sopenharmony_ci	return hmin_s(a);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the horizontal maximum of a vector.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return hmax(a).lane<0>();
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Accumulate lane-wise sums for a vector.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	accum = accum + a;
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Accumulate lane-wise sums for a masked vector.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	a = select(vfloat4::zero(), a, m);
cc1dc7a3Sopenharmony_ci	haccumulate(accum, a);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#define ASTCENC_USE_COMMON_GATHERF
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, const uint8_t* idx)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);    // index 0,1,2,3
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of RGB vector lanes as a scalar.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a.lane<0>() + a.lane<1>() + a.lane<2>();
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_ADDV)
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of a vector.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	return a.lane<0>() + a.lane<1>() + a.lane<2>() + a.lane<3>();    // channel 0,1,2,3
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci#endif
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the full 4 lanes, returning scalar.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	vfloat4 m = a * b;
cc1dc7a3Sopenharmony_ci	return hadd_s(m);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the full 4 lanes, returning vector.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	vfloat4 m = a * b;
cc1dc7a3Sopenharmony_ci	return vfloat4(hadd_s(m));
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the bottom 3 lanes, returning scalar.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	vfloat4 m = a * b;
cc1dc7a3Sopenharmony_ci	return hadd_rgb_s(m);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the bottom 3 lanes, returning vector.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	vfloat4 m = a * b;
cc1dc7a3Sopenharmony_ci	float d3 = hadd_rgb_s(m);
cc1dc7a3Sopenharmony_ci	return vfloat4(d3, d3, d3, 0.0f);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#endif
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
cc1dc7a3Sopenharmony_ci
/**
 * @brief Population bit count.
 *
 * Counts bits in parallel: per-pair counts, then per-nibble counts, then
 * per-byte counts, which are finally summed into the top byte by a multiply.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
static inline int popcount(uint64_t v)
{
	const uint64_t pair_mask   = 0x5555555555555555ULL;
	const uint64_t nibble_mask = 0x3333333333333333ULL;
	const uint64_t byte_mask   = 0x0F0F0F0F0F0F0F0FULL;
	const uint64_t byte_ones   = 0x0101010101010101ULL;

	// Each 2-bit field holds the count of set bits in that field
	uint64_t pair_counts = v - ((v >> 1) & pair_mask);

	// Each 4-bit field holds the count of set bits in that field
	uint64_t nibble_counts = (pair_counts & nibble_mask)
	                       + ((pair_counts >> 2) & nibble_mask);

	// Each byte holds the count of set bits in that byte
	uint64_t byte_counts = (nibble_counts + (nibble_counts >> 4)) & byte_mask;

	// The multiply accumulates all byte counts into the most significant byte
	uint64_t total = (byte_counts * byte_ones) >> 56;
	return static_cast<int>(total);
}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#endif
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Apply signed bit transfer.
cc1dc7a3Sopenharmony_ci *
cc1dc7a3Sopenharmony_ci * @param input0   The first encoded endpoint.
cc1dc7a3Sopenharmony_ci * @param input1   The second encoded endpoint.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE void bit_transfer_signed(
cc1dc7a3Sopenharmony_ci	vint4& input0,
cc1dc7a3Sopenharmony_ci	vint4& input1
cc1dc7a3Sopenharmony_ci) {
cc1dc7a3Sopenharmony_ci	input1 = lsr<1>(input1) | (input0 & 0x80);
cc1dc7a3Sopenharmony_ci	input0 = lsr<1>(input0) & 0x3F;
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci	vmask4 mask = (input0 & 0x20) != vint4::zero();
cc1dc7a3Sopenharmony_ci	input0 = select(input0, input0 - 0x40, mask);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of ints.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vint4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	ASTCENC_ALIGNAS int v[4];
cc1dc7a3Sopenharmony_ci	storea(a, v);
cc1dc7a3Sopenharmony_ci	printf("v4_i32:\n  %8d %8d %8d %8d\n",
cc1dc7a3Sopenharmony_ci	       v[0], v[1], v[2], v[3]);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of ints.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void printx(vint4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	ASTCENC_ALIGNAS int v[4];
cc1dc7a3Sopenharmony_ci	storea(a, v);
cc1dc7a3Sopenharmony_ci	printf("v4_i32:\n  %08x %08x %08x %08x\n",
cc1dc7a3Sopenharmony_ci	       v[0], v[1], v[2], v[3]);
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of floats.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vfloat4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	ASTCENC_ALIGNAS float v[4];
cc1dc7a3Sopenharmony_ci	storea(a, v);
cc1dc7a3Sopenharmony_ci	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
cc1dc7a3Sopenharmony_ci	       static_cast<double>(v[0]), static_cast<double>(v[1]),
cc1dc7a3Sopenharmony_ci	       static_cast<double>(v[2]), static_cast<double>(v[3]));
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci/**
cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of masks.
cc1dc7a3Sopenharmony_ci */
cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vmask4 a)
cc1dc7a3Sopenharmony_ci{
cc1dc7a3Sopenharmony_ci	print(select(vint4(0), vint4(1), a));
cc1dc7a3Sopenharmony_ci}
cc1dc7a3Sopenharmony_ci
cc1dc7a3Sopenharmony_ci#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED