1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0
2cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
3cc1dc7a3Sopenharmony_ci// Copyright 2020-2024 Arm Limited
4cc1dc7a3Sopenharmony_ci//
5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy
7cc1dc7a3Sopenharmony_ci// of the License at:
8cc1dc7a3Sopenharmony_ci//
9cc1dc7a3Sopenharmony_ci//     http://www.apache.org/licenses/LICENSE-2.0
10cc1dc7a3Sopenharmony_ci//
11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software
12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations
15cc1dc7a3Sopenharmony_ci// under the License.
16cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
17cc1dc7a3Sopenharmony_ci
18cc1dc7a3Sopenharmony_ci/**
19cc1dc7a3Sopenharmony_ci * @brief Generic 4x32-bit vector functions.
20cc1dc7a3Sopenharmony_ci *
21cc1dc7a3Sopenharmony_ci * This module implements generic 4-wide vector functions that are valid for
22cc1dc7a3Sopenharmony_ci * all instruction sets, typically implemented using lower level 4-wide
23cc1dc7a3Sopenharmony_ci * operations that are ISA-specific.
24cc1dc7a3Sopenharmony_ci */
25cc1dc7a3Sopenharmony_ci
26cc1dc7a3Sopenharmony_ci#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
27cc1dc7a3Sopenharmony_ci#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
28cc1dc7a3Sopenharmony_ci
29cc1dc7a3Sopenharmony_ci#ifndef ASTCENC_SIMD_INLINE
30cc1dc7a3Sopenharmony_ci	#error "Include astcenc_vecmathlib.h, do not include directly"
31cc1dc7a3Sopenharmony_ci#endif
32cc1dc7a3Sopenharmony_ci
33cc1dc7a3Sopenharmony_ci#include <cstdio>
34cc1dc7a3Sopenharmony_ci
35cc1dc7a3Sopenharmony_ci// ============================================================================
36cc1dc7a3Sopenharmony_ci// vmask4 operators and functions
37cc1dc7a3Sopenharmony_ci// ============================================================================
38cc1dc7a3Sopenharmony_ci
39cc1dc7a3Sopenharmony_ci/**
40cc1dc7a3Sopenharmony_ci * @brief True if any lanes are enabled, false otherwise.
41cc1dc7a3Sopenharmony_ci */
42cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE bool any(vmask4 a)
43cc1dc7a3Sopenharmony_ci{
44cc1dc7a3Sopenharmony_ci	return mask(a) != 0;
45cc1dc7a3Sopenharmony_ci}
46cc1dc7a3Sopenharmony_ci
47cc1dc7a3Sopenharmony_ci/**
48cc1dc7a3Sopenharmony_ci * @brief True if all lanes are enabled, false otherwise.
49cc1dc7a3Sopenharmony_ci */
50cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE bool all(vmask4 a)
51cc1dc7a3Sopenharmony_ci{
52cc1dc7a3Sopenharmony_ci	return mask(a) == 0xF;
53cc1dc7a3Sopenharmony_ci}
54cc1dc7a3Sopenharmony_ci
55cc1dc7a3Sopenharmony_ci// ============================================================================
56cc1dc7a3Sopenharmony_ci// vint4 operators and functions
57cc1dc7a3Sopenharmony_ci// ============================================================================
58cc1dc7a3Sopenharmony_ci
59cc1dc7a3Sopenharmony_ci/**
60cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar addition.
61cc1dc7a3Sopenharmony_ci */
62cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
63cc1dc7a3Sopenharmony_ci{
64cc1dc7a3Sopenharmony_ci	return a + vint4(b);
65cc1dc7a3Sopenharmony_ci}
66cc1dc7a3Sopenharmony_ci
67cc1dc7a3Sopenharmony_ci/**
68cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector incremental addition.
69cc1dc7a3Sopenharmony_ci */
70cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
71cc1dc7a3Sopenharmony_ci{
72cc1dc7a3Sopenharmony_ci	a = a + b;
73cc1dc7a3Sopenharmony_ci	return a;
74cc1dc7a3Sopenharmony_ci}
75cc1dc7a3Sopenharmony_ci
76cc1dc7a3Sopenharmony_ci/**
77cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar subtraction.
78cc1dc7a3Sopenharmony_ci */
79cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
80cc1dc7a3Sopenharmony_ci{
81cc1dc7a3Sopenharmony_ci	return a - vint4(b);
82cc1dc7a3Sopenharmony_ci}
83cc1dc7a3Sopenharmony_ci
84cc1dc7a3Sopenharmony_ci/**
85cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar multiplication.
86cc1dc7a3Sopenharmony_ci */
87cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
88cc1dc7a3Sopenharmony_ci{
89cc1dc7a3Sopenharmony_ci	return a * vint4(b);
90cc1dc7a3Sopenharmony_ci}
91cc1dc7a3Sopenharmony_ci
92cc1dc7a3Sopenharmony_ci/**
93cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise or.
94cc1dc7a3Sopenharmony_ci */
95cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
96cc1dc7a3Sopenharmony_ci{
97cc1dc7a3Sopenharmony_ci	return a | vint4(b);
98cc1dc7a3Sopenharmony_ci}
99cc1dc7a3Sopenharmony_ci
100cc1dc7a3Sopenharmony_ci/**
101cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise and.
102cc1dc7a3Sopenharmony_ci */
103cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
104cc1dc7a3Sopenharmony_ci{
105cc1dc7a3Sopenharmony_ci	return a & vint4(b);
106cc1dc7a3Sopenharmony_ci}
107cc1dc7a3Sopenharmony_ci
108cc1dc7a3Sopenharmony_ci/**
109cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar bitwise xor.
110cc1dc7a3Sopenharmony_ci */
111cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
112cc1dc7a3Sopenharmony_ci{
113cc1dc7a3Sopenharmony_ci	return a ^ vint4(b);
114cc1dc7a3Sopenharmony_ci}
115cc1dc7a3Sopenharmony_ci
116cc1dc7a3Sopenharmony_ci/**
117cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between min and max.
118cc1dc7a3Sopenharmony_ci */
119cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
120cc1dc7a3Sopenharmony_ci{
121cc1dc7a3Sopenharmony_ci	return min(max(a, vint4(minv)), vint4(maxv));
122cc1dc7a3Sopenharmony_ci}
123cc1dc7a3Sopenharmony_ci
124cc1dc7a3Sopenharmony_ci/**
125cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of RGB vector lanes as a scalar.
126cc1dc7a3Sopenharmony_ci */
127cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
128cc1dc7a3Sopenharmony_ci{
129cc1dc7a3Sopenharmony_ci	return a.lane<0>() + a.lane<1>() + a.lane<2>();
130cc1dc7a3Sopenharmony_ci}
131cc1dc7a3Sopenharmony_ci
132cc1dc7a3Sopenharmony_ci// ============================================================================
133cc1dc7a3Sopenharmony_ci// vfloat4 operators and functions
134cc1dc7a3Sopenharmony_ci// ============================================================================
135cc1dc7a3Sopenharmony_ci
136cc1dc7a3Sopenharmony_ci/**
137cc1dc7a3Sopenharmony_ci * @brief Overload: vector by vector incremental addition.
138cc1dc7a3Sopenharmony_ci */
139cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
140cc1dc7a3Sopenharmony_ci{
141cc1dc7a3Sopenharmony_ci	a = a + b;
142cc1dc7a3Sopenharmony_ci	return a;
143cc1dc7a3Sopenharmony_ci}
144cc1dc7a3Sopenharmony_ci
145cc1dc7a3Sopenharmony_ci/**
146cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar addition.
147cc1dc7a3Sopenharmony_ci */
148cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
149cc1dc7a3Sopenharmony_ci{
150cc1dc7a3Sopenharmony_ci	return a + vfloat4(b);
151cc1dc7a3Sopenharmony_ci}
152cc1dc7a3Sopenharmony_ci
153cc1dc7a3Sopenharmony_ci/**
154cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar subtraction.
155cc1dc7a3Sopenharmony_ci */
156cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
157cc1dc7a3Sopenharmony_ci{
158cc1dc7a3Sopenharmony_ci	return a - vfloat4(b);
159cc1dc7a3Sopenharmony_ci}
160cc1dc7a3Sopenharmony_ci
161cc1dc7a3Sopenharmony_ci/**
162cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar multiplication.
163cc1dc7a3Sopenharmony_ci */
164cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
165cc1dc7a3Sopenharmony_ci{
166cc1dc7a3Sopenharmony_ci	return a * vfloat4(b);
167cc1dc7a3Sopenharmony_ci}
168cc1dc7a3Sopenharmony_ci
169cc1dc7a3Sopenharmony_ci/**
170cc1dc7a3Sopenharmony_ci * @brief Overload: scalar by vector multiplication.
171cc1dc7a3Sopenharmony_ci */
172cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
173cc1dc7a3Sopenharmony_ci{
174cc1dc7a3Sopenharmony_ci	return vfloat4(a) * b;
175cc1dc7a3Sopenharmony_ci}
176cc1dc7a3Sopenharmony_ci
177cc1dc7a3Sopenharmony_ci/**
178cc1dc7a3Sopenharmony_ci * @brief Overload: vector by scalar division.
179cc1dc7a3Sopenharmony_ci */
180cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
181cc1dc7a3Sopenharmony_ci{
182cc1dc7a3Sopenharmony_ci	return a / vfloat4(b);
183cc1dc7a3Sopenharmony_ci}
184cc1dc7a3Sopenharmony_ci
185cc1dc7a3Sopenharmony_ci/**
186cc1dc7a3Sopenharmony_ci * @brief Overload: scalar by vector division.
187cc1dc7a3Sopenharmony_ci */
188cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
189cc1dc7a3Sopenharmony_ci{
190cc1dc7a3Sopenharmony_ci	return vfloat4(a) / b;
191cc1dc7a3Sopenharmony_ci}
192cc1dc7a3Sopenharmony_ci
193cc1dc7a3Sopenharmony_ci/**
194cc1dc7a3Sopenharmony_ci * @brief Return the min vector of a vector and a scalar.
195cc1dc7a3Sopenharmony_ci *
196cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane.
197cc1dc7a3Sopenharmony_ci */
198cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
199cc1dc7a3Sopenharmony_ci{
200cc1dc7a3Sopenharmony_ci	return min(a, vfloat4(b));
201cc1dc7a3Sopenharmony_ci}
202cc1dc7a3Sopenharmony_ci
203cc1dc7a3Sopenharmony_ci/**
204cc1dc7a3Sopenharmony_ci * @brief Return the max vector of a vector and a scalar.
205cc1dc7a3Sopenharmony_ci *
206cc1dc7a3Sopenharmony_ci * If either lane value is NaN, @c b will be returned for that lane.
207cc1dc7a3Sopenharmony_ci */
208cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
209cc1dc7a3Sopenharmony_ci{
210cc1dc7a3Sopenharmony_ci	return max(a, vfloat4(b));
211cc1dc7a3Sopenharmony_ci}
212cc1dc7a3Sopenharmony_ci
213cc1dc7a3Sopenharmony_ci/**
214cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between min and max.
215cc1dc7a3Sopenharmony_ci *
216cc1dc7a3Sopenharmony_ci * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
217cc1dc7a3Sopenharmony_ci * then @c min will be returned for that lane.
218cc1dc7a3Sopenharmony_ci */
219cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
220cc1dc7a3Sopenharmony_ci{
221cc1dc7a3Sopenharmony_ci	// Do not reorder - second operand will return if either is NaN
222cc1dc7a3Sopenharmony_ci	return min(max(a, minv), maxv);
223cc1dc7a3Sopenharmony_ci}
224cc1dc7a3Sopenharmony_ci
225cc1dc7a3Sopenharmony_ci/**
226cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between 0.0f and max.
227cc1dc7a3Sopenharmony_ci *
228cc1dc7a3Sopenharmony_ci * It is assumed that  @c max is not a NaN value. If @c a is NaN then zero will
229cc1dc7a3Sopenharmony_ci * be returned for that lane.
230cc1dc7a3Sopenharmony_ci */
231cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
232cc1dc7a3Sopenharmony_ci{
233cc1dc7a3Sopenharmony_ci	// Do not reorder - second operand will return if either is NaN
234cc1dc7a3Sopenharmony_ci	return min(max(a, vfloat4::zero()), maxv);
235cc1dc7a3Sopenharmony_ci}
236cc1dc7a3Sopenharmony_ci
237cc1dc7a3Sopenharmony_ci/**
238cc1dc7a3Sopenharmony_ci * @brief Return the clamped value between 0.0f and 1.0f.
239cc1dc7a3Sopenharmony_ci *
240cc1dc7a3Sopenharmony_ci * If @c a is NaN then zero will be returned for that lane.
241cc1dc7a3Sopenharmony_ci */
242cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
243cc1dc7a3Sopenharmony_ci{
244cc1dc7a3Sopenharmony_ci	// Do not reorder - second operand will return if either is NaN
245cc1dc7a3Sopenharmony_ci	return min(max(a, vfloat4::zero()), 1.0f);
246cc1dc7a3Sopenharmony_ci}
247cc1dc7a3Sopenharmony_ci
248cc1dc7a3Sopenharmony_ci/**
249cc1dc7a3Sopenharmony_ci * @brief Return the horizontal minimum of a vector.
250cc1dc7a3Sopenharmony_ci */
251cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
252cc1dc7a3Sopenharmony_ci{
253cc1dc7a3Sopenharmony_ci	return hmin(a).lane<0>();
254cc1dc7a3Sopenharmony_ci}
255cc1dc7a3Sopenharmony_ci
256cc1dc7a3Sopenharmony_ci/**
257cc1dc7a3Sopenharmony_ci * @brief Return the horizontal min of RGB vector lanes as a scalar.
258cc1dc7a3Sopenharmony_ci */
259cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
260cc1dc7a3Sopenharmony_ci{
261cc1dc7a3Sopenharmony_ci	a.set_lane<3>(a.lane<0>());
262cc1dc7a3Sopenharmony_ci	return hmin_s(a);
263cc1dc7a3Sopenharmony_ci}
264cc1dc7a3Sopenharmony_ci
265cc1dc7a3Sopenharmony_ci/**
266cc1dc7a3Sopenharmony_ci * @brief Return the horizontal maximum of a vector.
267cc1dc7a3Sopenharmony_ci */
268cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
269cc1dc7a3Sopenharmony_ci{
270cc1dc7a3Sopenharmony_ci	return hmax(a).lane<0>();
271cc1dc7a3Sopenharmony_ci}
272cc1dc7a3Sopenharmony_ci
273cc1dc7a3Sopenharmony_ci/**
274cc1dc7a3Sopenharmony_ci * @brief Accumulate lane-wise sums for a vector.
275cc1dc7a3Sopenharmony_ci */
276cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
277cc1dc7a3Sopenharmony_ci{
278cc1dc7a3Sopenharmony_ci	accum = accum + a;
279cc1dc7a3Sopenharmony_ci}
280cc1dc7a3Sopenharmony_ci
281cc1dc7a3Sopenharmony_ci/**
282cc1dc7a3Sopenharmony_ci * @brief Accumulate lane-wise sums for a masked vector.
283cc1dc7a3Sopenharmony_ci */
284cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
285cc1dc7a3Sopenharmony_ci{
286cc1dc7a3Sopenharmony_ci	a = select(vfloat4::zero(), a, m);
287cc1dc7a3Sopenharmony_ci	haccumulate(accum, a);
288cc1dc7a3Sopenharmony_ci}
289cc1dc7a3Sopenharmony_ci
290cc1dc7a3Sopenharmony_ci#define ASTCENC_USE_COMMON_GATHERF
291cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, const uint8_t* idx)
292cc1dc7a3Sopenharmony_ci{
293cc1dc7a3Sopenharmony_ci	return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);    // index 0,1,2,3
294cc1dc7a3Sopenharmony_ci}
295cc1dc7a3Sopenharmony_ci
296cc1dc7a3Sopenharmony_ci/**
297cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of RGB vector lanes as a scalar.
298cc1dc7a3Sopenharmony_ci */
299cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
300cc1dc7a3Sopenharmony_ci{
301cc1dc7a3Sopenharmony_ci	return a.lane<0>() + a.lane<1>() + a.lane<2>();
302cc1dc7a3Sopenharmony_ci}
303cc1dc7a3Sopenharmony_ci
304cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_ADDV)
305cc1dc7a3Sopenharmony_ci/**
306cc1dc7a3Sopenharmony_ci * @brief Return the horizontal sum of a vector.
307cc1dc7a3Sopenharmony_ci */
308cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a)
309cc1dc7a3Sopenharmony_ci{
310cc1dc7a3Sopenharmony_ci	return a.lane<0>() + a.lane<1>() + a.lane<2>() + a.lane<3>();    // channel 0,1,2,3
311cc1dc7a3Sopenharmony_ci}
312cc1dc7a3Sopenharmony_ci#endif
313cc1dc7a3Sopenharmony_ci
314cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
315cc1dc7a3Sopenharmony_ci
316cc1dc7a3Sopenharmony_ci/**
317cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the full 4 lanes, returning scalar.
318cc1dc7a3Sopenharmony_ci */
319cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
320cc1dc7a3Sopenharmony_ci{
321cc1dc7a3Sopenharmony_ci	vfloat4 m = a * b;
322cc1dc7a3Sopenharmony_ci	return hadd_s(m);
323cc1dc7a3Sopenharmony_ci}
324cc1dc7a3Sopenharmony_ci
325cc1dc7a3Sopenharmony_ci/**
326cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the full 4 lanes, returning vector.
327cc1dc7a3Sopenharmony_ci */
328cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
329cc1dc7a3Sopenharmony_ci{
330cc1dc7a3Sopenharmony_ci	vfloat4 m = a * b;
331cc1dc7a3Sopenharmony_ci	return vfloat4(hadd_s(m));
332cc1dc7a3Sopenharmony_ci}
333cc1dc7a3Sopenharmony_ci
334cc1dc7a3Sopenharmony_ci/**
335cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the bottom 3 lanes, returning scalar.
336cc1dc7a3Sopenharmony_ci */
337cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
338cc1dc7a3Sopenharmony_ci{
339cc1dc7a3Sopenharmony_ci	vfloat4 m = a * b;
340cc1dc7a3Sopenharmony_ci	return hadd_rgb_s(m);
341cc1dc7a3Sopenharmony_ci}
342cc1dc7a3Sopenharmony_ci
343cc1dc7a3Sopenharmony_ci/**
344cc1dc7a3Sopenharmony_ci * @brief Return the dot product for the bottom 3 lanes, returning vector.
345cc1dc7a3Sopenharmony_ci */
346cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
347cc1dc7a3Sopenharmony_ci{
348cc1dc7a3Sopenharmony_ci	vfloat4 m = a * b;
349cc1dc7a3Sopenharmony_ci	float d3 = hadd_rgb_s(m);
350cc1dc7a3Sopenharmony_ci	return vfloat4(d3, d3, d3, 0.0f);
351cc1dc7a3Sopenharmony_ci}
352cc1dc7a3Sopenharmony_ci
353cc1dc7a3Sopenharmony_ci#endif
354cc1dc7a3Sopenharmony_ci
355cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
356cc1dc7a3Sopenharmony_ci
/**
 * @brief Population bit count.
 *
 * Counts bits with a branch-free parallel reduction: per-field counts are
 * built up in 2-bit, 4-bit, and then 8-bit fields, and the eight byte counts
 * are finally summed with a single multiply.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits, in the range [0, 64].
 */
static inline int popcount(uint64_t v)
{
	const uint64_t pair_mask = 0x5555555555555555ULL;
	const uint64_t nibble_mask = 0x3333333333333333ULL;
	const uint64_t byte_mask = 0x0F0F0F0F0F0F0F0FULL;
	const uint64_t byte_ones = 0x0101010101010101ULL;

	// Each 2-bit field now holds the number of bits set in that field
	uint64_t counts = v - ((v >> 1) & pair_mask);

	// Merge neighboring 2-bit counts into 4-bit counts
	counts = (counts & nibble_mask) + ((counts >> 2) & nibble_mask);

	// Merge neighboring 4-bit counts into 8-bit counts
	counts = (counts + (counts >> 4)) & byte_mask;

	// The multiply sums all byte counts into the most significant byte
	return static_cast<int>((counts * byte_ones) >> 56);
}
377cc1dc7a3Sopenharmony_ci
378cc1dc7a3Sopenharmony_ci#endif
379cc1dc7a3Sopenharmony_ci
380cc1dc7a3Sopenharmony_ci/**
381cc1dc7a3Sopenharmony_ci * @brief Apply signed bit transfer.
382cc1dc7a3Sopenharmony_ci *
383cc1dc7a3Sopenharmony_ci * @param input0   The first encoded endpoint.
384cc1dc7a3Sopenharmony_ci * @param input1   The second encoded endpoint.
385cc1dc7a3Sopenharmony_ci */
386cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE void bit_transfer_signed(
387cc1dc7a3Sopenharmony_ci	vint4& input0,
388cc1dc7a3Sopenharmony_ci	vint4& input1
389cc1dc7a3Sopenharmony_ci) {
390cc1dc7a3Sopenharmony_ci	input1 = lsr<1>(input1) | (input0 & 0x80);
391cc1dc7a3Sopenharmony_ci	input0 = lsr<1>(input0) & 0x3F;
392cc1dc7a3Sopenharmony_ci
393cc1dc7a3Sopenharmony_ci	vmask4 mask = (input0 & 0x20) != vint4::zero();
394cc1dc7a3Sopenharmony_ci	input0 = select(input0, input0 - 0x40, mask);
395cc1dc7a3Sopenharmony_ci}
396cc1dc7a3Sopenharmony_ci
397cc1dc7a3Sopenharmony_ci/**
398cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of ints.
399cc1dc7a3Sopenharmony_ci */
400cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vint4 a)
401cc1dc7a3Sopenharmony_ci{
402cc1dc7a3Sopenharmony_ci	ASTCENC_ALIGNAS int v[4];
403cc1dc7a3Sopenharmony_ci	storea(a, v);
404cc1dc7a3Sopenharmony_ci	printf("v4_i32:\n  %8d %8d %8d %8d\n",
405cc1dc7a3Sopenharmony_ci	       v[0], v[1], v[2], v[3]);
406cc1dc7a3Sopenharmony_ci}
407cc1dc7a3Sopenharmony_ci
408cc1dc7a3Sopenharmony_ci/**
409cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of ints.
410cc1dc7a3Sopenharmony_ci */
411cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void printx(vint4 a)
412cc1dc7a3Sopenharmony_ci{
413cc1dc7a3Sopenharmony_ci	ASTCENC_ALIGNAS int v[4];
414cc1dc7a3Sopenharmony_ci	storea(a, v);
415cc1dc7a3Sopenharmony_ci	printf("v4_i32:\n  %08x %08x %08x %08x\n",
416cc1dc7a3Sopenharmony_ci	       v[0], v[1], v[2], v[3]);
417cc1dc7a3Sopenharmony_ci}
418cc1dc7a3Sopenharmony_ci
419cc1dc7a3Sopenharmony_ci/**
420cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of floats.
421cc1dc7a3Sopenharmony_ci */
422cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vfloat4 a)
423cc1dc7a3Sopenharmony_ci{
424cc1dc7a3Sopenharmony_ci	ASTCENC_ALIGNAS float v[4];
425cc1dc7a3Sopenharmony_ci	storea(a, v);
426cc1dc7a3Sopenharmony_ci	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
427cc1dc7a3Sopenharmony_ci	       static_cast<double>(v[0]), static_cast<double>(v[1]),
428cc1dc7a3Sopenharmony_ci	       static_cast<double>(v[2]), static_cast<double>(v[3]));
429cc1dc7a3Sopenharmony_ci}
430cc1dc7a3Sopenharmony_ci
431cc1dc7a3Sopenharmony_ci/**
432cc1dc7a3Sopenharmony_ci * @brief Debug function to print a vector of masks.
433cc1dc7a3Sopenharmony_ci */
434cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE void print(vmask4 a)
435cc1dc7a3Sopenharmony_ci{
436cc1dc7a3Sopenharmony_ci	print(select(vint4(0), vint4(1), a));
437cc1dc7a3Sopenharmony_ci}
438cc1dc7a3Sopenharmony_ci
439cc1dc7a3Sopenharmony_ci#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
440