1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0
2cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
3cc1dc7a3Sopenharmony_ci// Copyright 2019-2022 Arm Limited
4cc1dc7a3Sopenharmony_ci// Copyright 2008 Jose Fonseca
5cc1dc7a3Sopenharmony_ci//
6cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not
7cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy
8cc1dc7a3Sopenharmony_ci// of the License at:
9cc1dc7a3Sopenharmony_ci//
10cc1dc7a3Sopenharmony_ci//     http://www.apache.org/licenses/LICENSE-2.0
11cc1dc7a3Sopenharmony_ci//
12cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software
13cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations
16cc1dc7a3Sopenharmony_ci// under the License.
17cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
18cc1dc7a3Sopenharmony_ci
19cc1dc7a3Sopenharmony_ci/*
20cc1dc7a3Sopenharmony_ci * This module implements vector support for floats, ints, and vector lane
21cc1dc7a3Sopenharmony_ci * control masks. It provides access to both explicit vector width types, and
22cc1dc7a3Sopenharmony_ci * flexible N-wide types where N can be determined at compile time.
23cc1dc7a3Sopenharmony_ci *
24cc1dc7a3Sopenharmony_ci * The design of this module encourages use of vector length agnostic code, via
25cc1dc7a3Sopenharmony_ci * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
26cc1dc7a3Sopenharmony_ci * with that is available at compile time. The current vector width is
27cc1dc7a3Sopenharmony_ci * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
28cc1dc7a3Sopenharmony_ci *
29cc1dc7a3Sopenharmony_ci * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
30cc1dc7a3Sopenharmony_ci * These are provided primarily for prototyping and algorithm debug of VLA
31cc1dc7a3Sopenharmony_ci * implementations.
32cc1dc7a3Sopenharmony_ci *
33cc1dc7a3Sopenharmony_ci * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
34cc1dc7a3Sopenharmony_ci * types. These are provided for use by VLA code, but are also expected to be
35cc1dc7a3Sopenharmony_ci * used as a fixed-width type and will supported a reference C++ fallback for
36cc1dc7a3Sopenharmony_ci * use on platforms without SIMD intrinsics.
37cc1dc7a3Sopenharmony_ci *
38cc1dc7a3Sopenharmony_ci * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
39cc1dc7a3Sopenharmony_ci * types. These are provide for use by VLA code, and are not expected to be
40cc1dc7a3Sopenharmony_ci * used as a fixed-width type in normal code. No reference C implementation is
41cc1dc7a3Sopenharmony_ci * provided on platforms without underlying SIMD intrinsics.
42cc1dc7a3Sopenharmony_ci *
43cc1dc7a3Sopenharmony_ci * With the current implementation ISA support is provided for:
44cc1dc7a3Sopenharmony_ci *
45cc1dc7a3Sopenharmony_ci *     * 1-wide for scalar reference.
46cc1dc7a3Sopenharmony_ci *     * 4-wide for Armv8-A NEON.
47cc1dc7a3Sopenharmony_ci *     * 4-wide for x86-64 SSE2.
48cc1dc7a3Sopenharmony_ci *     * 4-wide for x86-64 SSE4.1.
49cc1dc7a3Sopenharmony_ci *     * 8-wide for x86-64 AVX2.
50cc1dc7a3Sopenharmony_ci */
51cc1dc7a3Sopenharmony_ci
52cc1dc7a3Sopenharmony_ci#ifndef ASTC_VECMATHLIB_H_INCLUDED
53cc1dc7a3Sopenharmony_ci#define ASTC_VECMATHLIB_H_INCLUDED
54cc1dc7a3Sopenharmony_ci
55cc1dc7a3Sopenharmony_ci#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
56cc1dc7a3Sopenharmony_ci	#include <immintrin.h>
57cc1dc7a3Sopenharmony_ci#elif ASTCENC_NEON != 0
58cc1dc7a3Sopenharmony_ci	#include <arm_neon.h>
59cc1dc7a3Sopenharmony_ci#endif
60cc1dc7a3Sopenharmony_ci
61cc1dc7a3Sopenharmony_ci#if !defined(__clang__) && defined(_MSC_VER)
62cc1dc7a3Sopenharmony_ci	#define ASTCENC_SIMD_INLINE __forceinline
63cc1dc7a3Sopenharmony_ci	#define ASTCENC_NO_INLINE
64cc1dc7a3Sopenharmony_ci#elif defined(__GNUC__) && !defined(__clang__)
65cc1dc7a3Sopenharmony_ci	#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
66cc1dc7a3Sopenharmony_ci	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
67cc1dc7a3Sopenharmony_ci#else
68cc1dc7a3Sopenharmony_ci	#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
69cc1dc7a3Sopenharmony_ci	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
70cc1dc7a3Sopenharmony_ci#endif
71cc1dc7a3Sopenharmony_ci
72cc1dc7a3Sopenharmony_ci#if ASTCENC_AVX >= 2
73cc1dc7a3Sopenharmony_ci	/* If we have AVX2 expose 8-wide VLA. */
74cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_sse_4.h"
75cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_common_4.h"
76cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_avx2_8.h"
77cc1dc7a3Sopenharmony_ci
78cc1dc7a3Sopenharmony_ci	#define ASTCENC_SIMD_WIDTH 8
79cc1dc7a3Sopenharmony_ci
80cc1dc7a3Sopenharmony_ci	using vfloat = vfloat8;
81cc1dc7a3Sopenharmony_ci
82cc1dc7a3Sopenharmony_ci	#if defined(ASTCENC_NO_INVARIANCE)
83cc1dc7a3Sopenharmony_ci		using vfloatacc = vfloat8;
84cc1dc7a3Sopenharmony_ci	#else
85cc1dc7a3Sopenharmony_ci		using vfloatacc = vfloat4;
86cc1dc7a3Sopenharmony_ci	#endif
87cc1dc7a3Sopenharmony_ci
88cc1dc7a3Sopenharmony_ci	using vint = vint8;
89cc1dc7a3Sopenharmony_ci	using vmask = vmask8;
90cc1dc7a3Sopenharmony_ci
91cc1dc7a3Sopenharmony_ci	constexpr auto loada = vfloat8::loada;
92cc1dc7a3Sopenharmony_ci	constexpr auto load1 = vfloat8::load1;
93cc1dc7a3Sopenharmony_ci
94cc1dc7a3Sopenharmony_ci#elif ASTCENC_SSE >= 20
95cc1dc7a3Sopenharmony_ci	/* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
96cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_sse_4.h"
97cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_common_4.h"
98cc1dc7a3Sopenharmony_ci
99cc1dc7a3Sopenharmony_ci	#define ASTCENC_SIMD_WIDTH 4
100cc1dc7a3Sopenharmony_ci
101cc1dc7a3Sopenharmony_ci	using vfloat = vfloat4;
102cc1dc7a3Sopenharmony_ci	using vfloatacc = vfloat4;
103cc1dc7a3Sopenharmony_ci	using vint = vint4;
104cc1dc7a3Sopenharmony_ci	using vmask = vmask4;
105cc1dc7a3Sopenharmony_ci
106cc1dc7a3Sopenharmony_ci	constexpr auto loada = vfloat4::loada;
107cc1dc7a3Sopenharmony_ci	constexpr auto load1 = vfloat4::load1;
108cc1dc7a3Sopenharmony_ci
109cc1dc7a3Sopenharmony_ci#elif ASTCENC_NEON > 0
110cc1dc7a3Sopenharmony_ci	/* If we have NEON expose 4-wide VLA. */
111cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_neon_4.h"
112cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_common_4.h"
113cc1dc7a3Sopenharmony_ci
114cc1dc7a3Sopenharmony_ci	#define ASTCENC_SIMD_WIDTH 4
115cc1dc7a3Sopenharmony_ci
116cc1dc7a3Sopenharmony_ci	using vfloat = vfloat4;
117cc1dc7a3Sopenharmony_ci	using vfloatacc = vfloat4;
118cc1dc7a3Sopenharmony_ci	using vint = vint4;
119cc1dc7a3Sopenharmony_ci	using vmask = vmask4;
120cc1dc7a3Sopenharmony_ci
121cc1dc7a3Sopenharmony_ci	constexpr auto loada = vfloat4::loada;
122cc1dc7a3Sopenharmony_ci	constexpr auto load1 = vfloat4::load1;
123cc1dc7a3Sopenharmony_ci
124cc1dc7a3Sopenharmony_ci#else
125cc1dc7a3Sopenharmony_ci	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
126cc1dc7a3Sopenharmony_ci
127cc1dc7a3Sopenharmony_ci	// Note: We no longer expose the 1-wide scalar fallback because it is not
128cc1dc7a3Sopenharmony_ci	// invariant with the 4-wide path due to algorithms that use horizontal
129cc1dc7a3Sopenharmony_ci	// operations that accumulate a local vector sum before accumulating into
130cc1dc7a3Sopenharmony_ci	// a running sum.
131cc1dc7a3Sopenharmony_ci	//
132cc1dc7a3Sopenharmony_ci	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
133cc1dc7a3Sopenharmony_ci	//
134cc1dc7a3Sopenharmony_ci	//     result = ((((sum + l0) + l1) + l2) + l3)
135cc1dc7a3Sopenharmony_ci	//
136cc1dc7a3Sopenharmony_ci    // ... whereas the accumulator for a 4-wide vector sum is:
137cc1dc7a3Sopenharmony_ci	//
138cc1dc7a3Sopenharmony_ci	//     result = sum + ((l0 + l2) + (l1 + l3))
139cc1dc7a3Sopenharmony_ci	//
140cc1dc7a3Sopenharmony_ci	// In "normal maths" this is the same, but the floating point reassociation
141cc1dc7a3Sopenharmony_ci	// differences mean that these will not produce the same result.
142cc1dc7a3Sopenharmony_ci
143cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_none_4.h"
144cc1dc7a3Sopenharmony_ci	#include "astcenc_vecmathlib_common_4.h"
145cc1dc7a3Sopenharmony_ci
146cc1dc7a3Sopenharmony_ci	#define ASTCENC_SIMD_WIDTH 4
147cc1dc7a3Sopenharmony_ci
148cc1dc7a3Sopenharmony_ci	using vfloat = vfloat4;
149cc1dc7a3Sopenharmony_ci	using vfloatacc = vfloat4;
150cc1dc7a3Sopenharmony_ci	using vint = vint4;
151cc1dc7a3Sopenharmony_ci	using vmask = vmask4;
152cc1dc7a3Sopenharmony_ci
153cc1dc7a3Sopenharmony_ci	constexpr auto loada = vfloat4::loada;
154cc1dc7a3Sopenharmony_ci	constexpr auto load1 = vfloat4::load1;
155cc1dc7a3Sopenharmony_ci#endif
156cc1dc7a3Sopenharmony_ci
157cc1dc7a3Sopenharmony_ci/**
158cc1dc7a3Sopenharmony_ci * @brief Round a count down to the largest multiple of 8.
159cc1dc7a3Sopenharmony_ci *
160cc1dc7a3Sopenharmony_ci * @param count   The unrounded value.
161cc1dc7a3Sopenharmony_ci *
162cc1dc7a3Sopenharmony_ci * @return The rounded value.
163cc1dc7a3Sopenharmony_ci */
164cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
165cc1dc7a3Sopenharmony_ci{
166cc1dc7a3Sopenharmony_ci	return count & static_cast<unsigned int>(~(8 - 1));
167cc1dc7a3Sopenharmony_ci}
168cc1dc7a3Sopenharmony_ci
169cc1dc7a3Sopenharmony_ci/**
170cc1dc7a3Sopenharmony_ci * @brief Round a count down to the largest multiple of 4.
171cc1dc7a3Sopenharmony_ci *
172cc1dc7a3Sopenharmony_ci * @param count   The unrounded value.
173cc1dc7a3Sopenharmony_ci *
174cc1dc7a3Sopenharmony_ci * @return The rounded value.
175cc1dc7a3Sopenharmony_ci */
176cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
177cc1dc7a3Sopenharmony_ci{
178cc1dc7a3Sopenharmony_ci	return count & static_cast<unsigned int>(~(4 - 1));
179cc1dc7a3Sopenharmony_ci}
180cc1dc7a3Sopenharmony_ci
181cc1dc7a3Sopenharmony_ci/**
182cc1dc7a3Sopenharmony_ci * @brief Round a count down to the largest multiple of the SIMD width.
183cc1dc7a3Sopenharmony_ci *
184cc1dc7a3Sopenharmony_ci * Assumption that the vector width is a power of two ...
185cc1dc7a3Sopenharmony_ci *
186cc1dc7a3Sopenharmony_ci * @param count   The unrounded value.
187cc1dc7a3Sopenharmony_ci *
188cc1dc7a3Sopenharmony_ci * @return The rounded value.
189cc1dc7a3Sopenharmony_ci */
190cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
191cc1dc7a3Sopenharmony_ci{
192cc1dc7a3Sopenharmony_ci	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
193cc1dc7a3Sopenharmony_ci}
194cc1dc7a3Sopenharmony_ci
195cc1dc7a3Sopenharmony_ci/**
196cc1dc7a3Sopenharmony_ci * @brief Round a count up to the largest multiple of the SIMD width.
197cc1dc7a3Sopenharmony_ci *
198cc1dc7a3Sopenharmony_ci * Assumption that the vector width is a power of two ...
199cc1dc7a3Sopenharmony_ci *
200cc1dc7a3Sopenharmony_ci * @param count   The unrounded value.
201cc1dc7a3Sopenharmony_ci *
202cc1dc7a3Sopenharmony_ci * @return The rounded value.
203cc1dc7a3Sopenharmony_ci */
204cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
205cc1dc7a3Sopenharmony_ci{
206cc1dc7a3Sopenharmony_ci	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
207cc1dc7a3Sopenharmony_ci	return multiples * ASTCENC_SIMD_WIDTH;
208cc1dc7a3Sopenharmony_ci}
209cc1dc7a3Sopenharmony_ci
210cc1dc7a3Sopenharmony_ci/**
211cc1dc7a3Sopenharmony_ci * @brief Return @c a with lanes negated if the @c b lane is negative.
212cc1dc7a3Sopenharmony_ci */
213cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
214cc1dc7a3Sopenharmony_ci{
215cc1dc7a3Sopenharmony_ci	vint ia = float_as_int(a);
216cc1dc7a3Sopenharmony_ci	vint ib = float_as_int(b);
217cc1dc7a3Sopenharmony_ci	vint sign_mask(static_cast<int>(0x80000000));
218cc1dc7a3Sopenharmony_ci	vint r = ia ^ (ib & sign_mask);
219cc1dc7a3Sopenharmony_ci	return int_as_float(r);
220cc1dc7a3Sopenharmony_ci}
221cc1dc7a3Sopenharmony_ci
222cc1dc7a3Sopenharmony_ci/**
223cc1dc7a3Sopenharmony_ci * @brief Return fast, but approximate, vector atan(x).
224cc1dc7a3Sopenharmony_ci *
225cc1dc7a3Sopenharmony_ci * Max error of this implementation is 0.004883.
226cc1dc7a3Sopenharmony_ci */
227cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat atan(vfloat x)
228cc1dc7a3Sopenharmony_ci{
229cc1dc7a3Sopenharmony_ci	vmask c = abs(x) > vfloat(1.0f);
230cc1dc7a3Sopenharmony_ci	vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
231cc1dc7a3Sopenharmony_ci	vfloat y = select(x, vfloat(1.0f) / x, c);
232cc1dc7a3Sopenharmony_ci	y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
233cc1dc7a3Sopenharmony_ci	return select(y, z - y, c);
234cc1dc7a3Sopenharmony_ci}
235cc1dc7a3Sopenharmony_ci
236cc1dc7a3Sopenharmony_ci/**
237cc1dc7a3Sopenharmony_ci * @brief Return fast, but approximate, vector atan2(x, y).
238cc1dc7a3Sopenharmony_ci */
239cc1dc7a3Sopenharmony_ciASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
240cc1dc7a3Sopenharmony_ci{
241cc1dc7a3Sopenharmony_ci	vfloat z = atan(abs(y / x));
242cc1dc7a3Sopenharmony_ci	vmask xmask = vmask(float_as_int(x).m);
243cc1dc7a3Sopenharmony_ci	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
244cc1dc7a3Sopenharmony_ci}
245cc1dc7a3Sopenharmony_ci
246cc1dc7a3Sopenharmony_ci/*
247cc1dc7a3Sopenharmony_ci * @brief Factory that returns a unit length 4 component vfloat4.
248cc1dc7a3Sopenharmony_ci */
249cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 unit4()
250cc1dc7a3Sopenharmony_ci{
251cc1dc7a3Sopenharmony_ci	return vfloat4(0.5f);
252cc1dc7a3Sopenharmony_ci}
253cc1dc7a3Sopenharmony_ci
254cc1dc7a3Sopenharmony_ci/**
255cc1dc7a3Sopenharmony_ci * @brief Factory that returns a unit length 3 component vfloat4.
256cc1dc7a3Sopenharmony_ci */
257cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 unit3()
258cc1dc7a3Sopenharmony_ci{
259cc1dc7a3Sopenharmony_ci	float val = 0.577350258827209473f;
260cc1dc7a3Sopenharmony_ci	return vfloat4(val, val, val, 0.0f);
261cc1dc7a3Sopenharmony_ci}
262cc1dc7a3Sopenharmony_ci
263cc1dc7a3Sopenharmony_ci/**
264cc1dc7a3Sopenharmony_ci * @brief Factory that returns a unit length 2 component vfloat4.
265cc1dc7a3Sopenharmony_ci */
266cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 unit2()
267cc1dc7a3Sopenharmony_ci{
268cc1dc7a3Sopenharmony_ci	float val = 0.707106769084930420f;
269cc1dc7a3Sopenharmony_ci	return vfloat4(val, val, 0.0f, 0.0f);
270cc1dc7a3Sopenharmony_ci}
271cc1dc7a3Sopenharmony_ci
272cc1dc7a3Sopenharmony_ci/**
273cc1dc7a3Sopenharmony_ci * @brief Factory that returns a 3 component vfloat4.
274cc1dc7a3Sopenharmony_ci */
275cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
276cc1dc7a3Sopenharmony_ci{
277cc1dc7a3Sopenharmony_ci	return vfloat4(a, b, c, 0.0f);
278cc1dc7a3Sopenharmony_ci}
279cc1dc7a3Sopenharmony_ci
280cc1dc7a3Sopenharmony_ci/**
281cc1dc7a3Sopenharmony_ci * @brief Factory that returns a 2 component vfloat4.
282cc1dc7a3Sopenharmony_ci */
283cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
284cc1dc7a3Sopenharmony_ci{
285cc1dc7a3Sopenharmony_ci	return vfloat4(a, b, 0.0f, 0.0f);
286cc1dc7a3Sopenharmony_ci}
287cc1dc7a3Sopenharmony_ci
288cc1dc7a3Sopenharmony_ci/**
289cc1dc7a3Sopenharmony_ci * @brief Normalize a non-zero length vector to unit length.
290cc1dc7a3Sopenharmony_ci */
291cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
292cc1dc7a3Sopenharmony_ci{
293cc1dc7a3Sopenharmony_ci	vfloat4 length = dot(a, a);
294cc1dc7a3Sopenharmony_ci	return a / sqrt(length);
295cc1dc7a3Sopenharmony_ci}
296cc1dc7a3Sopenharmony_ci
297cc1dc7a3Sopenharmony_ci/**
298cc1dc7a3Sopenharmony_ci * @brief Normalize a vector, returning @c safe if len is zero.
299cc1dc7a3Sopenharmony_ci */
300cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
301cc1dc7a3Sopenharmony_ci{
302cc1dc7a3Sopenharmony_ci	vfloat4 length = dot(a, a);
303cc1dc7a3Sopenharmony_ci	if (length.lane<0>() != 0.0f)
304cc1dc7a3Sopenharmony_ci	{
305cc1dc7a3Sopenharmony_ci		return a / sqrt(length);
306cc1dc7a3Sopenharmony_ci	}
307cc1dc7a3Sopenharmony_ci
308cc1dc7a3Sopenharmony_ci	return safe;
309cc1dc7a3Sopenharmony_ci}
310cc1dc7a3Sopenharmony_ci
311cc1dc7a3Sopenharmony_ci
312cc1dc7a3Sopenharmony_ci
313cc1dc7a3Sopenharmony_ci#define POLY0(x, c0)                     (                                     c0)
314cc1dc7a3Sopenharmony_ci#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * x)                 + c0)
315cc1dc7a3Sopenharmony_ci#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * x)             + c0)
316cc1dc7a3Sopenharmony_ci#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * x)         + c0)
317cc1dc7a3Sopenharmony_ci#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * x)     + c0)
318cc1dc7a3Sopenharmony_ci#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
319cc1dc7a3Sopenharmony_ci
320cc1dc7a3Sopenharmony_ci/**
321cc1dc7a3Sopenharmony_ci * @brief Compute an approximate exp2(x) for each lane in the vector.
322cc1dc7a3Sopenharmony_ci *
323cc1dc7a3Sopenharmony_ci * Based on 5th degree minimax polynomials, ported from this blog
324cc1dc7a3Sopenharmony_ci * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
325cc1dc7a3Sopenharmony_ci */
326cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
327cc1dc7a3Sopenharmony_ci{
328cc1dc7a3Sopenharmony_ci	x = clamp(-126.99999f, 129.0f, x);
329cc1dc7a3Sopenharmony_ci
330cc1dc7a3Sopenharmony_ci	vint4 ipart = float_to_int(x - 0.5f);
331cc1dc7a3Sopenharmony_ci	vfloat4 fpart = x - int_to_float(ipart);
332cc1dc7a3Sopenharmony_ci
333cc1dc7a3Sopenharmony_ci	// Integer contrib, using 1 << ipart
334cc1dc7a3Sopenharmony_ci	vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
335cc1dc7a3Sopenharmony_ci
336cc1dc7a3Sopenharmony_ci	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
337cc1dc7a3Sopenharmony_ci	vfloat4 fexp = POLY5(fpart,
338cc1dc7a3Sopenharmony_ci	                     9.9999994e-1f,
339cc1dc7a3Sopenharmony_ci	                     6.9315308e-1f,
340cc1dc7a3Sopenharmony_ci	                     2.4015361e-1f,
341cc1dc7a3Sopenharmony_ci	                     5.5826318e-2f,
342cc1dc7a3Sopenharmony_ci	                     8.9893397e-3f,
343cc1dc7a3Sopenharmony_ci	                     1.8775767e-3f);
344cc1dc7a3Sopenharmony_ci
345cc1dc7a3Sopenharmony_ci	return iexp * fexp;
346cc1dc7a3Sopenharmony_ci}
347cc1dc7a3Sopenharmony_ci
348cc1dc7a3Sopenharmony_ci/**
349cc1dc7a3Sopenharmony_ci * @brief Compute an approximate log2(x) for each lane in the vector.
350cc1dc7a3Sopenharmony_ci *
351cc1dc7a3Sopenharmony_ci * Based on 5th degree minimax polynomials, ported from this blog
352cc1dc7a3Sopenharmony_ci * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
353cc1dc7a3Sopenharmony_ci */
354cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
355cc1dc7a3Sopenharmony_ci{
356cc1dc7a3Sopenharmony_ci	vint4 exp(0x7F800000);
357cc1dc7a3Sopenharmony_ci	vint4 mant(0x007FFFFF);
358cc1dc7a3Sopenharmony_ci	vint4 one(0x3F800000);
359cc1dc7a3Sopenharmony_ci
360cc1dc7a3Sopenharmony_ci	vint4 i = float_as_int(x);
361cc1dc7a3Sopenharmony_ci
362cc1dc7a3Sopenharmony_ci	vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
363cc1dc7a3Sopenharmony_ci
364cc1dc7a3Sopenharmony_ci	vfloat4 m = int_as_float((i & mant) | one);
365cc1dc7a3Sopenharmony_ci
366cc1dc7a3Sopenharmony_ci	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
367cc1dc7a3Sopenharmony_ci	vfloat4 p = POLY4(m,
368cc1dc7a3Sopenharmony_ci	                  2.8882704548164776201f,
369cc1dc7a3Sopenharmony_ci	                 -2.52074962577807006663f,
370cc1dc7a3Sopenharmony_ci	                  1.48116647521213171641f,
371cc1dc7a3Sopenharmony_ci	                 -0.465725644288844778798f,
372cc1dc7a3Sopenharmony_ci	                  0.0596515482674574969533f);
373cc1dc7a3Sopenharmony_ci
374cc1dc7a3Sopenharmony_ci	// Increases the polynomial degree, but ensures that log2(1) == 0
375cc1dc7a3Sopenharmony_ci	p = p * (m - 1.0f);
376cc1dc7a3Sopenharmony_ci
377cc1dc7a3Sopenharmony_ci	return p + e;
378cc1dc7a3Sopenharmony_ci}
379cc1dc7a3Sopenharmony_ci
380cc1dc7a3Sopenharmony_ci/**
381cc1dc7a3Sopenharmony_ci * @brief Compute an approximate pow(x, y) for each lane in the vector.
382cc1dc7a3Sopenharmony_ci *
383cc1dc7a3Sopenharmony_ci * Power function based on the exp2(log2(x) * y) transform.
384cc1dc7a3Sopenharmony_ci */
385cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
386cc1dc7a3Sopenharmony_ci{
387cc1dc7a3Sopenharmony_ci	vmask4 zero_mask = y == vfloat4(0.0f);
388cc1dc7a3Sopenharmony_ci	vfloat4 estimate = exp2(log2(x) * y);
389cc1dc7a3Sopenharmony_ci
390cc1dc7a3Sopenharmony_ci	// Guarantee that y == 0 returns exactly 1.0f
391cc1dc7a3Sopenharmony_ci	return select(estimate, vfloat4(1.0f), zero_mask);
392cc1dc7a3Sopenharmony_ci}
393cc1dc7a3Sopenharmony_ci
394cc1dc7a3Sopenharmony_ci/**
395cc1dc7a3Sopenharmony_ci * @brief Count the leading zeros for each lane in @c a.
396cc1dc7a3Sopenharmony_ci *
397cc1dc7a3Sopenharmony_ci * Valid for all data values of @c a; will return a per-lane value [0, 32].
398cc1dc7a3Sopenharmony_ci */
399cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
400cc1dc7a3Sopenharmony_ci{
401cc1dc7a3Sopenharmony_ci	// This function is a horrible abuse of floating point exponents to convert
402cc1dc7a3Sopenharmony_ci	// the original integer value into a 2^N encoding we can recover easily.
403cc1dc7a3Sopenharmony_ci
404cc1dc7a3Sopenharmony_ci	// Convert to float without risk of rounding up by keeping only top 8 bits.
405cc1dc7a3Sopenharmony_ci	// This trick is is guaranteed to keep top 8 bits and clear the 9th.
406cc1dc7a3Sopenharmony_ci	a = (~lsr<8>(a)) & a;
407cc1dc7a3Sopenharmony_ci	a = float_as_int(int_to_float(a));
408cc1dc7a3Sopenharmony_ci
409cc1dc7a3Sopenharmony_ci	// Extract and unbias exponent
410cc1dc7a3Sopenharmony_ci	a = vint4(127 + 31) - lsr<23>(a);
411cc1dc7a3Sopenharmony_ci
412cc1dc7a3Sopenharmony_ci	// Clamp result to a valid 32-bit range
413cc1dc7a3Sopenharmony_ci	return clamp(0, 32, a);
414cc1dc7a3Sopenharmony_ci}
415cc1dc7a3Sopenharmony_ci
416cc1dc7a3Sopenharmony_ci/**
417cc1dc7a3Sopenharmony_ci * @brief Return lanewise 2^a for each lane in @c a.
418cc1dc7a3Sopenharmony_ci *
419cc1dc7a3Sopenharmony_ci * Use of signed int means that this is only valid for values in range [0, 31].
420cc1dc7a3Sopenharmony_ci */
421cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
422cc1dc7a3Sopenharmony_ci{
423cc1dc7a3Sopenharmony_ci	// 2^30 is the largest signed number than can be represented
424cc1dc7a3Sopenharmony_ci	assert(all(a < vint4(31)));
425cc1dc7a3Sopenharmony_ci
426cc1dc7a3Sopenharmony_ci	// This function is a horrible abuse of floating point to use the exponent
427cc1dc7a3Sopenharmony_ci	// and float conversion to generate a 2^N multiple.
428cc1dc7a3Sopenharmony_ci
429cc1dc7a3Sopenharmony_ci	// Bias the exponent
430cc1dc7a3Sopenharmony_ci	vint4 exp = a + 127;
431cc1dc7a3Sopenharmony_ci	exp = lsl<23>(exp);
432cc1dc7a3Sopenharmony_ci
433cc1dc7a3Sopenharmony_ci	// Reinterpret the bits as a float, and then convert to an int
434cc1dc7a3Sopenharmony_ci	vfloat4 f = int_as_float(exp);
435cc1dc7a3Sopenharmony_ci	return float_to_int(f);
436cc1dc7a3Sopenharmony_ci}
437cc1dc7a3Sopenharmony_ci
438cc1dc7a3Sopenharmony_ci/**
439cc1dc7a3Sopenharmony_ci * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
440cc1dc7a3Sopenharmony_ci */
441cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
442cc1dc7a3Sopenharmony_ci{
443cc1dc7a3Sopenharmony_ci	vint4 fp16_one = vint4(0x3C00);
444cc1dc7a3Sopenharmony_ci	vint4 fp16_small = lsl<8>(p);
445cc1dc7a3Sopenharmony_ci
446cc1dc7a3Sopenharmony_ci	vmask4 is_one = p == vint4(0xFFFF);
447cc1dc7a3Sopenharmony_ci	vmask4 is_small = p < vint4(4);
448cc1dc7a3Sopenharmony_ci
449cc1dc7a3Sopenharmony_ci	// Manually inline clz() on Visual Studio to avoid release build codegen bug
450cc1dc7a3Sopenharmony_ci	// see https://github.com/ARM-software/astc-encoder/issues/259
451cc1dc7a3Sopenharmony_ci#if !defined(__clang__) && defined(_MSC_VER)
452cc1dc7a3Sopenharmony_ci	vint4 a = (~lsr<8>(p)) & p;
453cc1dc7a3Sopenharmony_ci	a = float_as_int(int_to_float(a));
454cc1dc7a3Sopenharmony_ci	a = vint4(127 + 31) - lsr<23>(a);
455cc1dc7a3Sopenharmony_ci	vint4 lz = clamp(0, 32, a) - 16;
456cc1dc7a3Sopenharmony_ci#else
457cc1dc7a3Sopenharmony_ci	vint4 lz = clz(p) - 16;
458cc1dc7a3Sopenharmony_ci#endif
459cc1dc7a3Sopenharmony_ci
460cc1dc7a3Sopenharmony_ci	p = p * two_to_the_n(lz + 1);
461cc1dc7a3Sopenharmony_ci	p = p & vint4(0xFFFF);
462cc1dc7a3Sopenharmony_ci
463cc1dc7a3Sopenharmony_ci	p = lsr<6>(p);
464cc1dc7a3Sopenharmony_ci
465cc1dc7a3Sopenharmony_ci	p = p | lsl<10>(vint4(14) - lz);
466cc1dc7a3Sopenharmony_ci
467cc1dc7a3Sopenharmony_ci	vint4 r = select(p, fp16_one, is_one);
468cc1dc7a3Sopenharmony_ci	r = select(r, fp16_small, is_small);
469cc1dc7a3Sopenharmony_ci	return r;
470cc1dc7a3Sopenharmony_ci}
471cc1dc7a3Sopenharmony_ci
472cc1dc7a3Sopenharmony_ci/**
473cc1dc7a3Sopenharmony_ci * @brief Convert 16-bit LNS to float16.
474cc1dc7a3Sopenharmony_ci */
475cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
476cc1dc7a3Sopenharmony_ci{
477cc1dc7a3Sopenharmony_ci	vint4 mc = p & 0x7FF;
478cc1dc7a3Sopenharmony_ci	vint4 ec = lsr<11>(p);
479cc1dc7a3Sopenharmony_ci
480cc1dc7a3Sopenharmony_ci	vint4 mc_512 = mc * 3;
481cc1dc7a3Sopenharmony_ci	vmask4 mask_512 = mc < vint4(512);
482cc1dc7a3Sopenharmony_ci
483cc1dc7a3Sopenharmony_ci	vint4 mc_1536 = mc * 4 - 512;
484cc1dc7a3Sopenharmony_ci	vmask4 mask_1536 = mc < vint4(1536);
485cc1dc7a3Sopenharmony_ci
486cc1dc7a3Sopenharmony_ci	vint4 mc_else = mc * 5 - 2048;
487cc1dc7a3Sopenharmony_ci
488cc1dc7a3Sopenharmony_ci	vint4 mt = mc_else;
489cc1dc7a3Sopenharmony_ci	mt = select(mt, mc_1536, mask_1536);
490cc1dc7a3Sopenharmony_ci	mt = select(mt, mc_512, mask_512);
491cc1dc7a3Sopenharmony_ci
492cc1dc7a3Sopenharmony_ci	vint4 res = lsl<10>(ec) | lsr<3>(mt);
493cc1dc7a3Sopenharmony_ci	return min(res, vint4(0x7BFF));
494cc1dc7a3Sopenharmony_ci}
495cc1dc7a3Sopenharmony_ci
496cc1dc7a3Sopenharmony_ci/**
497cc1dc7a3Sopenharmony_ci * @brief Extract mantissa and exponent of a float value.
498cc1dc7a3Sopenharmony_ci *
499cc1dc7a3Sopenharmony_ci * @param      a      The input value.
500cc1dc7a3Sopenharmony_ci * @param[out] exp    The output exponent.
501cc1dc7a3Sopenharmony_ci *
502cc1dc7a3Sopenharmony_ci * @return The mantissa.
503cc1dc7a3Sopenharmony_ci */
504cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
505cc1dc7a3Sopenharmony_ci{
506cc1dc7a3Sopenharmony_ci	// Interpret the bits as an integer
507cc1dc7a3Sopenharmony_ci	vint4 ai = float_as_int(a);
508cc1dc7a3Sopenharmony_ci
509cc1dc7a3Sopenharmony_ci	// Extract and unbias the exponent
510cc1dc7a3Sopenharmony_ci	exp = (lsr<23>(ai) & 0xFF) - 126;
511cc1dc7a3Sopenharmony_ci
512cc1dc7a3Sopenharmony_ci	// Extract and unbias the mantissa
513cc1dc7a3Sopenharmony_ci	vint4 manti = (ai &  static_cast<int>(0x807FFFFF)) | 0x3F000000;
514cc1dc7a3Sopenharmony_ci	return int_as_float(manti);
515cc1dc7a3Sopenharmony_ci}
516cc1dc7a3Sopenharmony_ci
517cc1dc7a3Sopenharmony_ci/**
518cc1dc7a3Sopenharmony_ci * @brief Convert float to 16-bit LNS.
519cc1dc7a3Sopenharmony_ci */
520cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
521cc1dc7a3Sopenharmony_ci{
522cc1dc7a3Sopenharmony_ci	vint4 exp;
523cc1dc7a3Sopenharmony_ci	vfloat4 mant = frexp(a, exp);
524cc1dc7a3Sopenharmony_ci
525cc1dc7a3Sopenharmony_ci	// Do these early before we start messing about ...
526cc1dc7a3Sopenharmony_ci	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
527cc1dc7a3Sopenharmony_ci	vmask4 mask_infinity = a >= vfloat4(65536.0f);
528cc1dc7a3Sopenharmony_ci
529cc1dc7a3Sopenharmony_ci	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
530cc1dc7a3Sopenharmony_ci	vmask4 exp_lt_m13 = exp < vint4(-13);
531cc1dc7a3Sopenharmony_ci
532cc1dc7a3Sopenharmony_ci	vfloat4 a1a = a * 33554432.0f;
533cc1dc7a3Sopenharmony_ci	vint4 expa = vint4::zero();
534cc1dc7a3Sopenharmony_ci
535cc1dc7a3Sopenharmony_ci	vfloat4 a1b = (mant - 0.5f) * 4096;
536cc1dc7a3Sopenharmony_ci	vint4 expb = exp + 14;
537cc1dc7a3Sopenharmony_ci
538cc1dc7a3Sopenharmony_ci	a = select(a1b, a1a, exp_lt_m13);
539cc1dc7a3Sopenharmony_ci	exp = select(expb, expa, exp_lt_m13);
540cc1dc7a3Sopenharmony_ci
541cc1dc7a3Sopenharmony_ci	vmask4 a_lt_384 = a < vfloat4(384.0f);
542cc1dc7a3Sopenharmony_ci	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
543cc1dc7a3Sopenharmony_ci
544cc1dc7a3Sopenharmony_ci	vfloat4 a2a = a * (4.0f / 3.0f);
545cc1dc7a3Sopenharmony_ci	vfloat4 a2b = a + 128.0f;
546cc1dc7a3Sopenharmony_ci	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
547cc1dc7a3Sopenharmony_ci
548cc1dc7a3Sopenharmony_ci	a = a2c;
549cc1dc7a3Sopenharmony_ci	a = select(a, a2b, a_lt_1408);
550cc1dc7a3Sopenharmony_ci	a = select(a, a2a, a_lt_384);
551cc1dc7a3Sopenharmony_ci
552cc1dc7a3Sopenharmony_ci	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
553cc1dc7a3Sopenharmony_ci
554cc1dc7a3Sopenharmony_ci	a = select(a, vfloat4(65535.0f), mask_infinity);
555cc1dc7a3Sopenharmony_ci	a = select(a, vfloat4::zero(), mask_underflow_nan);
556cc1dc7a3Sopenharmony_ci
557cc1dc7a3Sopenharmony_ci	return a;
558cc1dc7a3Sopenharmony_ci}
559cc1dc7a3Sopenharmony_ci
560cc1dc7a3Sopenharmony_cinamespace astc
561cc1dc7a3Sopenharmony_ci{
562cc1dc7a3Sopenharmony_ci
563cc1dc7a3Sopenharmony_cistatic ASTCENC_SIMD_INLINE float pow(float x, float y)
564cc1dc7a3Sopenharmony_ci{
565cc1dc7a3Sopenharmony_ci	return pow(vfloat4(x), vfloat4(y)).lane<0>();
566cc1dc7a3Sopenharmony_ci}
567cc1dc7a3Sopenharmony_ci
568cc1dc7a3Sopenharmony_ci}
569cc1dc7a3Sopenharmony_ci
570cc1dc7a3Sopenharmony_ci#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
571