// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for
 * Armv8-A NEON.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */

#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>
#include <cstring>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() if the data is aligned to the vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
	{
		m = vld1q_f32(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m = vdupq_n_f32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		float v[4] { a, b, c, d };
		m = vld1q_f32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return vgetq_lane_f32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
		m = vsetq_lane_f32(a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(vdupq_n_f32(0.0f));
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(vld1q_dup_f32(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(vld1q_f32(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
	{
		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
		return vfloat4(vld1q_f32(data));
	}

	/**
	 * @brief Return a swizzled float 2.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
	}

	/**
	 * @brief Return a swizzled float 3.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
	}

	/**
	 * @brief Return a swizzled float 4.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
	}

	/**
	 * @brief The vector ...
	 */
	float32x4_t m;
};
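
// Illustrative usage sketch (assumed example values, not part of the API):
// lane 0 is the LSB of the register, and swz() reorders lanes.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);   // lane 0 = 1.0f ... lane 3 = 4.0f
//     float x = v.lane<2>();               // 3.0f
//     vfloat4 zyx = v.swz<2, 1, 0>();      // (3.0f, 2.0f, 1.0f, 0.0f)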

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() if the data is aligned to the vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
	{
		m = vld1q_s32(p);
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
	{
		// Cast is safe - NEON loads are allowed to be unaligned
		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
		m = vreinterpretq_s32_u32(vmovl_u16(t16));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using vint4::zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m = vdupq_n_s32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		int v[4] { a, b, c, d };
		m = vld1q_s32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return vgetq_lane_s32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
		m = vsetq_lane_s32(a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(0);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
	{
		vint4 data;
		std::memcpy(&data.m, p, 4 * sizeof(int));
		return data;
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		alignas(16) static const int data[4] { 0, 1, 2, 3 };
		return vint4(vld1q_s32(data));
	}

	/**
	 * @brief The vector ...
	 */
	int32x4_t m;
};
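
// Illustrative usage sketch (assumed example values, not part of the API):
// the uint8_t constructor zero-extends 4 packed bytes into one int per lane.
//
//     const uint8_t bytes[4] { 0x10, 0x20, 0x30, 0xFF };
//     vint4 v(bytes);           // lanes = (0x10, 0x20, 0x30, 0xFF)
//     int last = v.lane<3>();   // 255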

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
	{
		m = a;
	}

#if !defined(_MSC_VER)
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
	{
		m = vreinterpretq_u32_s32(a);
	}
#endif

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		int v[4] {
			a == true ? -1 : 0,
			b == true ? -1 : 0,
			c == true ? -1 : 0,
			d == true ? -1 : 0
		};

		int32x4_t ms = vld1q_s32(v);
		m = vreinterpretq_u32_s32(ms);
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
	{
		return vgetq_lane_u32(m, l) != 0;
	}

	/**
	 * @brief The vector ...
	 */
	uint32x4_t m;
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(vorrq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(vandq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(veorq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(vmvnq_u32(a.m));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
	static const int shifta[4] { 0, 1, 2, 3 };
	static const int32x4_t shift = vld1q_s32(shifta);

	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
	return vaddvq_u32(vshlq_u32(tmp, shift));
}
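
// Illustrative usage sketch (assumed example values, not part of the API):
// mask() packs one bit per lane, with lane 0 in bit 0.
//
//     vmask4 m(false, false, true, true);
//     unsigned int bits = mask(m);   // 0b1100: lanes 2 and 3 are active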

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(vaddq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(vsubq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
	return vint4(vmulq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(vmvnq_s32(a.m));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(vorrq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(vandq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(veorq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return ~vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(vcltq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(vcgtq_s32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	uint32x4_t ua = vreinterpretq_u32_s32(a.m);
	ua = vshlq_u32(ua, vdupq_n_s32(-s));
	return vint4(vreinterpretq_s32_u32(ua));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
}
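
// Illustrative usage sketch (assumed example value, not part of the API):
// lsr() shifts in zero bits, asr() replicates the sign bit.
//
//     vint4 v(-8);            // 0xFFFFFFF8 in every lane
//     vint4 l = lsr<1>(v);    // 0x7FFFFFFC per lane (zero fill)
//     vint4 a = asr<1>(v);    // -4 per lane (sign fill)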

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
	return vint4(vminq_s32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
	return vint4(vmaxq_s32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	return vint4(vminvq_s32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	return vint4(vmaxvq_s32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
	int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
	return vget_lane_s32(vpadd_s32(t, t), 0);
}

/**
 * @brief Return the horizontal sum of the mask lane values.
 */
ASTCENC_SIMD_INLINE uint32_t hadd_s(vmask4 a)
{
	// Use the native NEON horizontal add
	return vaddvq_u32(a.m);
}

#define ASTCENC_USE_NATIVE_ADDV
/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a)
{
	// Use the native NEON horizontal add
	return vaddvq_f32(a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, &a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
	alignas(16) int idx[4];
	storea(indices, idx);
	alignas(16) int vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];
	return vint4(vals);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
	uint8x16_t idx = {0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
	int8x16_t av = vreinterpretq_s8_s32(a.m);
	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
}
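
// Illustrative usage sketch (assumed example values, not part of the API):
// pack_low_bytes() gathers the low byte of each lane into lane 0.
//
//     vint4 v(0x11, 0x22, 0x33, 0x44);
//     vint4 p = pack_low_bytes(v);   // lane 0 == 0x44332211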

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	return vint4(vbslq_s32(cond.m, b.m, a.m));
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(vaddq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(vsubq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(vmulq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(vdivq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(vceqq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(vcltq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgtq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcleq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgeq_f32(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If one lane value is NaN, the other lane value is returned for that lane;
 * NaN is only returned if both inputs are NaN.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	// vminnm returns the numeric operand if only one input is NaN
	return vfloat4(vminnmq_f32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If one lane value is NaN, the other lane value is returned for that lane;
 * NaN is only returned if both inputs are NaN.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	// vmaxnm returns the numeric operand if only one input is NaN
	return vfloat4(vmaxnmq_f32(a.m, b.m));
}
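
// Illustrative usage sketch (assumed example values, not part of the API):
//
//     vfloat4 a(1.0f, std::numeric_limits<float>::quiet_NaN(), 3.0f, 4.0f);
//     vfloat4 b(2.0f, 2.0f, 2.0f, 2.0f);
//     vfloat4 m = min(a, b);   // (1.0f, 2.0f, 2.0f, 2.0f); NaN lane takes b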

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	float32x4_t zero = vdupq_n_f32(0.0f);
	float32x4_t inv = vsubq_f32(zero, a.m);
	return vfloat4(vmaxq_f32(a.m, inv));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
	return vfloat4(vrndnq_f32(a.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	return vfloat4(vminvq_f32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	return vfloat4(vmaxvq_f32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Perform halving add to ensure invariance; we cannot use vaddvq as this
	// does (0 + 1 + 2 + 3) which is not invariant with x86 (0 + 2) + (1 + 3).
	float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
	return vget_lane_f32(vpadd_f32(t, t), 0);
}
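
// Illustrative note (assumed example values, not part of the API): the
// reduction above evaluates as (lane0 + lane2) + (lane1 + lane3), matching
// the pairing used by the SSE implementation so results stay bit-identical.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     float s = hadd_s(v);   // computed as (1 + 3) + (2 + 4) == 10.0f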

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(vsqrtq_f32(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
	uint32x4_t mask = vcgeq_u32(cond.m, msb);
	return vfloat4(vbslq_f32(mask, b.m, a.m));
}
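
// Illustrative usage sketch (assumed example values, not part of the API):
// select() uses every bit of the mask as-is; select_msb() only tests the top
// bit of each lane.
//
//     vfloat4 a(1.0f, 1.0f, 1.0f, 1.0f);
//     vfloat4 b(9.0f, 9.0f, 9.0f, 9.0f);
//     vmask4 sel(false, true, false, true);
//     vfloat4 r = select(a, b, sel);   // (1.0f, 9.0f, 1.0f, 9.0f)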

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
	alignas(16) int idx[4];
	storea(indices, idx);
	alignas(16) float vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];
	return vfloat4(vals);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
	a = a + vfloat4(0.5f);
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(vcvtq_f32_s32(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
	// Generate the half-precision values
	float16x4_t f16 = vcvt_f16_f32(a.m);

	// Convert each 16-bit float pattern to a 32-bit pattern
	uint16x4_t u16 = vreinterpret_u16_f16(f16);
	uint32x4_t u32 = vmovl_u16(u16);
	return vint4(vreinterpretq_s32_u32(u32));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
	vfloat4 av(a);
	return static_cast<uint16_t>(float_to_float16(av).lane<0>());
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
	// Convert each 32-bit pattern to a 16-bit float pattern
	uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
	uint16x4_t u16 = vmovn_u32(u32);
	float16x4_t f16 = vreinterpret_f16_u16(u16);

	// Convert the half-precision values to full floats
	return vfloat4(vcvt_f32_f16(f16));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
	vint4 av(a);
	return float16_to_float(av).lane<0>();
}
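
// Illustrative usage sketch (assumed example value, not part of the API):
// the float16 helpers store one 16-bit pattern per 32-bit lane.
//
//     uint16_t h = float_to_float16(1.5f);   // 0x3E00 (1.5 in binary16)
//     float f = float16_to_float(h);         // 1.5f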

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	return vint4(vreinterpretq_s32_f32(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
	return vfloat4(vreinterpretq_f32_s32(v.m));
}
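
// Illustrative usage sketch (assumed example values, not part of the API):
// clearing the IEEE 754 sign bit via the bit-pattern round trip gives an
// absolute value.
//
//     vfloat4 v(-2.0f, 3.0f, -4.0f, 5.0f);
//     vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);
//     vfloat4 av = int_as_float(bits);   // (2.0f, 3.0f, 4.0f, 5.0f)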

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
	t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
	t0p = t0;
	t1p = t1;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
	t0p = t0;
	t1p = t1;
	t2p = t2;
	t3p = t3;
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
	int8x16_t table {
		vreinterpretq_s8_s32(t0.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
	int8x16x2_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
	int8x16x4_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m),
		vreinterpretq_s8_s32(t2.m),
		vreinterpretq_s8_s32(t3.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
}
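
// Illustrative usage sketch (assumed example values, not part of the API):
// each 32-bit index selects one byte from the table and returns it zero-
// extended, because the three high index bytes are forced out of range.
//
//     vint4 t0(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C);
//     vint4 t0p;
//     vtable_prepare(t0, t0p);
//     vint4 r = vtable_8bt_32bi(t0p, vint4(0, 5, 10, 15));   // (0, 5, 10, 15)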

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
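
// Illustrative usage sketch (assumed example values, not part of the API):
// each output lane packs one texel as 0xAABBGGRR.
//
//     vint4 r(0x11), g(0x22), b(0x33), a(0x44);
//     vint4 texels = interleave_rgba8(r, g, b, a);   // 0x44332211 per lane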

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
	if (mask.lane<3>())
	{
		store(data, base);
	}
	else if (mask.lane<2>())
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
		store_lane(base + 8, data.lane<2>());
	}
	else if (mask.lane<1>())
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
	}
	else if (mask.lane<0>())
	{
		store_lane(base + 0, data.lane<0>());
	}
}
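
// Illustrative usage sketch (assumed example values, not part of the API):
// with only the low two lanes active, only the first 8 bytes are written.
//
//     uint8_t buffer[16] { };
//     vint4 data(1, 2, 3, 4);
//     vmask4 keep(true, true, false, false);
//     store_lanes_masked(buffer, data, keep);   // writes lanes 0 and 1 only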

#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
	return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
}

/**
 * @brief Population bit count.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64x2_t v)
{
	return static_cast<int>(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u64(v))));
}

/**
 * @brief Population bit count.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(vmask4 v)
{
	return static_cast<int>(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u32(v.m))));
}

#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED