1// SPDX-License-Identifier: Apache-2.0
2// ----------------------------------------------------------------------------
3// Copyright 2011-2023 Arm Limited
4//
5// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6// use this file except in compliance with the License. You may obtain a copy
7// of the License at:
8//
9//     http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14// License for the specific language governing permissions and limitations
15// under the License.
16// ----------------------------------------------------------------------------
17
18/**
19 * @brief Functions for finding dominant direction of a set of colors.
20 */
21#if !defined(ASTCENC_DECOMPRESS_ONLY)
22
23#include "astcenc_internal.h"
24
25#include <cassert>
26
27/**
28 * @brief Compute the average RGB color of each partition.
29 *
30 * The algorithm here uses a vectorized sequential scan and per-partition
31 * color accumulators, using select() to mask texel lanes in other partitions.
32 *
33 * We only accumulate sums for N-1 partitions during the scan; the value for
34 * the last partition can be computed given that we know the block-wide average
35 * already.
36 *
37 * Because of this we could reduce the loop iteration count so it "just" spans
38 * the max texel index needed for the N-1 partitions, which could need fewer
39 * iterations than the full block texel count. However, this makes the loop
40 * count erratic and causes more branch mispredictions so is a net loss.
41 *
42 * @param      pi         The partitioning to use.
43 * @param      blk        The block data to process.
44 * @param[out] averages   The output averages. Unused partition indices will
45 *                        not be initialized, and lane<3> will be zero.
46 */
47static void compute_partition_averages_rgb(
48	const partition_info& pi,
49	const image_block& blk,
50	vfloat4 averages[BLOCK_MAX_PARTITIONS]
51) {
52	unsigned int partition_count = pi.partition_count;
53	unsigned int texel_count = blk.texel_count;
54	promise(texel_count > 0);
55
56	// For 1 partition just use the precomputed mean
57	if (partition_count == 1)
58	{
59		averages[0] = blk.data_mean.swz<0, 1, 2>();
60	}
61	// For 2 partitions scan results for partition 0, compute partition 1
62	else if (partition_count == 2)
63	{
64		vfloatacc pp_avg_rgb[3] {};
65
66		vint lane_id = vint::lane_id();
67		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
68		{
69			vint texel_partition(pi.partition_of_texel + i);
70
71			vmask lane_mask = lane_id < vint(texel_count);
72			lane_id += vint(ASTCENC_SIMD_WIDTH);
73
74			vmask p0_mask = lane_mask & (texel_partition == vint(0));
75
76			vfloat data_r = loada(blk.data_r + i);
77			haccumulate(pp_avg_rgb[0], data_r, p0_mask);
78
79			vfloat data_g = loada(blk.data_g + i);
80			haccumulate(pp_avg_rgb[1], data_g, p0_mask);
81
82			vfloat data_b = loada(blk.data_b + i);
83			haccumulate(pp_avg_rgb[2], data_b, p0_mask);
84		}
85
86		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
87
88		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
89		                           hadd_s(pp_avg_rgb[1]),
90		                           hadd_s(pp_avg_rgb[2]));
91
92		vfloat4 p1_total = block_total - p0_total;
93
94		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
95		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
96	}
97	// For 3 partitions scan results for partition 0/1, compute partition 2
98	else if (partition_count == 3)
99	{
100		vfloatacc pp_avg_rgb[2][3] {};
101
102		vint lane_id = vint::lane_id();
103		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
104		{
105			vint texel_partition(pi.partition_of_texel + i);
106
107			vmask lane_mask = lane_id < vint(texel_count);
108			lane_id += vint(ASTCENC_SIMD_WIDTH);
109
110			vmask p0_mask = lane_mask & (texel_partition == vint(0));
111			vmask p1_mask = lane_mask & (texel_partition == vint(1));
112
113			vfloat data_r = loada(blk.data_r + i);
114			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
115			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
116
117			vfloat data_g = loada(blk.data_g + i);
118			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
119			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
120
121			vfloat data_b = loada(blk.data_b + i);
122			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
123			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
124		}
125
126		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
127
128		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
129		                           hadd_s(pp_avg_rgb[0][1]),
130		                           hadd_s(pp_avg_rgb[0][2]));
131
132		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
133		                           hadd_s(pp_avg_rgb[1][1]),
134		                           hadd_s(pp_avg_rgb[1][2]));
135
136		vfloat4 p2_total = block_total - p0_total - p1_total;
137
138		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
139		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
140		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
141	}
142	else
143	{
144		// For 4 partitions scan results for partition 0/1/2, compute partition 3
145		vfloatacc pp_avg_rgb[3][3] {};
146
147		vint lane_id = vint::lane_id();
148		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
149		{
150			vint texel_partition(pi.partition_of_texel + i);
151
152			vmask lane_mask = lane_id < vint(texel_count);
153			lane_id += vint(ASTCENC_SIMD_WIDTH);
154
155			vmask p0_mask = lane_mask & (texel_partition == vint(0));
156			vmask p1_mask = lane_mask & (texel_partition == vint(1));
157			vmask p2_mask = lane_mask & (texel_partition == vint(2));
158
159			vfloat data_r = loada(blk.data_r + i);
160			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
161			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
162			haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
163
164			vfloat data_g = loada(blk.data_g + i);
165			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
166			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
167			haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
168
169			vfloat data_b = loada(blk.data_b + i);
170			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
171			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
172			haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
173		}
174
175		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
176
177		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
178		                           hadd_s(pp_avg_rgb[0][1]),
179		                           hadd_s(pp_avg_rgb[0][2]));
180
181		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
182		                           hadd_s(pp_avg_rgb[1][1]),
183		                           hadd_s(pp_avg_rgb[1][2]));
184
185		vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
186		                           hadd_s(pp_avg_rgb[2][1]),
187		                           hadd_s(pp_avg_rgb[2][2]));
188
189		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
190
191		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
192		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
193		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
194		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
195	}
196}
197
198/**
199 * @brief Compute the average RGBA color of each partition.
200 *
201 * The algorithm here uses a vectorized sequential scan and per-partition
202 * color accumulators, using select() to mask texel lanes in other partitions.
203 *
204 * We only accumulate sums for N-1 partitions during the scan; the value for
205 * the last partition can be computed given that we know the block-wide average
206 * already.
207 *
208 * Because of this we could reduce the loop iteration count so it "just" spans
209 * the max texel index needed for the N-1 partitions, which could need fewer
210 * iterations than the full block texel count. However, this makes the loop
211 * count erratic and causes more branch mispredictions so is a net loss.
212 *
213 * @param      pi         The partitioning to use.
214 * @param      blk        The block data to process.
215 * @param[out] averages   The output averages. Unused partition indices will
216 *                        not be initialized.
217 */
218static void compute_partition_averages_rgba(
219	const partition_info& pi,
220	const image_block& blk,
221	vfloat4 averages[BLOCK_MAX_PARTITIONS]
222) {
223	unsigned int partition_count = pi.partition_count;
224	unsigned int texel_count = blk.texel_count;
225	promise(texel_count > 0);
226
227	// For 1 partition just use the precomputed mean
228	if (partition_count == 1)
229	{
230		averages[0] = blk.data_mean;
231	}
232	// For 2 partitions scan results for partition 0, compute partition 1
233	else if (partition_count == 2)
234	{
235		vfloat4 pp_avg_rgba[4] {};
236
237		vint lane_id = vint::lane_id();
238		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
239		{
240			vint texel_partition(pi.partition_of_texel + i);
241
242			vmask lane_mask = lane_id < vint(texel_count);
243			lane_id += vint(ASTCENC_SIMD_WIDTH);
244
245			vmask p0_mask = lane_mask & (texel_partition == vint(0));
246
247			vfloat data_r = loada(blk.data_r + i);
248			haccumulate(pp_avg_rgba[0], data_r, p0_mask);
249
250			vfloat data_g = loada(blk.data_g + i);
251			haccumulate(pp_avg_rgba[1], data_g, p0_mask);
252
253			vfloat data_b = loada(blk.data_b + i);
254			haccumulate(pp_avg_rgba[2], data_b, p0_mask);
255
256			vfloat data_a = loada(blk.data_a + i);
257			haccumulate(pp_avg_rgba[3], data_a, p0_mask);
258		}
259
260		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
261
262		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
263		                           hadd_s(pp_avg_rgba[1]),
264		                           hadd_s(pp_avg_rgba[2]),
265		                           hadd_s(pp_avg_rgba[3]));
266
267		vfloat4 p1_total = block_total - p0_total;
268
269		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
270		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
271	}
272	// For 3 partitions scan results for partition 0/1, compute partition 2
273	else if (partition_count == 3)
274	{
275		vfloat4 pp_avg_rgba[2][4] {};
276
277		vint lane_id = vint::lane_id();
278		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
279		{
280			vint texel_partition(pi.partition_of_texel + i);
281
282			vmask lane_mask = lane_id < vint(texel_count);
283			lane_id += vint(ASTCENC_SIMD_WIDTH);
284
285			vmask p0_mask = lane_mask & (texel_partition == vint(0));
286			vmask p1_mask = lane_mask & (texel_partition == vint(1));
287
288			vfloat data_r = loada(blk.data_r + i);
289			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
290			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
291
292			vfloat data_g = loada(blk.data_g + i);
293			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
294			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
295
296			vfloat data_b = loada(blk.data_b + i);
297			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
298			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
299
300			vfloat data_a = loada(blk.data_a + i);
301			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
302			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
303		}
304
305		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
306
307		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
308		                           hadd_s(pp_avg_rgba[0][1]),
309		                           hadd_s(pp_avg_rgba[0][2]),
310		                           hadd_s(pp_avg_rgba[0][3]));
311
312		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
313		                           hadd_s(pp_avg_rgba[1][1]),
314		                           hadd_s(pp_avg_rgba[1][2]),
315		                           hadd_s(pp_avg_rgba[1][3]));
316
317		vfloat4 p2_total = block_total - p0_total - p1_total;
318
319		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
320		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
321		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
322	}
323	else
324	{
325		// For 4 partitions scan results for partition 0/1/2, compute partition 3
326		vfloat4 pp_avg_rgba[3][4] {};
327
328		vint lane_id = vint::lane_id();
329		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
330		{
331			vint texel_partition(pi.partition_of_texel + i);
332
333			vmask lane_mask = lane_id < vint(texel_count);
334			lane_id += vint(ASTCENC_SIMD_WIDTH);
335
336			vmask p0_mask = lane_mask & (texel_partition == vint(0));
337			vmask p1_mask = lane_mask & (texel_partition == vint(1));
338			vmask p2_mask = lane_mask & (texel_partition == vint(2));
339
340			vfloat data_r = loada(blk.data_r + i);
341			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
342			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
343			haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
344
345			vfloat data_g = loada(blk.data_g + i);
346			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
347			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
348			haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
349
350			vfloat data_b = loada(blk.data_b + i);
351			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
352			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
353			haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
354
355			vfloat data_a = loada(blk.data_a + i);
356			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
357			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
358			haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
359		}
360
361		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
362
363		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
364		                           hadd_s(pp_avg_rgba[0][1]),
365		                           hadd_s(pp_avg_rgba[0][2]),
366		                           hadd_s(pp_avg_rgba[0][3]));
367
368		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
369		                           hadd_s(pp_avg_rgba[1][1]),
370		                           hadd_s(pp_avg_rgba[1][2]),
371		                           hadd_s(pp_avg_rgba[1][3]));
372
373		vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
374		                           hadd_s(pp_avg_rgba[2][1]),
375		                           hadd_s(pp_avg_rgba[2][2]),
376		                           hadd_s(pp_avg_rgba[2][3]));
377
378		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
379
380		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
381		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
382		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
383		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
384	}
385}
386
387/* See header for documentation. */
388void compute_avgs_and_dirs_4_comp(
389	const partition_info& pi,
390	const image_block& blk,
391	partition_metrics pm[BLOCK_MAX_PARTITIONS]
392) {
393	int partition_count = pi.partition_count;
394	promise(partition_count > 0);
395
396	// Pre-compute partition_averages
397	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
398	compute_partition_averages_rgba(pi, blk, partition_averages);
399
400	for (int partition = 0; partition < partition_count; partition++)
401	{
402		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
403		unsigned int texel_count = pi.partition_texel_count[partition];
404		promise(texel_count > 0);
405
406		vfloat4 average = partition_averages[partition];
407		pm[partition].avg = average;
408
409		vfloat4 sum_xp = vfloat4::zero();
410		vfloat4 sum_yp = vfloat4::zero();
411		vfloat4 sum_zp = vfloat4::zero();
412		vfloat4 sum_wp = vfloat4::zero();
413
414		for (unsigned int i = 0; i < texel_count; i++)
415		{
416			unsigned int iwt = texel_indexes[i];
417			vfloat4 texel_datum = blk.texel(iwt);
418			texel_datum = texel_datum - average;
419
420			vfloat4 zero = vfloat4::zero();
421
422			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
423			sum_xp += select(zero, texel_datum, tdm0);
424
425			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
426			sum_yp += select(zero, texel_datum, tdm1);
427
428			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
429			sum_zp += select(zero, texel_datum, tdm2);
430
431			vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
432			sum_wp += select(zero, texel_datum, tdm3);
433		}
434
435		vfloat4 prod_xp = dot(sum_xp, sum_xp);
436		vfloat4 prod_yp = dot(sum_yp, sum_yp);
437		vfloat4 prod_zp = dot(sum_zp, sum_zp);
438		vfloat4 prod_wp = dot(sum_wp, sum_wp);
439
440		vfloat4 best_vector = sum_xp;
441		vfloat4 best_sum = prod_xp;
442
443		vmask4 mask = prod_yp > best_sum;
444		best_vector = select(best_vector, sum_yp, mask);
445		best_sum = select(best_sum, prod_yp, mask);
446
447		mask = prod_zp > best_sum;
448		best_vector = select(best_vector, sum_zp, mask);
449		best_sum = select(best_sum, prod_zp, mask);
450
451		mask = prod_wp > best_sum;
452		best_vector = select(best_vector, sum_wp, mask);
453
454		pm[partition].dir = best_vector;
455	}
456}
457
458/* See header for documentation. */
459void compute_avgs_and_dirs_3_comp(
460	const partition_info& pi,
461	const image_block& blk,
462	unsigned int omitted_component,
463	partition_metrics pm[BLOCK_MAX_PARTITIONS]
464) {
465	// Pre-compute partition_averages
466	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
467	compute_partition_averages_rgba(pi, blk, partition_averages);
468
469	const float* data_vr = blk.data_r;
470	const float* data_vg = blk.data_g;
471	const float* data_vb = blk.data_b;
472
473	// TODO: Data-driven permute would be useful to avoid this ...
474	if (omitted_component == 0)
475	{
476		partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
477		partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
478		partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
479		partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
480
481		data_vr = blk.data_g;
482		data_vg = blk.data_b;
483		data_vb = blk.data_a;
484	}
485	else if (omitted_component == 1)
486	{
487		partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
488		partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
489		partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
490		partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
491
492		data_vg = blk.data_b;
493		data_vb = blk.data_a;
494	}
495	else if (omitted_component == 2)
496	{
497		partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
498		partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
499		partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
500		partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
501
502		data_vb = blk.data_a;
503	}
504	else
505	{
506		partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
507		partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
508		partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
509		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
510	}
511
512	unsigned int partition_count = pi.partition_count;
513	promise(partition_count > 0);
514
515	for (unsigned int partition = 0; partition < partition_count; partition++)
516	{
517		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
518		unsigned int texel_count = pi.partition_texel_count[partition];
519		promise(texel_count > 0);
520
521		vfloat4 average = partition_averages[partition];
522		pm[partition].avg = average;
523
524		vfloat4 sum_xp = vfloat4::zero();
525		vfloat4 sum_yp = vfloat4::zero();
526		vfloat4 sum_zp = vfloat4::zero();
527
528		for (unsigned int i = 0; i < texel_count; i++)
529		{
530			unsigned int iwt = texel_indexes[i];
531
532			vfloat4 texel_datum = vfloat3(data_vr[iwt],
533			                              data_vg[iwt],
534			                              data_vb[iwt]);
535			texel_datum = texel_datum - average;
536
537			vfloat4 zero = vfloat4::zero();
538
539			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
540			sum_xp += select(zero, texel_datum, tdm0);
541
542			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
543			sum_yp += select(zero, texel_datum, tdm1);
544
545			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
546			sum_zp += select(zero, texel_datum, tdm2);
547		}
548
549		vfloat4 prod_xp = dot(sum_xp, sum_xp);
550		vfloat4 prod_yp = dot(sum_yp, sum_yp);
551		vfloat4 prod_zp = dot(sum_zp, sum_zp);
552
553		vfloat4 best_vector = sum_xp;
554		vfloat4 best_sum = prod_xp;
555
556		vmask4 mask = prod_yp > best_sum;
557		best_vector = select(best_vector, sum_yp, mask);
558		best_sum = select(best_sum, prod_yp, mask);
559
560		mask = prod_zp > best_sum;
561		best_vector = select(best_vector, sum_zp, mask);
562
563		pm[partition].dir = best_vector;
564	}
565}
566
567/* See header for documentation. */
568void compute_avgs_and_dirs_3_comp_rgb(
569	const partition_info& pi,
570	const image_block& blk,
571	partition_metrics pm[BLOCK_MAX_PARTITIONS]
572) {
573	unsigned int partition_count = pi.partition_count;
574	promise(partition_count > 0);
575
576	// Pre-compute partition_averages
577	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
578	compute_partition_averages_rgb(pi, blk, partition_averages);
579
580	for (unsigned int partition = 0; partition < partition_count; partition++)
581	{
582		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
583		unsigned int texel_count = pi.partition_texel_count[partition];
584		promise(texel_count > 0);
585
586		vfloat4 average = partition_averages[partition];
587		pm[partition].avg = average;
588
589		vfloat4 sum_xp = vfloat4::zero();
590		vfloat4 sum_yp = vfloat4::zero();
591		vfloat4 sum_zp = vfloat4::zero();
592
593		for (unsigned int i = 0; i < texel_count; i++)
594		{
595			unsigned int iwt = texel_indexes[i];
596
597			vfloat4 texel_datum = blk.texel3(iwt);
598			texel_datum = texel_datum - average;
599
600			vfloat4 zero = vfloat4::zero();
601
602			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
603			sum_xp += select(zero, texel_datum, tdm0);
604
605			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
606			sum_yp += select(zero, texel_datum, tdm1);
607
608			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
609			sum_zp += select(zero, texel_datum, tdm2);
610		}
611
612		vfloat4 prod_xp = dot(sum_xp, sum_xp);
613		vfloat4 prod_yp = dot(sum_yp, sum_yp);
614		vfloat4 prod_zp = dot(sum_zp, sum_zp);
615
616		vfloat4 best_vector = sum_xp;
617		vfloat4 best_sum = prod_xp;
618
619		vmask4 mask = prod_yp > best_sum;
620		best_vector = select(best_vector, sum_yp, mask);
621		best_sum = select(best_sum, prod_yp, mask);
622
623		mask = prod_zp > best_sum;
624		best_vector = select(best_vector, sum_zp, mask);
625
626		pm[partition].dir = best_vector;
627	}
628}
629
630/* See header for documentation. */
631void compute_avgs_and_dirs_2_comp(
632	const partition_info& pt,
633	const image_block& blk,
634	unsigned int component1,
635	unsigned int component2,
636	partition_metrics pm[BLOCK_MAX_PARTITIONS]
637) {
638	vfloat4 average;
639
640	const float* data_vr = nullptr;
641	const float* data_vg = nullptr;
642
643	if (component1 == 0 && component2 == 1)
644	{
645		average = blk.data_mean.swz<0, 1>();
646
647		data_vr = blk.data_r;
648		data_vg = blk.data_g;
649	}
650	else if (component1 == 0 && component2 == 2)
651	{
652		average = blk.data_mean.swz<0, 2>();
653
654		data_vr = blk.data_r;
655		data_vg = blk.data_b;
656	}
657	else // (component1 == 1 && component2 == 2)
658	{
659		assert(component1 == 1 && component2 == 2);
660
661		average = blk.data_mean.swz<1, 2>();
662
663		data_vr = blk.data_g;
664		data_vg = blk.data_b;
665	}
666
667	unsigned int partition_count = pt.partition_count;
668	promise(partition_count > 0);
669
670	for (unsigned int partition = 0; partition < partition_count; partition++)
671	{
672		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
673		unsigned int texel_count = pt.partition_texel_count[partition];
674		promise(texel_count > 0);
675
676		// Only compute a partition mean if more than one partition
677		if (partition_count > 1)
678		{
679			average = vfloat4::zero();
680			for (unsigned int i = 0; i < texel_count; i++)
681			{
682				unsigned int iwt = texel_indexes[i];
683				average += vfloat2(data_vr[iwt], data_vg[iwt]);
684			}
685
686			average = average / static_cast<float>(texel_count);
687		}
688
689		pm[partition].avg = average;
690
691		vfloat4 sum_xp = vfloat4::zero();
692		vfloat4 sum_yp = vfloat4::zero();
693
694		for (unsigned int i = 0; i < texel_count; i++)
695		{
696			unsigned int iwt = texel_indexes[i];
697			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
698			texel_datum = texel_datum - average;
699
700			vfloat4 zero = vfloat4::zero();
701
702			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
703			sum_xp += select(zero, texel_datum, tdm0);
704
705			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
706			sum_yp += select(zero, texel_datum, tdm1);
707		}
708
709		vfloat4 prod_xp = dot(sum_xp, sum_xp);
710		vfloat4 prod_yp = dot(sum_yp, sum_yp);
711
712		vfloat4 best_vector = sum_xp;
713		vfloat4 best_sum = prod_xp;
714
715		vmask4 mask = prod_yp > best_sum;
716		best_vector = select(best_vector, sum_yp, mask);
717
718		pm[partition].dir = best_vector;
719	}
720}
721
722/* See header for documentation. */
723void compute_error_squared_rgba(
724	const partition_info& pi,
725	const image_block& blk,
726	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
727	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
728	float line_lengths[BLOCK_MAX_PARTITIONS],
729	float& uncor_error,
730	float& samec_error
731) {
732	unsigned int partition_count = pi.partition_count;
733	promise(partition_count > 0);
734
735	vfloatacc uncor_errorsumv = vfloatacc::zero();
736	vfloatacc samec_errorsumv = vfloatacc::zero();
737
738	for (unsigned int partition = 0; partition < partition_count; partition++)
739	{
740		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
741
742		processed_line4 l_uncor = uncor_plines[partition];
743		processed_line4 l_samec = samec_plines[partition];
744
745		unsigned int texel_count = pi.partition_texel_count[partition];
746		promise(texel_count > 0);
747
748		// Vectorize some useful scalar inputs
749		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
750		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
751		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
752		vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
753
754		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
755		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
756		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
757		vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
758
759		vfloat l_samec_bs0(l_samec.bs.lane<0>());
760		vfloat l_samec_bs1(l_samec.bs.lane<1>());
761		vfloat l_samec_bs2(l_samec.bs.lane<2>());
762		vfloat l_samec_bs3(l_samec.bs.lane<3>());
763
764		assert(all(l_samec.amod == vfloat4(0.0f)));
765
766		vfloat uncor_loparamv(1e10f);
767		vfloat uncor_hiparamv(-1e10f);
768
769		vfloat ew_r(blk.channel_weight.lane<0>());
770		vfloat ew_g(blk.channel_weight.lane<1>());
771		vfloat ew_b(blk.channel_weight.lane<2>());
772		vfloat ew_a(blk.channel_weight.lane<3>());
773
774		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
775		// array to extend the last value. This means min/max are not impacted, but we need to mask
776		// out the dummy values when we compute the line weighting.
777		vint lane_ids = vint::lane_id();
778		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
779		{
780			vmask mask = lane_ids < vint(texel_count);
781			vint texel_idxs(texel_indexes + i);
782
783			vfloat data_r = gatherf(blk.data_r, texel_idxs);
784			vfloat data_g = gatherf(blk.data_g, texel_idxs);
785			vfloat data_b = gatherf(blk.data_b, texel_idxs);
786			vfloat data_a = gatherf(blk.data_a, texel_idxs);
787
788			vfloat uncor_param = (data_r * l_uncor_bs0)
789			                   + (data_g * l_uncor_bs1)
790			                   + (data_b * l_uncor_bs2)
791			                   + (data_a * l_uncor_bs3);
792
793			uncor_loparamv = min(uncor_param, uncor_loparamv);
794			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
795
796			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
797			                   + (uncor_param * l_uncor_bs0);
798			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
799			                   + (uncor_param * l_uncor_bs1);
800			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
801			                   + (uncor_param * l_uncor_bs2);
802			vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
803			                   + (uncor_param * l_uncor_bs3);
804
805			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
806			                 + (ew_g * uncor_dist1 * uncor_dist1)
807			                 + (ew_b * uncor_dist2 * uncor_dist2)
808			                 + (ew_a * uncor_dist3 * uncor_dist3);
809
810			haccumulate(uncor_errorsumv, uncor_err, mask);
811
812			// Process samechroma data
813			vfloat samec_param = (data_r * l_samec_bs0)
814			                   + (data_g * l_samec_bs1)
815			                   + (data_b * l_samec_bs2)
816			                   + (data_a * l_samec_bs3);
817
818			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
819			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
820			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
821			vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
822
823			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
824			                 + (ew_g * samec_dist1 * samec_dist1)
825			                 + (ew_b * samec_dist2 * samec_dist2)
826			                 + (ew_a * samec_dist3 * samec_dist3);
827
828			haccumulate(samec_errorsumv, samec_err, mask);
829
830			lane_ids += vint(ASTCENC_SIMD_WIDTH);
831		}
832
833		// Turn very small numbers and NaNs into a small number
834		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
835		line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
836	}
837
838	uncor_error = hadd_s(uncor_errorsumv);
839	samec_error = hadd_s(samec_errorsumv);
840}
841
842/* See header for documentation. */
843void compute_error_squared_rgb(
844	const partition_info& pi,
845	const image_block& blk,
846	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
847	float& uncor_error,
848	float& samec_error
849) {
850	unsigned int partition_count = pi.partition_count;
851	promise(partition_count > 0);
852
853	vfloatacc uncor_errorsumv = vfloatacc::zero();
854	vfloatacc samec_errorsumv = vfloatacc::zero();
855
856	for (unsigned int partition = 0; partition < partition_count; partition++)
857	{
858		partition_lines3& pl = plines[partition];
859		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
860		unsigned int texel_count = pi.partition_texel_count[partition];
861		promise(texel_count > 0);
862
863		processed_line3 l_uncor = pl.uncor_pline;
864		processed_line3 l_samec = pl.samec_pline;
865
866		// Vectorize some useful scalar inputs
867		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
868		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
869		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
870
871		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
872		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
873		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
874
875		vfloat l_samec_bs0(l_samec.bs.lane<0>());
876		vfloat l_samec_bs1(l_samec.bs.lane<1>());
877		vfloat l_samec_bs2(l_samec.bs.lane<2>());
878
879		assert(all(l_samec.amod == vfloat4(0.0f)));
880
881		vfloat uncor_loparamv(1e10f);
882		vfloat uncor_hiparamv(-1e10f);
883
884		vfloat ew_r(blk.channel_weight.lane<0>());
885		vfloat ew_g(blk.channel_weight.lane<1>());
886		vfloat ew_b(blk.channel_weight.lane<2>());
887
888		// This implementation over-shoots, but this is safe as we initialize the weights array
889		// to extend the last value. This means min/max are not impacted, but we need to mask
890		// out the dummy values when we compute the line weighting.
891		vint lane_ids = vint::lane_id();
892		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
893		{
894			vmask mask = lane_ids < vint(texel_count);
895			vint texel_idxs(texel_indexes + i);
896
897			vfloat data_r = gatherf(blk.data_r, texel_idxs);
898			vfloat data_g = gatherf(blk.data_g, texel_idxs);
899			vfloat data_b = gatherf(blk.data_b, texel_idxs);
900
901			vfloat uncor_param = (data_r * l_uncor_bs0)
902			                   + (data_g * l_uncor_bs1)
903			                   + (data_b * l_uncor_bs2);
904
905			uncor_loparamv = min(uncor_param, uncor_loparamv);
906			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
907
908			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
909			                   + (uncor_param * l_uncor_bs0);
910			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
911			                   + (uncor_param * l_uncor_bs1);
912			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
913			                   + (uncor_param * l_uncor_bs2);
914
915			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
916			                 + (ew_g * uncor_dist1 * uncor_dist1)
917			                 + (ew_b * uncor_dist2 * uncor_dist2);
918
919			haccumulate(uncor_errorsumv, uncor_err, mask);
920
921			// Process samechroma data
922			vfloat samec_param = (data_r * l_samec_bs0)
923			                   + (data_g * l_samec_bs1)
924			                   + (data_b * l_samec_bs2);
925
926			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
927			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
928			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
929
930			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
931			                 + (ew_g * samec_dist1 * samec_dist1)
932			                 + (ew_b * samec_dist2 * samec_dist2);
933
934			haccumulate(samec_errorsumv, samec_err, mask);
935
936			lane_ids += vint(ASTCENC_SIMD_WIDTH);
937		}
938
939		// Turn very small numbers and NaNs into a small number
940		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
941		pl.line_length = astc::max(uncor_linelen, 1e-7f);
942	}
943
944	uncor_error = hadd_s(uncor_errorsumv);
945	samec_error = hadd_s(samec_errorsumv);
946}
947
948#endif
949