1// SPDX-License-Identifier: Apache-2.0
2// ----------------------------------------------------------------------------
3// Copyright 2011-2023 Arm Limited
4//
5// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6// use this file except in compliance with the License. You may obtain a copy
7// of the License at:
8//
9//     http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14// License for the specific language governing permissions and limitations
15// under the License.
16// ----------------------------------------------------------------------------
17
18#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20/**
21 * @brief Functions for finding best partition for a block.
22 *
23 * The partition search operates in two stages. The first pass uses kmeans clustering to group
24 * texels into an ideal partitioning for the requested partition count, and then compares that
25 * against the 1024 partitionings generated by the ASTC partition hash function. The generated
26 * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
27 * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
28 * partitionings that actually generate fewer than the requested partition count, but only the top
29 * N candidates are actually put through a more detailed search. N is determined by the compressor
30 * quality preset.
31 *
32 * For the detailed search, each candidate is checked against two possible encoding methods:
33 *
34 *   - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
35 *   - The best partitioning assuming same chroma colors (RGB + scale endpoints).
36 *
37 * This is implemented by computing the compute mean color and dominant direction for each
38 * partition. This defines two lines, both of which go through the mean color value.
39 *
40 * - One line has a direction defined by the dominant direction; this is used to assess the error
41 *   from using an uncorrelated color representation.
42 * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
43 *   (RGB + scale) color representation.
44 *
45 * The best candidate is selected by computing the squared-errors that result from using these
46 * lines for endpoint selection.
47 */
48
49#include <limits>
50#include "astcenc_internal.h"
51
52/**
53 * @brief Pick some initial kmeans cluster centers.
54 *
55 * @param      blk               The image block color data to compress.
56 * @param      texel_count       The number of texels in the block.
57 * @param      partition_count   The number of partitions in the block.
58 * @param[out] cluster_centers   The initial partition cluster center colors.
59 */
60static void kmeans_init(
61	const image_block& blk,
62	unsigned int texel_count,
63	unsigned int partition_count,
64	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
65) {
66	promise(texel_count > 0);
67	promise(partition_count > 0);
68
69	unsigned int clusters_selected = 0;
70	float distances[BLOCK_MAX_TEXELS];
71
72	// Pick a random sample as first cluster center; 145897 from random.org
73	unsigned int sample = 145897 % texel_count;
74	vfloat4 center_color = blk.texel(sample);
75	cluster_centers[clusters_selected] = center_color;
76	clusters_selected++;
77
78	// Compute the distance to the first cluster center
79	float distance_sum = 0.0f;
80	for (unsigned int i = 0; i < texel_count; i++)
81	{
82		vfloat4 color = blk.texel(i);
83		vfloat4 diff = color - center_color;
84		float distance = dot_s(diff * diff, blk.channel_weight);
85		distance_sum += distance;
86		distances[i] = distance;
87	}
88
89	// More numbers from random.org for weighted-random center selection
90	const float cluster_cutoffs[9] {
91		0.626220f, 0.932770f, 0.275454f,
92		0.318558f, 0.240113f, 0.009190f,
93		0.347661f, 0.731960f, 0.156391f
94	};
95
96	unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
97
98	// Pick the remaining samples as needed
99	while (true)
100	{
101		// Pick the next center in a weighted-random fashion.
102		float summa = 0.0f;
103		float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
104		for (sample = 0; sample < texel_count; sample++)
105		{
106			summa += distances[sample];
107			if (summa >= distance_cutoff)
108			{
109				break;
110			}
111		}
112
113		// Clamp to a valid range and store the selected cluster center
114		sample = astc::min(sample, texel_count - 1);
115
116		center_color = blk.texel(sample);
117		cluster_centers[clusters_selected++] = center_color;
118		if (clusters_selected >= partition_count)
119		{
120			break;
121		}
122
123		// Compute the distance to the new cluster center, keep the min dist
124		distance_sum = 0.0f;
125		for (unsigned int i = 0; i < texel_count; i++)
126		{
127			vfloat4 color = blk.texel(i);
128			vfloat4 diff = color - center_color;
129			float distance = dot_s(diff * diff, blk.channel_weight);
130			distance = astc::min(distance, distances[i]);
131			distance_sum += distance;
132			distances[i] = distance;
133		}
134	}
135}
136
137/**
138 * @brief Assign texels to clusters, based on a set of chosen center points.
139 *
140 * @param      blk                  The image block color data to compress.
141 * @param      texel_count          The number of texels in the block.
142 * @param      partition_count      The number of partitions in the block.
143 * @param      cluster_centers      The partition cluster center colors.
144 * @param[out] partition_of_texel   The partition assigned for each texel.
145 */
146static void kmeans_assign(
147	const image_block& blk,
148	unsigned int texel_count,
149	unsigned int partition_count,
150	const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
151	uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
152) {
153	promise(texel_count > 0);
154	promise(partition_count > 0);
155
156	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
157
158	// Find the best partition for every texel
159	for (unsigned int i = 0; i < texel_count; i++)
160	{
161		float best_distance = std::numeric_limits<float>::max();
162		unsigned int best_partition = 0;
163
164		vfloat4 color = blk.texel(i);
165		for (unsigned int j = 0; j < partition_count; j++)
166		{
167			vfloat4 diff = color - cluster_centers[j];
168			float distance = dot_s(diff * diff, blk.channel_weight);
169			if (distance < best_distance)
170			{
171				best_distance = distance;
172				best_partition = j;
173			}
174		}
175
176		partition_of_texel[i] = static_cast<uint8_t>(best_partition);
177		partition_texel_count[best_partition]++;
178	}
179
180	// It is possible to get a situation where a partition ends up without any texels. In this case,
181	// assign texel N to partition N. This is silly, but ensures that every partition retains at
182	// least one texel. Reassigning a texel in this manner may cause another partition to go empty,
183	// so if we actually did a reassignment, run the whole loop over again.
184	bool problem_case;
185	do
186	{
187		problem_case = false;
188		for (unsigned int i = 0; i < partition_count; i++)
189		{
190			if (partition_texel_count[i] == 0)
191			{
192				partition_texel_count[partition_of_texel[i]]--;
193				partition_texel_count[i]++;
194				partition_of_texel[i] = static_cast<uint8_t>(i);
195				problem_case = true;
196			}
197		}
198	} while (problem_case);
199}
200
201/**
202 * @brief Compute new cluster centers based on their center of gravity.
203 *
204 * @param       blk                  The image block color data to compress.
205 * @param       texel_count          The number of texels in the block.
206 * @param       partition_count      The number of partitions in the block.
207 * @param[out]  cluster_centers      The new cluster center colors.
208 * @param       partition_of_texel   The partition assigned for each texel.
209 */
210static void kmeans_update(
211	const image_block& blk,
212	unsigned int texel_count,
213	unsigned int partition_count,
214	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
215	const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
216) {
217	promise(texel_count > 0);
218	promise(partition_count > 0);
219
220	vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
221		vfloat4::zero(),
222		vfloat4::zero(),
223		vfloat4::zero(),
224		vfloat4::zero()
225	};
226
227	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
228
229	// Find the center-of-gravity in each cluster
230	for (unsigned int i = 0; i < texel_count; i++)
231	{
232		uint8_t partition = partition_of_texel[i];
233		color_sum[partition] += blk.texel(i);
234		partition_texel_count[partition]++;
235	}
236
237	// Set the center of gravity to be the new cluster center
238	for (unsigned int i = 0; i < partition_count; i++)
239	{
240		float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
241		cluster_centers[i] = color_sum[i] * scale;
242	}
243}
244
245/**
246 * @brief Compute bit-mismatch for partitioning in 2-partition mode.
247 *
248 * @param a   The texel assignment bitvector for the block.
249 * @param b   The texel assignment bitvector for the partition table.
250 *
251 * @return    The number of bit mismatches.
252 */
253#if ASTCENC_NEON != 0
254static inline uint8_t partition_mismatch2(
255	const uint64_t a[2],
256	const uint64_t b[2]
257) {
258	uint64x2_t a01 = vld1q_u64(a);
259	uint64x2_t b01 = vld1q_u64(b);
260	uint64x2_t b10 = vextq_u64(b01, b01, 1);
261	uint8_t c1 = popcount(veorq_u64(a01, b01));
262	uint8_t c2 = popcount(veorq_u64(a01, b10));
263	return static_cast<uint8_t>(astc::min(c1, c2) / 2);    // 2 is the number of partitions
264}
265#else
266static inline uint8_t partition_mismatch2(
267	const uint64_t a[2],
268	const uint64_t b[2]
269) {
270	int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
271	int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
272
273	// Divide by 2 because XOR always counts errors twice, once when missing
274	// in the expected position, and again when present in the wrong partition
275	return static_cast<uint8_t>(astc::min(v1, v2) / 2);
276}
277#endif
278
279/**
280 * @brief Compute bit-mismatch for partitioning in 3-partition mode.
281 *
282 * @param a   The texel assignment bitvector for the block.
283 * @param b   The texel assignment bitvector for the partition table.
284 *
285 * @return    The number of bit mismatches.
286 */
287static inline uint8_t partition_mismatch3(
288	const uint64_t a[3],
289	const uint64_t b[3]
290) {
291	int p00 = popcount(a[0] ^ b[0]);
292	int p01 = popcount(a[0] ^ b[1]);
293	int p02 = popcount(a[0] ^ b[2]);
294
295	int p10 = popcount(a[1] ^ b[0]);
296	int p11 = popcount(a[1] ^ b[1]);
297	int p12 = popcount(a[1] ^ b[2]);
298
299	int p20 = popcount(a[2] ^ b[0]);
300	int p21 = popcount(a[2] ^ b[1]);
301	int p22 = popcount(a[2] ^ b[2]);
302
303	int s0 = p11 + p22;
304	int s1 = p12 + p21;
305	int v0 = astc::min(s0, s1) + p00;
306
307	int s2 = p10 + p22;
308	int s3 = p12 + p20;
309	int v1 = astc::min(s2, s3) + p01;
310
311	int s4 = p10 + p21;
312	int s5 = p11 + p20;
313	int v2 = astc::min(s4, s5) + p02;
314
315	// Divide by 2 because XOR always counts errors twice, once when missing
316	// in the expected position, and again when present in the wrong partition
317	return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
318}
319
320/**
321 * @brief Compute bit-mismatch for partitioning in 4-partition mode.
322 *
323 * @param a   The texel assignment bitvector for the block.
324 * @param b   The texel assignment bitvector for the partition table.
325 *
326 * @return    The number of bit mismatches.
327 */
328static inline uint8_t partition_mismatch4(
329	const uint64_t a[4],
330	const uint64_t b[4]
331) {
332	int p00 = popcount(a[0] ^ b[0]);
333	int p01 = popcount(a[0] ^ b[1]);
334	int p02 = popcount(a[0] ^ b[2]);
335	int p03 = popcount(a[0] ^ b[3]);
336
337	int p10 = popcount(a[1] ^ b[0]);
338	int p11 = popcount(a[1] ^ b[1]);
339	int p12 = popcount(a[1] ^ b[2]);
340	int p13 = popcount(a[1] ^ b[3]);
341
342	int p20 = popcount(a[2] ^ b[0]);
343	int p21 = popcount(a[2] ^ b[1]);
344	int p22 = popcount(a[2] ^ b[2]);
345	int p23 = popcount(a[2] ^ b[3]);
346
347	int p30 = popcount(a[3] ^ b[0]);
348	int p31 = popcount(a[3] ^ b[1]);
349	int p32 = popcount(a[3] ^ b[2]);
350	int p33 = popcount(a[3] ^ b[3]);
351
352	int mx23 = astc::min(p22 + p33, p23 + p32);
353	int mx13 = astc::min(p21 + p33, p23 + p31);
354	int mx12 = astc::min(p21 + p32, p22 + p31);
355	int mx03 = astc::min(p20 + p33, p23 + p30);
356	int mx02 = astc::min(p20 + p32, p22 + p30);
357	int mx01 = astc::min(p21 + p30, p20 + p31);
358
359	int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
360	int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
361	int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
362	int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
363
364	// Divide by 2 because XOR always counts errors twice, once when missing
365	// in the expected position, and again when present in the wrong partition
366	return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
367}
368
369using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
370
371/**
372 * @brief Count the partition table mismatches vs the data clustering.
373 *
374 * @param      bsd               The block size information.
375 * @param      partition_count   The number of partitions in the block.
376 * @param      bitmaps           The block texel partition assignment patterns.
377 * @param[out] mismatch_counts   The array storing per partitioning mismatch counts.
378 */
379static void count_partition_mismatch_bits(
380	const block_size_descriptor& bsd,
381	unsigned int partition_count,
382	const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
383	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
384) {
385	unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
386	promise(active_count > 0);
387
388	if (partition_count == 2)
389	{
390		for (unsigned int i = 0; i < active_count; i++)
391		{
392			mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
393			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
394			assert(mismatch_counts[i] < bsd.texel_count);
395		}
396	}
397	else if (partition_count == 3)
398	{
399		for (unsigned int i = 0; i < active_count; i++)
400		{
401			mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
402			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
403			assert(mismatch_counts[i] < bsd.texel_count);
404		}
405	}
406	else
407	{
408		for (unsigned int i = 0; i < active_count; i++)
409		{
410			mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
411			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
412			assert(mismatch_counts[i] < bsd.texel_count);
413		}
414	}
415}
416
417/**
418 * @brief Use counting sort on the mismatch array to sort partition candidates.
419 *
420 * @param      partitioning_count   The number of packed partitionings.
421 * @param      mismatch_count       Partitioning mismatch counts, in index order.
422 * @param[out] partition_ordering   Partition index values, in mismatch order.
423 *
424 * @return The number of active partitions in this selection.
425 */
426static unsigned int get_partition_ordering_by_mismatch_bits(
427	unsigned int texel_count,
428	unsigned int partitioning_count,
429	const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
430	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
431) {
432	promise(partitioning_count > 0);
433	uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
434
435	// Create the histogram of mismatch counts
436	for (unsigned int i = 0; i < partitioning_count; i++)
437	{
438		mscount[mismatch_count[i]]++;
439	}
440
441	// Create a running sum from the histogram array
442	// Cells store previous values only; i.e. exclude self after sum
443	unsigned int sum = 0;
444	for (unsigned int i = 0; i < texel_count; i++)
445	{
446		uint16_t cnt = mscount[i];
447		mscount[i] = sum;
448		sum += cnt;
449	}
450
451	// Use the running sum as the index, incrementing after read to allow
452	// sequential entries with the same count
453	for (unsigned int i = 0; i < partitioning_count; i++)
454	{
455		unsigned int idx = mscount[mismatch_count[i]]++;
456		partition_ordering[idx] = static_cast<uint16_t>(i);
457	}
458
459	return partitioning_count;
460}
461
462/**
463 * @brief Use k-means clustering to compute a partition ordering for a block..
464 *
465 * @param      bsd                  The block size information.
466 * @param      blk                  The image block color data to compress.
467 * @param      partition_count      The desired number of partitions in the block.
468 * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
469 *
470 * @return The number of active partitionings in this selection.
471 */
472static unsigned int compute_kmeans_partition_ordering(
473	const block_size_descriptor& bsd,
474	const image_block& blk,
475	unsigned int partition_count,
476	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
477) {
478	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
479	uint8_t texel_partitions[BLOCK_MAX_TEXELS];
480
481	// Use three passes of k-means clustering to partition the block data
482	for (unsigned int i = 0; i < 3; i++)
483	{
484		if (i == 0)
485		{
486			kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
487		}
488		else
489		{
490			kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
491		}
492
493		kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
494	}
495
496	// Construct the block bitmaps of texel assignments to each partition
497	uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
498	unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
499	promise(texels_to_process > 0);
500	for (unsigned int i = 0; i < texels_to_process; i++)
501	{
502		unsigned int idx = bsd.kmeans_texels[i];
503		bitmaps[texel_partitions[idx]] |= 1ULL << i;
504	}
505
506	// Count the mismatch between the block and the format's partition tables
507	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
508	count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
509
510	// Sort the partitions based on the number of mismatched bits
511	return get_partition_ordering_by_mismatch_bits(
512	    texels_to_process,
513	    bsd.partitioning_count_selected[partition_count - 1],
514	    mismatch_counts, partition_ordering);
515}
516
517/**
518 * @brief Insert a partitioning into an order list of results, sorted by error.
519 *
520 * @param      max_values      The max number of entries in the best result arrays.
521 * @param      this_error      The error of the new entry.
522 * @param      this_partition  The partition ID of the new entry.
523 * @param[out] best_errors     The array of best error values.
524 * @param[out] best_partitions The array of best partition values.
525 */
526static void insert_result(
527	unsigned int max_values,
528	float this_error,
529	unsigned int this_partition,
530	float* best_errors,
531	unsigned int* best_partitions)
532{
533	promise(max_values > 0);
534
535	// Don't bother searching if the current worst error beats the new error
536	if (this_error >= best_errors[max_values - 1])
537	{
538		return;
539	}
540
541	// Else insert into the list in error-order
542	for (unsigned int i = 0; i < max_values; i++)
543	{
544		// Existing result is better - move on ...
545		if (this_error > best_errors[i])
546		{
547			continue;
548		}
549
550		// Move existing results down one
551		for (unsigned int j = max_values - 1; j > i; j--)
552		{
553			best_errors[j] = best_errors[j - 1];
554			best_partitions[j] = best_partitions[j - 1];
555		}
556
557		// Insert new result
558		best_errors[i] = this_error;
559		best_partitions[i] = this_partition;
560		break;
561	}
562}
563
564/* See header for documentation. */
565unsigned int find_best_partition_candidates(
566	const block_size_descriptor& bsd,
567	const image_block& blk,
568	unsigned int partition_count,
569	unsigned int partition_search_limit,
570	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
571	unsigned int requested_candidates
572) {
573	// Constant used to estimate quantization error for a given partitioning; the optimal value for
574	// this depends on bitrate. These values have been determined empirically.
575	unsigned int texels_per_block = bsd.texel_count;
576	float weight_imprecision_estim = 0.055f;
577	if (texels_per_block <= 20)
578	{
579		weight_imprecision_estim = 0.03f;
580	}
581	else if (texels_per_block <= 31)
582	{
583		weight_imprecision_estim = 0.04f;
584	}
585	else if (texels_per_block <= 41)
586	{
587		weight_imprecision_estim = 0.05f;
588	}
589
590	promise(partition_count > 0);
591	promise(partition_search_limit > 0);
592
593	weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
594
595	uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
596	unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
597	partition_search_limit = astc::min(partition_search_limit, sequence_len);
598	requested_candidates = astc::min(partition_search_limit, requested_candidates);
599
600	bool uses_alpha = !blk.is_constant_channel(3);
601
602	// Partitioning errors assuming uncorrelated-chrominance endpoints
603	float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
604	unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
605
606	// Partitioning errors assuming same-chrominance endpoints
607	float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
608	unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
609
610	for (unsigned int i = 0; i < requested_candidates; i++)
611	{
612		uncor_best_errors[i] = ERROR_CALC_DEFAULT;
613		samec_best_errors[i] = ERROR_CALC_DEFAULT;
614	}
615
616	if (uses_alpha)
617	{
618		for (unsigned int i = 0; i < partition_search_limit; i++)
619		{
620			unsigned int partition = partition_sequence[i];
621			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
622
623			// Compute weighting to give to each component in each partition
624			partition_metrics pms[BLOCK_MAX_PARTITIONS];
625
626			compute_avgs_and_dirs_4_comp(pi, blk, pms);
627
628			line4 uncor_lines[BLOCK_MAX_PARTITIONS];
629			line4 samec_lines[BLOCK_MAX_PARTITIONS];
630
631			processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
632			processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
633
634			float line_lengths[BLOCK_MAX_PARTITIONS];
635
636			for (unsigned int j = 0; j < partition_count; j++)
637			{
638				partition_metrics& pm = pms[j];
639
640				uncor_lines[j].a = pm.avg;
641				uncor_lines[j].b = normalize_safe(pm.dir, unit4());
642
643				uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
644				uncor_plines[j].bs = uncor_lines[j].b;
645
646				samec_lines[j].a = vfloat4::zero();
647				samec_lines[j].b = normalize_safe(pm.avg, unit4());
648
649				samec_plines[j].amod = vfloat4::zero();
650				samec_plines[j].bs = samec_lines[j].b;
651			}
652
653			float uncor_error = 0.0f;
654			float samec_error = 0.0f;
655
656			compute_error_squared_rgba(pi,
657			                           blk,
658			                           uncor_plines,
659			                           samec_plines,
660			                           line_lengths,
661			                           uncor_error,
662			                           samec_error);
663
664			// Compute an estimate of error introduced by weight quantization imprecision.
665			// This error is computed as follows, for each partition
666			//     1: compute the principal-axis vector (full length) in error-space
667			//     2: convert the principal-axis vector to regular RGB-space
668			//     3: scale the vector by a constant that estimates average quantization error
669			//     4: for each texel, square the vector, then do a dot-product with the texel's
670			//        error weight; sum up the results across all texels.
671			//     4(optimized): square the vector once, then do a dot-product with the average
672			//        texel error, then multiply by the number of texels.
673
674			for (unsigned int j = 0; j < partition_count; j++)
675			{
676				float tpp = static_cast<float>(pi.partition_texel_count[j]);
677				vfloat4 error_weights(tpp * weight_imprecision_estim);
678
679				vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
680				vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
681
682				uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
683				samec_error += dot_s(samec_vector * samec_vector, error_weights);
684			}
685
686			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
687			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
688		}
689	}
690	else
691	{
692		for (unsigned int i = 0; i < partition_search_limit; i++)
693		{
694			unsigned int partition = partition_sequence[i];
695			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
696
697			// Compute weighting to give to each component in each partition
698			partition_metrics pms[BLOCK_MAX_PARTITIONS];
699			compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
700
701			partition_lines3 plines[BLOCK_MAX_PARTITIONS];
702
703			for (unsigned int j = 0; j < partition_count; j++)
704			{
705				partition_metrics& pm = pms[j];
706				partition_lines3& pl = plines[j];
707
708				pl.uncor_line.a = pm.avg;
709				pl.uncor_line.b = normalize_safe(pm.dir, unit3());
710
711				pl.samec_line.a = vfloat4::zero();
712				pl.samec_line.b = normalize_safe(pm.avg, unit3());
713
714				pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
715				pl.uncor_pline.bs   = pl.uncor_line.b;
716
717				pl.samec_pline.amod = vfloat4::zero();
718				pl.samec_pline.bs   = pl.samec_line.b;
719			}
720
721			float uncor_error = 0.0f;
722			float samec_error = 0.0f;
723
724			compute_error_squared_rgb(pi,
725			                          blk,
726			                          plines,
727			                          uncor_error,
728			                          samec_error);
729
730			// Compute an estimate of error introduced by weight quantization imprecision.
731			// This error is computed as follows, for each partition
732			//     1: compute the principal-axis vector (full length) in error-space
733			//     2: convert the principal-axis vector to regular RGB-space
734			//     3: scale the vector by a constant that estimates average quantization error
735			//     4: for each texel, square the vector, then do a dot-product with the texel's
736			//        error weight; sum up the results across all texels.
737			//     4(optimized): square the vector once, then do a dot-product with the average
738			//        texel error, then multiply by the number of texels.
739
740			for (unsigned int j = 0; j < partition_count; j++)
741			{
742				partition_lines3& pl = plines[j];
743
744				float tpp = static_cast<float>(pi.partition_texel_count[j]);
745				vfloat4 error_weights(tpp * weight_imprecision_estim);
746
747				vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
748				vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
749
750				uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
751				samec_error += dot3_s(samec_vector * samec_vector, error_weights);
752			}
753
754			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
755			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
756		}
757	}
758
759	unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
760	for (unsigned int i = 0; i < requested_candidates; i++)
761	{
762		interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
763		interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
764	}
765
766	uint64_t bitmasks[1024/64] { 0 };
767	unsigned int emitted = 0;
768
769	// Deduplicate the first "requested" entries
770	for (unsigned int i = 0; i < requested_candidates * 2;  i++)
771	{
772		unsigned int partition = interleave[i];
773
774		unsigned int word = partition / 64;
775		unsigned int bit = partition % 64;
776
777		bool written = bitmasks[word] & (1ull << bit);
778
779		if (!written)
780		{
781			best_partitions[emitted] = partition;
782			bitmasks[word] |= 1ull << bit;
783			emitted++;
784
785			if (emitted == requested_candidates)
786			{
787				break;
788			}
789		}
790	}
791
792	return emitted;
793}
794
795#endif
796