1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0
2cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
3cc1dc7a3Sopenharmony_ci// Copyright 2011-2023 Arm Limited
4cc1dc7a3Sopenharmony_ci//
5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy
7cc1dc7a3Sopenharmony_ci// of the License at:
8cc1dc7a3Sopenharmony_ci//
9cc1dc7a3Sopenharmony_ci//     http://www.apache.org/licenses/LICENSE-2.0
10cc1dc7a3Sopenharmony_ci//
11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software
12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations
15cc1dc7a3Sopenharmony_ci// under the License.
16cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
17cc1dc7a3Sopenharmony_ci
18cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY)
19cc1dc7a3Sopenharmony_ci
20cc1dc7a3Sopenharmony_ci/**
21cc1dc7a3Sopenharmony_ci * @brief Functions for finding best partition for a block.
22cc1dc7a3Sopenharmony_ci *
23cc1dc7a3Sopenharmony_ci * The partition search operates in two stages. The first pass uses kmeans clustering to group
24cc1dc7a3Sopenharmony_ci * texels into an ideal partitioning for the requested partition count, and then compares that
25cc1dc7a3Sopenharmony_ci * against the 1024 partitionings generated by the ASTC partition hash function. The generated
26cc1dc7a3Sopenharmony_ci * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
27cc1dc7a3Sopenharmony_ci * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
28cc1dc7a3Sopenharmony_ci * partitionings that actually generate fewer than the requested partition count, but only the top
29cc1dc7a3Sopenharmony_ci * N candidates are actually put through a more detailed search. N is determined by the compressor
30cc1dc7a3Sopenharmony_ci * quality preset.
31cc1dc7a3Sopenharmony_ci *
32cc1dc7a3Sopenharmony_ci * For the detailed search, each candidate is checked against two possible encoding methods:
33cc1dc7a3Sopenharmony_ci *
34cc1dc7a3Sopenharmony_ci *   - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
35cc1dc7a3Sopenharmony_ci *   - The best partitioning assuming same chroma colors (RGB + scale endpoints).
36cc1dc7a3Sopenharmony_ci *
37cc1dc7a3Sopenharmony_ci * This is implemented by computing the compute mean color and dominant direction for each
38cc1dc7a3Sopenharmony_ci * partition. This defines two lines, both of which go through the mean color value.
39cc1dc7a3Sopenharmony_ci *
40cc1dc7a3Sopenharmony_ci * - One line has a direction defined by the dominant direction; this is used to assess the error
41cc1dc7a3Sopenharmony_ci *   from using an uncorrelated color representation.
42cc1dc7a3Sopenharmony_ci * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
43cc1dc7a3Sopenharmony_ci *   (RGB + scale) color representation.
44cc1dc7a3Sopenharmony_ci *
45cc1dc7a3Sopenharmony_ci * The best candidate is selected by computing the squared-errors that result from using these
46cc1dc7a3Sopenharmony_ci * lines for endpoint selection.
47cc1dc7a3Sopenharmony_ci */
48cc1dc7a3Sopenharmony_ci
49cc1dc7a3Sopenharmony_ci#include <limits>
50cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h"
51cc1dc7a3Sopenharmony_ci
52cc1dc7a3Sopenharmony_ci/**
53cc1dc7a3Sopenharmony_ci * @brief Pick some initial kmeans cluster centers.
54cc1dc7a3Sopenharmony_ci *
55cc1dc7a3Sopenharmony_ci * @param      blk               The image block color data to compress.
56cc1dc7a3Sopenharmony_ci * @param      texel_count       The number of texels in the block.
57cc1dc7a3Sopenharmony_ci * @param      partition_count   The number of partitions in the block.
58cc1dc7a3Sopenharmony_ci * @param[out] cluster_centers   The initial partition cluster center colors.
59cc1dc7a3Sopenharmony_ci */
60cc1dc7a3Sopenharmony_cistatic void kmeans_init(
61cc1dc7a3Sopenharmony_ci	const image_block& blk,
62cc1dc7a3Sopenharmony_ci	unsigned int texel_count,
63cc1dc7a3Sopenharmony_ci	unsigned int partition_count,
64cc1dc7a3Sopenharmony_ci	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
65cc1dc7a3Sopenharmony_ci) {
66cc1dc7a3Sopenharmony_ci	promise(texel_count > 0);
67cc1dc7a3Sopenharmony_ci	promise(partition_count > 0);
68cc1dc7a3Sopenharmony_ci
69cc1dc7a3Sopenharmony_ci	unsigned int clusters_selected = 0;
70cc1dc7a3Sopenharmony_ci	float distances[BLOCK_MAX_TEXELS];
71cc1dc7a3Sopenharmony_ci
72cc1dc7a3Sopenharmony_ci	// Pick a random sample as first cluster center; 145897 from random.org
73cc1dc7a3Sopenharmony_ci	unsigned int sample = 145897 % texel_count;
74cc1dc7a3Sopenharmony_ci	vfloat4 center_color = blk.texel(sample);
75cc1dc7a3Sopenharmony_ci	cluster_centers[clusters_selected] = center_color;
76cc1dc7a3Sopenharmony_ci	clusters_selected++;
77cc1dc7a3Sopenharmony_ci
78cc1dc7a3Sopenharmony_ci	// Compute the distance to the first cluster center
79cc1dc7a3Sopenharmony_ci	float distance_sum = 0.0f;
80cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < texel_count; i++)
81cc1dc7a3Sopenharmony_ci	{
82cc1dc7a3Sopenharmony_ci		vfloat4 color = blk.texel(i);
83cc1dc7a3Sopenharmony_ci		vfloat4 diff = color - center_color;
84cc1dc7a3Sopenharmony_ci		float distance = dot_s(diff * diff, blk.channel_weight);
85cc1dc7a3Sopenharmony_ci		distance_sum += distance;
86cc1dc7a3Sopenharmony_ci		distances[i] = distance;
87cc1dc7a3Sopenharmony_ci	}
88cc1dc7a3Sopenharmony_ci
89cc1dc7a3Sopenharmony_ci	// More numbers from random.org for weighted-random center selection
90cc1dc7a3Sopenharmony_ci	const float cluster_cutoffs[9] {
91cc1dc7a3Sopenharmony_ci		0.626220f, 0.932770f, 0.275454f,
92cc1dc7a3Sopenharmony_ci		0.318558f, 0.240113f, 0.009190f,
93cc1dc7a3Sopenharmony_ci		0.347661f, 0.731960f, 0.156391f
94cc1dc7a3Sopenharmony_ci	};
95cc1dc7a3Sopenharmony_ci
96cc1dc7a3Sopenharmony_ci	unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
97cc1dc7a3Sopenharmony_ci
98cc1dc7a3Sopenharmony_ci	// Pick the remaining samples as needed
99cc1dc7a3Sopenharmony_ci	while (true)
100cc1dc7a3Sopenharmony_ci	{
101cc1dc7a3Sopenharmony_ci		// Pick the next center in a weighted-random fashion.
102cc1dc7a3Sopenharmony_ci		float summa = 0.0f;
103cc1dc7a3Sopenharmony_ci		float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
104cc1dc7a3Sopenharmony_ci		for (sample = 0; sample < texel_count; sample++)
105cc1dc7a3Sopenharmony_ci		{
106cc1dc7a3Sopenharmony_ci			summa += distances[sample];
107cc1dc7a3Sopenharmony_ci			if (summa >= distance_cutoff)
108cc1dc7a3Sopenharmony_ci			{
109cc1dc7a3Sopenharmony_ci				break;
110cc1dc7a3Sopenharmony_ci			}
111cc1dc7a3Sopenharmony_ci		}
112cc1dc7a3Sopenharmony_ci
113cc1dc7a3Sopenharmony_ci		// Clamp to a valid range and store the selected cluster center
114cc1dc7a3Sopenharmony_ci		sample = astc::min(sample, texel_count - 1);
115cc1dc7a3Sopenharmony_ci
116cc1dc7a3Sopenharmony_ci		center_color = blk.texel(sample);
117cc1dc7a3Sopenharmony_ci		cluster_centers[clusters_selected++] = center_color;
118cc1dc7a3Sopenharmony_ci		if (clusters_selected >= partition_count)
119cc1dc7a3Sopenharmony_ci		{
120cc1dc7a3Sopenharmony_ci			break;
121cc1dc7a3Sopenharmony_ci		}
122cc1dc7a3Sopenharmony_ci
123cc1dc7a3Sopenharmony_ci		// Compute the distance to the new cluster center, keep the min dist
124cc1dc7a3Sopenharmony_ci		distance_sum = 0.0f;
125cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < texel_count; i++)
126cc1dc7a3Sopenharmony_ci		{
127cc1dc7a3Sopenharmony_ci			vfloat4 color = blk.texel(i);
128cc1dc7a3Sopenharmony_ci			vfloat4 diff = color - center_color;
129cc1dc7a3Sopenharmony_ci			float distance = dot_s(diff * diff, blk.channel_weight);
130cc1dc7a3Sopenharmony_ci			distance = astc::min(distance, distances[i]);
131cc1dc7a3Sopenharmony_ci			distance_sum += distance;
132cc1dc7a3Sopenharmony_ci			distances[i] = distance;
133cc1dc7a3Sopenharmony_ci		}
134cc1dc7a3Sopenharmony_ci	}
135cc1dc7a3Sopenharmony_ci}
136cc1dc7a3Sopenharmony_ci
137cc1dc7a3Sopenharmony_ci/**
138cc1dc7a3Sopenharmony_ci * @brief Assign texels to clusters, based on a set of chosen center points.
139cc1dc7a3Sopenharmony_ci *
140cc1dc7a3Sopenharmony_ci * @param      blk                  The image block color data to compress.
141cc1dc7a3Sopenharmony_ci * @param      texel_count          The number of texels in the block.
142cc1dc7a3Sopenharmony_ci * @param      partition_count      The number of partitions in the block.
143cc1dc7a3Sopenharmony_ci * @param      cluster_centers      The partition cluster center colors.
144cc1dc7a3Sopenharmony_ci * @param[out] partition_of_texel   The partition assigned for each texel.
145cc1dc7a3Sopenharmony_ci */
146cc1dc7a3Sopenharmony_cistatic void kmeans_assign(
147cc1dc7a3Sopenharmony_ci	const image_block& blk,
148cc1dc7a3Sopenharmony_ci	unsigned int texel_count,
149cc1dc7a3Sopenharmony_ci	unsigned int partition_count,
150cc1dc7a3Sopenharmony_ci	const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
151cc1dc7a3Sopenharmony_ci	uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
152cc1dc7a3Sopenharmony_ci) {
153cc1dc7a3Sopenharmony_ci	promise(texel_count > 0);
154cc1dc7a3Sopenharmony_ci	promise(partition_count > 0);
155cc1dc7a3Sopenharmony_ci
156cc1dc7a3Sopenharmony_ci	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
157cc1dc7a3Sopenharmony_ci
158cc1dc7a3Sopenharmony_ci	// Find the best partition for every texel
159cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < texel_count; i++)
160cc1dc7a3Sopenharmony_ci	{
161cc1dc7a3Sopenharmony_ci		float best_distance = std::numeric_limits<float>::max();
162cc1dc7a3Sopenharmony_ci		unsigned int best_partition = 0;
163cc1dc7a3Sopenharmony_ci
164cc1dc7a3Sopenharmony_ci		vfloat4 color = blk.texel(i);
165cc1dc7a3Sopenharmony_ci		for (unsigned int j = 0; j < partition_count; j++)
166cc1dc7a3Sopenharmony_ci		{
167cc1dc7a3Sopenharmony_ci			vfloat4 diff = color - cluster_centers[j];
168cc1dc7a3Sopenharmony_ci			float distance = dot_s(diff * diff, blk.channel_weight);
169cc1dc7a3Sopenharmony_ci			if (distance < best_distance)
170cc1dc7a3Sopenharmony_ci			{
171cc1dc7a3Sopenharmony_ci				best_distance = distance;
172cc1dc7a3Sopenharmony_ci				best_partition = j;
173cc1dc7a3Sopenharmony_ci			}
174cc1dc7a3Sopenharmony_ci		}
175cc1dc7a3Sopenharmony_ci
176cc1dc7a3Sopenharmony_ci		partition_of_texel[i] = static_cast<uint8_t>(best_partition);
177cc1dc7a3Sopenharmony_ci		partition_texel_count[best_partition]++;
178cc1dc7a3Sopenharmony_ci	}
179cc1dc7a3Sopenharmony_ci
180cc1dc7a3Sopenharmony_ci	// It is possible to get a situation where a partition ends up without any texels. In this case,
181cc1dc7a3Sopenharmony_ci	// assign texel N to partition N. This is silly, but ensures that every partition retains at
182cc1dc7a3Sopenharmony_ci	// least one texel. Reassigning a texel in this manner may cause another partition to go empty,
183cc1dc7a3Sopenharmony_ci	// so if we actually did a reassignment, run the whole loop over again.
184cc1dc7a3Sopenharmony_ci	bool problem_case;
185cc1dc7a3Sopenharmony_ci	do
186cc1dc7a3Sopenharmony_ci	{
187cc1dc7a3Sopenharmony_ci		problem_case = false;
188cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < partition_count; i++)
189cc1dc7a3Sopenharmony_ci		{
190cc1dc7a3Sopenharmony_ci			if (partition_texel_count[i] == 0)
191cc1dc7a3Sopenharmony_ci			{
192cc1dc7a3Sopenharmony_ci				partition_texel_count[partition_of_texel[i]]--;
193cc1dc7a3Sopenharmony_ci				partition_texel_count[i]++;
194cc1dc7a3Sopenharmony_ci				partition_of_texel[i] = static_cast<uint8_t>(i);
195cc1dc7a3Sopenharmony_ci				problem_case = true;
196cc1dc7a3Sopenharmony_ci			}
197cc1dc7a3Sopenharmony_ci		}
198cc1dc7a3Sopenharmony_ci	} while (problem_case);
199cc1dc7a3Sopenharmony_ci}
200cc1dc7a3Sopenharmony_ci
201cc1dc7a3Sopenharmony_ci/**
202cc1dc7a3Sopenharmony_ci * @brief Compute new cluster centers based on their center of gravity.
203cc1dc7a3Sopenharmony_ci *
204cc1dc7a3Sopenharmony_ci * @param       blk                  The image block color data to compress.
205cc1dc7a3Sopenharmony_ci * @param       texel_count          The number of texels in the block.
206cc1dc7a3Sopenharmony_ci * @param       partition_count      The number of partitions in the block.
207cc1dc7a3Sopenharmony_ci * @param[out]  cluster_centers      The new cluster center colors.
208cc1dc7a3Sopenharmony_ci * @param       partition_of_texel   The partition assigned for each texel.
209cc1dc7a3Sopenharmony_ci */
210cc1dc7a3Sopenharmony_cistatic void kmeans_update(
211cc1dc7a3Sopenharmony_ci	const image_block& blk,
212cc1dc7a3Sopenharmony_ci	unsigned int texel_count,
213cc1dc7a3Sopenharmony_ci	unsigned int partition_count,
214cc1dc7a3Sopenharmony_ci	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
215cc1dc7a3Sopenharmony_ci	const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
216cc1dc7a3Sopenharmony_ci) {
217cc1dc7a3Sopenharmony_ci	promise(texel_count > 0);
218cc1dc7a3Sopenharmony_ci	promise(partition_count > 0);
219cc1dc7a3Sopenharmony_ci
220cc1dc7a3Sopenharmony_ci	vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
221cc1dc7a3Sopenharmony_ci		vfloat4::zero(),
222cc1dc7a3Sopenharmony_ci		vfloat4::zero(),
223cc1dc7a3Sopenharmony_ci		vfloat4::zero(),
224cc1dc7a3Sopenharmony_ci		vfloat4::zero()
225cc1dc7a3Sopenharmony_ci	};
226cc1dc7a3Sopenharmony_ci
227cc1dc7a3Sopenharmony_ci	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
228cc1dc7a3Sopenharmony_ci
229cc1dc7a3Sopenharmony_ci	// Find the center-of-gravity in each cluster
230cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < texel_count; i++)
231cc1dc7a3Sopenharmony_ci	{
232cc1dc7a3Sopenharmony_ci		uint8_t partition = partition_of_texel[i];
233cc1dc7a3Sopenharmony_ci		color_sum[partition] += blk.texel(i);
234cc1dc7a3Sopenharmony_ci		partition_texel_count[partition]++;
235cc1dc7a3Sopenharmony_ci	}
236cc1dc7a3Sopenharmony_ci
237cc1dc7a3Sopenharmony_ci	// Set the center of gravity to be the new cluster center
238cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < partition_count; i++)
239cc1dc7a3Sopenharmony_ci	{
240cc1dc7a3Sopenharmony_ci		float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
241cc1dc7a3Sopenharmony_ci		cluster_centers[i] = color_sum[i] * scale;
242cc1dc7a3Sopenharmony_ci	}
243cc1dc7a3Sopenharmony_ci}
244cc1dc7a3Sopenharmony_ci
245cc1dc7a3Sopenharmony_ci/**
246cc1dc7a3Sopenharmony_ci * @brief Compute bit-mismatch for partitioning in 2-partition mode.
247cc1dc7a3Sopenharmony_ci *
248cc1dc7a3Sopenharmony_ci * @param a   The texel assignment bitvector for the block.
249cc1dc7a3Sopenharmony_ci * @param b   The texel assignment bitvector for the partition table.
250cc1dc7a3Sopenharmony_ci *
251cc1dc7a3Sopenharmony_ci * @return    The number of bit mismatches.
252cc1dc7a3Sopenharmony_ci */
253cc1dc7a3Sopenharmony_ci#if ASTCENC_NEON != 0
254cc1dc7a3Sopenharmony_cistatic inline uint8_t partition_mismatch2(
255cc1dc7a3Sopenharmony_ci	const uint64_t a[2],
256cc1dc7a3Sopenharmony_ci	const uint64_t b[2]
257cc1dc7a3Sopenharmony_ci) {
258cc1dc7a3Sopenharmony_ci	uint64x2_t a01 = vld1q_u64(a);
259cc1dc7a3Sopenharmony_ci	uint64x2_t b01 = vld1q_u64(b);
260cc1dc7a3Sopenharmony_ci	uint64x2_t b10 = vextq_u64(b01, b01, 1);
261cc1dc7a3Sopenharmony_ci	uint8_t c1 = popcount(veorq_u64(a01, b01));
262cc1dc7a3Sopenharmony_ci	uint8_t c2 = popcount(veorq_u64(a01, b10));
263cc1dc7a3Sopenharmony_ci	return static_cast<uint8_t>(astc::min(c1, c2) / 2);    // 2 is the number of partitions
264cc1dc7a3Sopenharmony_ci}
265cc1dc7a3Sopenharmony_ci#else
266cc1dc7a3Sopenharmony_cistatic inline uint8_t partition_mismatch2(
267cc1dc7a3Sopenharmony_ci	const uint64_t a[2],
268cc1dc7a3Sopenharmony_ci	const uint64_t b[2]
269cc1dc7a3Sopenharmony_ci) {
270cc1dc7a3Sopenharmony_ci	int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
271cc1dc7a3Sopenharmony_ci	int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
272cc1dc7a3Sopenharmony_ci
273cc1dc7a3Sopenharmony_ci	// Divide by 2 because XOR always counts errors twice, once when missing
274cc1dc7a3Sopenharmony_ci	// in the expected position, and again when present in the wrong partition
275cc1dc7a3Sopenharmony_ci	return static_cast<uint8_t>(astc::min(v1, v2) / 2);
276cc1dc7a3Sopenharmony_ci}
277cc1dc7a3Sopenharmony_ci#endif
278cc1dc7a3Sopenharmony_ci
279cc1dc7a3Sopenharmony_ci/**
280cc1dc7a3Sopenharmony_ci * @brief Compute bit-mismatch for partitioning in 3-partition mode.
281cc1dc7a3Sopenharmony_ci *
282cc1dc7a3Sopenharmony_ci * @param a   The texel assignment bitvector for the block.
283cc1dc7a3Sopenharmony_ci * @param b   The texel assignment bitvector for the partition table.
284cc1dc7a3Sopenharmony_ci *
285cc1dc7a3Sopenharmony_ci * @return    The number of bit mismatches.
286cc1dc7a3Sopenharmony_ci */
287cc1dc7a3Sopenharmony_cistatic inline uint8_t partition_mismatch3(
288cc1dc7a3Sopenharmony_ci	const uint64_t a[3],
289cc1dc7a3Sopenharmony_ci	const uint64_t b[3]
290cc1dc7a3Sopenharmony_ci) {
291cc1dc7a3Sopenharmony_ci	int p00 = popcount(a[0] ^ b[0]);
292cc1dc7a3Sopenharmony_ci	int p01 = popcount(a[0] ^ b[1]);
293cc1dc7a3Sopenharmony_ci	int p02 = popcount(a[0] ^ b[2]);
294cc1dc7a3Sopenharmony_ci
295cc1dc7a3Sopenharmony_ci	int p10 = popcount(a[1] ^ b[0]);
296cc1dc7a3Sopenharmony_ci	int p11 = popcount(a[1] ^ b[1]);
297cc1dc7a3Sopenharmony_ci	int p12 = popcount(a[1] ^ b[2]);
298cc1dc7a3Sopenharmony_ci
299cc1dc7a3Sopenharmony_ci	int p20 = popcount(a[2] ^ b[0]);
300cc1dc7a3Sopenharmony_ci	int p21 = popcount(a[2] ^ b[1]);
301cc1dc7a3Sopenharmony_ci	int p22 = popcount(a[2] ^ b[2]);
302cc1dc7a3Sopenharmony_ci
303cc1dc7a3Sopenharmony_ci	int s0 = p11 + p22;
304cc1dc7a3Sopenharmony_ci	int s1 = p12 + p21;
305cc1dc7a3Sopenharmony_ci	int v0 = astc::min(s0, s1) + p00;
306cc1dc7a3Sopenharmony_ci
307cc1dc7a3Sopenharmony_ci	int s2 = p10 + p22;
308cc1dc7a3Sopenharmony_ci	int s3 = p12 + p20;
309cc1dc7a3Sopenharmony_ci	int v1 = astc::min(s2, s3) + p01;
310cc1dc7a3Sopenharmony_ci
311cc1dc7a3Sopenharmony_ci	int s4 = p10 + p21;
312cc1dc7a3Sopenharmony_ci	int s5 = p11 + p20;
313cc1dc7a3Sopenharmony_ci	int v2 = astc::min(s4, s5) + p02;
314cc1dc7a3Sopenharmony_ci
315cc1dc7a3Sopenharmony_ci	// Divide by 2 because XOR always counts errors twice, once when missing
316cc1dc7a3Sopenharmony_ci	// in the expected position, and again when present in the wrong partition
317cc1dc7a3Sopenharmony_ci	return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
318cc1dc7a3Sopenharmony_ci}
319cc1dc7a3Sopenharmony_ci
320cc1dc7a3Sopenharmony_ci/**
321cc1dc7a3Sopenharmony_ci * @brief Compute bit-mismatch for partitioning in 4-partition mode.
322cc1dc7a3Sopenharmony_ci *
323cc1dc7a3Sopenharmony_ci * @param a   The texel assignment bitvector for the block.
324cc1dc7a3Sopenharmony_ci * @param b   The texel assignment bitvector for the partition table.
325cc1dc7a3Sopenharmony_ci *
326cc1dc7a3Sopenharmony_ci * @return    The number of bit mismatches.
327cc1dc7a3Sopenharmony_ci */
328cc1dc7a3Sopenharmony_cistatic inline uint8_t partition_mismatch4(
329cc1dc7a3Sopenharmony_ci	const uint64_t a[4],
330cc1dc7a3Sopenharmony_ci	const uint64_t b[4]
331cc1dc7a3Sopenharmony_ci) {
332cc1dc7a3Sopenharmony_ci	int p00 = popcount(a[0] ^ b[0]);
333cc1dc7a3Sopenharmony_ci	int p01 = popcount(a[0] ^ b[1]);
334cc1dc7a3Sopenharmony_ci	int p02 = popcount(a[0] ^ b[2]);
335cc1dc7a3Sopenharmony_ci	int p03 = popcount(a[0] ^ b[3]);
336cc1dc7a3Sopenharmony_ci
337cc1dc7a3Sopenharmony_ci	int p10 = popcount(a[1] ^ b[0]);
338cc1dc7a3Sopenharmony_ci	int p11 = popcount(a[1] ^ b[1]);
339cc1dc7a3Sopenharmony_ci	int p12 = popcount(a[1] ^ b[2]);
340cc1dc7a3Sopenharmony_ci	int p13 = popcount(a[1] ^ b[3]);
341cc1dc7a3Sopenharmony_ci
342cc1dc7a3Sopenharmony_ci	int p20 = popcount(a[2] ^ b[0]);
343cc1dc7a3Sopenharmony_ci	int p21 = popcount(a[2] ^ b[1]);
344cc1dc7a3Sopenharmony_ci	int p22 = popcount(a[2] ^ b[2]);
345cc1dc7a3Sopenharmony_ci	int p23 = popcount(a[2] ^ b[3]);
346cc1dc7a3Sopenharmony_ci
347cc1dc7a3Sopenharmony_ci	int p30 = popcount(a[3] ^ b[0]);
348cc1dc7a3Sopenharmony_ci	int p31 = popcount(a[3] ^ b[1]);
349cc1dc7a3Sopenharmony_ci	int p32 = popcount(a[3] ^ b[2]);
350cc1dc7a3Sopenharmony_ci	int p33 = popcount(a[3] ^ b[3]);
351cc1dc7a3Sopenharmony_ci
352cc1dc7a3Sopenharmony_ci	int mx23 = astc::min(p22 + p33, p23 + p32);
353cc1dc7a3Sopenharmony_ci	int mx13 = astc::min(p21 + p33, p23 + p31);
354cc1dc7a3Sopenharmony_ci	int mx12 = astc::min(p21 + p32, p22 + p31);
355cc1dc7a3Sopenharmony_ci	int mx03 = astc::min(p20 + p33, p23 + p30);
356cc1dc7a3Sopenharmony_ci	int mx02 = astc::min(p20 + p32, p22 + p30);
357cc1dc7a3Sopenharmony_ci	int mx01 = astc::min(p21 + p30, p20 + p31);
358cc1dc7a3Sopenharmony_ci
359cc1dc7a3Sopenharmony_ci	int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
360cc1dc7a3Sopenharmony_ci	int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
361cc1dc7a3Sopenharmony_ci	int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
362cc1dc7a3Sopenharmony_ci	int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
363cc1dc7a3Sopenharmony_ci
364cc1dc7a3Sopenharmony_ci	// Divide by 2 because XOR always counts errors twice, once when missing
365cc1dc7a3Sopenharmony_ci	// in the expected position, and again when present in the wrong partition
366cc1dc7a3Sopenharmony_ci	return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
367cc1dc7a3Sopenharmony_ci}
368cc1dc7a3Sopenharmony_ci
369cc1dc7a3Sopenharmony_ciusing mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
370cc1dc7a3Sopenharmony_ci
371cc1dc7a3Sopenharmony_ci/**
372cc1dc7a3Sopenharmony_ci * @brief Count the partition table mismatches vs the data clustering.
373cc1dc7a3Sopenharmony_ci *
374cc1dc7a3Sopenharmony_ci * @param      bsd               The block size information.
375cc1dc7a3Sopenharmony_ci * @param      partition_count   The number of partitions in the block.
376cc1dc7a3Sopenharmony_ci * @param      bitmaps           The block texel partition assignment patterns.
377cc1dc7a3Sopenharmony_ci * @param[out] mismatch_counts   The array storing per partitioning mismatch counts.
378cc1dc7a3Sopenharmony_ci */
379cc1dc7a3Sopenharmony_cistatic void count_partition_mismatch_bits(
380cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
381cc1dc7a3Sopenharmony_ci	unsigned int partition_count,
382cc1dc7a3Sopenharmony_ci	const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
383cc1dc7a3Sopenharmony_ci	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
384cc1dc7a3Sopenharmony_ci) {
385cc1dc7a3Sopenharmony_ci	unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
386cc1dc7a3Sopenharmony_ci	promise(active_count > 0);
387cc1dc7a3Sopenharmony_ci
388cc1dc7a3Sopenharmony_ci	if (partition_count == 2)
389cc1dc7a3Sopenharmony_ci	{
390cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < active_count; i++)
391cc1dc7a3Sopenharmony_ci		{
392cc1dc7a3Sopenharmony_ci			mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
393cc1dc7a3Sopenharmony_ci			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
394cc1dc7a3Sopenharmony_ci			assert(mismatch_counts[i] < bsd.texel_count);
395cc1dc7a3Sopenharmony_ci		}
396cc1dc7a3Sopenharmony_ci	}
397cc1dc7a3Sopenharmony_ci	else if (partition_count == 3)
398cc1dc7a3Sopenharmony_ci	{
399cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < active_count; i++)
400cc1dc7a3Sopenharmony_ci		{
401cc1dc7a3Sopenharmony_ci			mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
402cc1dc7a3Sopenharmony_ci			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
403cc1dc7a3Sopenharmony_ci			assert(mismatch_counts[i] < bsd.texel_count);
404cc1dc7a3Sopenharmony_ci		}
405cc1dc7a3Sopenharmony_ci	}
406cc1dc7a3Sopenharmony_ci	else
407cc1dc7a3Sopenharmony_ci	{
408cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < active_count; i++)
409cc1dc7a3Sopenharmony_ci		{
410cc1dc7a3Sopenharmony_ci			mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
411cc1dc7a3Sopenharmony_ci			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
412cc1dc7a3Sopenharmony_ci			assert(mismatch_counts[i] < bsd.texel_count);
413cc1dc7a3Sopenharmony_ci		}
414cc1dc7a3Sopenharmony_ci	}
415cc1dc7a3Sopenharmony_ci}
416cc1dc7a3Sopenharmony_ci
417cc1dc7a3Sopenharmony_ci/**
418cc1dc7a3Sopenharmony_ci * @brief Use counting sort on the mismatch array to sort partition candidates.
419cc1dc7a3Sopenharmony_ci *
420cc1dc7a3Sopenharmony_ci * @param      partitioning_count   The number of packed partitionings.
421cc1dc7a3Sopenharmony_ci * @param      mismatch_count       Partitioning mismatch counts, in index order.
422cc1dc7a3Sopenharmony_ci * @param[out] partition_ordering   Partition index values, in mismatch order.
423cc1dc7a3Sopenharmony_ci *
424cc1dc7a3Sopenharmony_ci * @return The number of active partitions in this selection.
425cc1dc7a3Sopenharmony_ci */
426cc1dc7a3Sopenharmony_cistatic unsigned int get_partition_ordering_by_mismatch_bits(
427cc1dc7a3Sopenharmony_ci	unsigned int texel_count,
428cc1dc7a3Sopenharmony_ci	unsigned int partitioning_count,
429cc1dc7a3Sopenharmony_ci	const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
430cc1dc7a3Sopenharmony_ci	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
431cc1dc7a3Sopenharmony_ci) {
432cc1dc7a3Sopenharmony_ci	promise(partitioning_count > 0);
433cc1dc7a3Sopenharmony_ci	uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
434cc1dc7a3Sopenharmony_ci
435cc1dc7a3Sopenharmony_ci	// Create the histogram of mismatch counts
436cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < partitioning_count; i++)
437cc1dc7a3Sopenharmony_ci	{
438cc1dc7a3Sopenharmony_ci		mscount[mismatch_count[i]]++;
439cc1dc7a3Sopenharmony_ci	}
440cc1dc7a3Sopenharmony_ci
441cc1dc7a3Sopenharmony_ci	// Create a running sum from the histogram array
442cc1dc7a3Sopenharmony_ci	// Cells store previous values only; i.e. exclude self after sum
443cc1dc7a3Sopenharmony_ci	unsigned int sum = 0;
444cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < texel_count; i++)
445cc1dc7a3Sopenharmony_ci	{
446cc1dc7a3Sopenharmony_ci		uint16_t cnt = mscount[i];
447cc1dc7a3Sopenharmony_ci		mscount[i] = sum;
448cc1dc7a3Sopenharmony_ci		sum += cnt;
449cc1dc7a3Sopenharmony_ci	}
450cc1dc7a3Sopenharmony_ci
451cc1dc7a3Sopenharmony_ci	// Use the running sum as the index, incrementing after read to allow
452cc1dc7a3Sopenharmony_ci	// sequential entries with the same count
453cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < partitioning_count; i++)
454cc1dc7a3Sopenharmony_ci	{
455cc1dc7a3Sopenharmony_ci		unsigned int idx = mscount[mismatch_count[i]]++;
456cc1dc7a3Sopenharmony_ci		partition_ordering[idx] = static_cast<uint16_t>(i);
457cc1dc7a3Sopenharmony_ci	}
458cc1dc7a3Sopenharmony_ci
459cc1dc7a3Sopenharmony_ci	return partitioning_count;
460cc1dc7a3Sopenharmony_ci}
461cc1dc7a3Sopenharmony_ci
462cc1dc7a3Sopenharmony_ci/**
463cc1dc7a3Sopenharmony_ci * @brief Use k-means clustering to compute a partition ordering for a block..
464cc1dc7a3Sopenharmony_ci *
465cc1dc7a3Sopenharmony_ci * @param      bsd                  The block size information.
466cc1dc7a3Sopenharmony_ci * @param      blk                  The image block color data to compress.
467cc1dc7a3Sopenharmony_ci * @param      partition_count      The desired number of partitions in the block.
468cc1dc7a3Sopenharmony_ci * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
469cc1dc7a3Sopenharmony_ci *
470cc1dc7a3Sopenharmony_ci * @return The number of active partitionings in this selection.
471cc1dc7a3Sopenharmony_ci */
472cc1dc7a3Sopenharmony_cistatic unsigned int compute_kmeans_partition_ordering(
473cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
474cc1dc7a3Sopenharmony_ci	const image_block& blk,
475cc1dc7a3Sopenharmony_ci	unsigned int partition_count,
476cc1dc7a3Sopenharmony_ci	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
477cc1dc7a3Sopenharmony_ci) {
478cc1dc7a3Sopenharmony_ci	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
479cc1dc7a3Sopenharmony_ci	uint8_t texel_partitions[BLOCK_MAX_TEXELS];
480cc1dc7a3Sopenharmony_ci
481cc1dc7a3Sopenharmony_ci	// Use three passes of k-means clustering to partition the block data
482cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < 3; i++)
483cc1dc7a3Sopenharmony_ci	{
484cc1dc7a3Sopenharmony_ci		if (i == 0)
485cc1dc7a3Sopenharmony_ci		{
486cc1dc7a3Sopenharmony_ci			kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
487cc1dc7a3Sopenharmony_ci		}
488cc1dc7a3Sopenharmony_ci		else
489cc1dc7a3Sopenharmony_ci		{
490cc1dc7a3Sopenharmony_ci			kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
491cc1dc7a3Sopenharmony_ci		}
492cc1dc7a3Sopenharmony_ci
493cc1dc7a3Sopenharmony_ci		kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
494cc1dc7a3Sopenharmony_ci	}
495cc1dc7a3Sopenharmony_ci
496cc1dc7a3Sopenharmony_ci	// Construct the block bitmaps of texel assignments to each partition
497cc1dc7a3Sopenharmony_ci	uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
498cc1dc7a3Sopenharmony_ci	unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
499cc1dc7a3Sopenharmony_ci	promise(texels_to_process > 0);
500cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < texels_to_process; i++)
501cc1dc7a3Sopenharmony_ci	{
502cc1dc7a3Sopenharmony_ci		unsigned int idx = bsd.kmeans_texels[i];
503cc1dc7a3Sopenharmony_ci		bitmaps[texel_partitions[idx]] |= 1ULL << i;
504cc1dc7a3Sopenharmony_ci	}
505cc1dc7a3Sopenharmony_ci
506cc1dc7a3Sopenharmony_ci	// Count the mismatch between the block and the format's partition tables
507cc1dc7a3Sopenharmony_ci	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
508cc1dc7a3Sopenharmony_ci	count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
509cc1dc7a3Sopenharmony_ci
510cc1dc7a3Sopenharmony_ci	// Sort the partitions based on the number of mismatched bits
511cc1dc7a3Sopenharmony_ci	return get_partition_ordering_by_mismatch_bits(
512cc1dc7a3Sopenharmony_ci	    texels_to_process,
513cc1dc7a3Sopenharmony_ci	    bsd.partitioning_count_selected[partition_count - 1],
514cc1dc7a3Sopenharmony_ci	    mismatch_counts, partition_ordering);
515cc1dc7a3Sopenharmony_ci}
516cc1dc7a3Sopenharmony_ci
517cc1dc7a3Sopenharmony_ci/**
518cc1dc7a3Sopenharmony_ci * @brief Insert a partitioning into an order list of results, sorted by error.
519cc1dc7a3Sopenharmony_ci *
520cc1dc7a3Sopenharmony_ci * @param      max_values      The max number of entries in the best result arrays.
521cc1dc7a3Sopenharmony_ci * @param      this_error      The error of the new entry.
522cc1dc7a3Sopenharmony_ci * @param      this_partition  The partition ID of the new entry.
523cc1dc7a3Sopenharmony_ci * @param[out] best_errors     The array of best error values.
524cc1dc7a3Sopenharmony_ci * @param[out] best_partitions The array of best partition values.
525cc1dc7a3Sopenharmony_ci */
526cc1dc7a3Sopenharmony_cistatic void insert_result(
527cc1dc7a3Sopenharmony_ci	unsigned int max_values,
528cc1dc7a3Sopenharmony_ci	float this_error,
529cc1dc7a3Sopenharmony_ci	unsigned int this_partition,
530cc1dc7a3Sopenharmony_ci	float* best_errors,
531cc1dc7a3Sopenharmony_ci	unsigned int* best_partitions)
532cc1dc7a3Sopenharmony_ci{
533cc1dc7a3Sopenharmony_ci	promise(max_values > 0);
534cc1dc7a3Sopenharmony_ci
535cc1dc7a3Sopenharmony_ci	// Don't bother searching if the current worst error beats the new error
536cc1dc7a3Sopenharmony_ci	if (this_error >= best_errors[max_values - 1])
537cc1dc7a3Sopenharmony_ci	{
538cc1dc7a3Sopenharmony_ci		return;
539cc1dc7a3Sopenharmony_ci	}
540cc1dc7a3Sopenharmony_ci
541cc1dc7a3Sopenharmony_ci	// Else insert into the list in error-order
542cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < max_values; i++)
543cc1dc7a3Sopenharmony_ci	{
544cc1dc7a3Sopenharmony_ci		// Existing result is better - move on ...
545cc1dc7a3Sopenharmony_ci		if (this_error > best_errors[i])
546cc1dc7a3Sopenharmony_ci		{
547cc1dc7a3Sopenharmony_ci			continue;
548cc1dc7a3Sopenharmony_ci		}
549cc1dc7a3Sopenharmony_ci
550cc1dc7a3Sopenharmony_ci		// Move existing results down one
551cc1dc7a3Sopenharmony_ci		for (unsigned int j = max_values - 1; j > i; j--)
552cc1dc7a3Sopenharmony_ci		{
553cc1dc7a3Sopenharmony_ci			best_errors[j] = best_errors[j - 1];
554cc1dc7a3Sopenharmony_ci			best_partitions[j] = best_partitions[j - 1];
555cc1dc7a3Sopenharmony_ci		}
556cc1dc7a3Sopenharmony_ci
557cc1dc7a3Sopenharmony_ci		// Insert new result
558cc1dc7a3Sopenharmony_ci		best_errors[i] = this_error;
559cc1dc7a3Sopenharmony_ci		best_partitions[i] = this_partition;
560cc1dc7a3Sopenharmony_ci		break;
561cc1dc7a3Sopenharmony_ci	}
562cc1dc7a3Sopenharmony_ci}
563cc1dc7a3Sopenharmony_ci
564cc1dc7a3Sopenharmony_ci/* See header for documentation. */
565cc1dc7a3Sopenharmony_ciunsigned int find_best_partition_candidates(
566cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
567cc1dc7a3Sopenharmony_ci	const image_block& blk,
568cc1dc7a3Sopenharmony_ci	unsigned int partition_count,
569cc1dc7a3Sopenharmony_ci	unsigned int partition_search_limit,
570cc1dc7a3Sopenharmony_ci	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
571cc1dc7a3Sopenharmony_ci	unsigned int requested_candidates
572cc1dc7a3Sopenharmony_ci) {
573cc1dc7a3Sopenharmony_ci	// Constant used to estimate quantization error for a given partitioning; the optimal value for
574cc1dc7a3Sopenharmony_ci	// this depends on bitrate. These values have been determined empirically.
575cc1dc7a3Sopenharmony_ci	unsigned int texels_per_block = bsd.texel_count;
576cc1dc7a3Sopenharmony_ci	float weight_imprecision_estim = 0.055f;
577cc1dc7a3Sopenharmony_ci	if (texels_per_block <= 20)
578cc1dc7a3Sopenharmony_ci	{
579cc1dc7a3Sopenharmony_ci		weight_imprecision_estim = 0.03f;
580cc1dc7a3Sopenharmony_ci	}
581cc1dc7a3Sopenharmony_ci	else if (texels_per_block <= 31)
582cc1dc7a3Sopenharmony_ci	{
583cc1dc7a3Sopenharmony_ci		weight_imprecision_estim = 0.04f;
584cc1dc7a3Sopenharmony_ci	}
585cc1dc7a3Sopenharmony_ci	else if (texels_per_block <= 41)
586cc1dc7a3Sopenharmony_ci	{
587cc1dc7a3Sopenharmony_ci		weight_imprecision_estim = 0.05f;
588cc1dc7a3Sopenharmony_ci	}
589cc1dc7a3Sopenharmony_ci
590cc1dc7a3Sopenharmony_ci	promise(partition_count > 0);
591cc1dc7a3Sopenharmony_ci	promise(partition_search_limit > 0);
592cc1dc7a3Sopenharmony_ci
593cc1dc7a3Sopenharmony_ci	weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
594cc1dc7a3Sopenharmony_ci
595cc1dc7a3Sopenharmony_ci	uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
596cc1dc7a3Sopenharmony_ci	unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
597cc1dc7a3Sopenharmony_ci	partition_search_limit = astc::min(partition_search_limit, sequence_len);
598cc1dc7a3Sopenharmony_ci	requested_candidates = astc::min(partition_search_limit, requested_candidates);
599cc1dc7a3Sopenharmony_ci
600cc1dc7a3Sopenharmony_ci	bool uses_alpha = !blk.is_constant_channel(3);
601cc1dc7a3Sopenharmony_ci
602cc1dc7a3Sopenharmony_ci	// Partitioning errors assuming uncorrelated-chrominance endpoints
603cc1dc7a3Sopenharmony_ci	float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
604cc1dc7a3Sopenharmony_ci	unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
605cc1dc7a3Sopenharmony_ci
606cc1dc7a3Sopenharmony_ci	// Partitioning errors assuming same-chrominance endpoints
607cc1dc7a3Sopenharmony_ci	float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
608cc1dc7a3Sopenharmony_ci	unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
609cc1dc7a3Sopenharmony_ci
610cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < requested_candidates; i++)
611cc1dc7a3Sopenharmony_ci	{
612cc1dc7a3Sopenharmony_ci		uncor_best_errors[i] = ERROR_CALC_DEFAULT;
613cc1dc7a3Sopenharmony_ci		samec_best_errors[i] = ERROR_CALC_DEFAULT;
614cc1dc7a3Sopenharmony_ci	}
615cc1dc7a3Sopenharmony_ci
616cc1dc7a3Sopenharmony_ci	if (uses_alpha)
617cc1dc7a3Sopenharmony_ci	{
618cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < partition_search_limit; i++)
619cc1dc7a3Sopenharmony_ci		{
620cc1dc7a3Sopenharmony_ci			unsigned int partition = partition_sequence[i];
621cc1dc7a3Sopenharmony_ci			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
622cc1dc7a3Sopenharmony_ci
623cc1dc7a3Sopenharmony_ci			// Compute weighting to give to each component in each partition
624cc1dc7a3Sopenharmony_ci			partition_metrics pms[BLOCK_MAX_PARTITIONS];
625cc1dc7a3Sopenharmony_ci
626cc1dc7a3Sopenharmony_ci			compute_avgs_and_dirs_4_comp(pi, blk, pms);
627cc1dc7a3Sopenharmony_ci
628cc1dc7a3Sopenharmony_ci			line4 uncor_lines[BLOCK_MAX_PARTITIONS];
629cc1dc7a3Sopenharmony_ci			line4 samec_lines[BLOCK_MAX_PARTITIONS];
630cc1dc7a3Sopenharmony_ci
631cc1dc7a3Sopenharmony_ci			processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
632cc1dc7a3Sopenharmony_ci			processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
633cc1dc7a3Sopenharmony_ci
634cc1dc7a3Sopenharmony_ci			float line_lengths[BLOCK_MAX_PARTITIONS];
635cc1dc7a3Sopenharmony_ci
636cc1dc7a3Sopenharmony_ci			for (unsigned int j = 0; j < partition_count; j++)
637cc1dc7a3Sopenharmony_ci			{
638cc1dc7a3Sopenharmony_ci				partition_metrics& pm = pms[j];
639cc1dc7a3Sopenharmony_ci
640cc1dc7a3Sopenharmony_ci				uncor_lines[j].a = pm.avg;
641cc1dc7a3Sopenharmony_ci				uncor_lines[j].b = normalize_safe(pm.dir, unit4());
642cc1dc7a3Sopenharmony_ci
643cc1dc7a3Sopenharmony_ci				uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
644cc1dc7a3Sopenharmony_ci				uncor_plines[j].bs = uncor_lines[j].b;
645cc1dc7a3Sopenharmony_ci
646cc1dc7a3Sopenharmony_ci				samec_lines[j].a = vfloat4::zero();
647cc1dc7a3Sopenharmony_ci				samec_lines[j].b = normalize_safe(pm.avg, unit4());
648cc1dc7a3Sopenharmony_ci
649cc1dc7a3Sopenharmony_ci				samec_plines[j].amod = vfloat4::zero();
650cc1dc7a3Sopenharmony_ci				samec_plines[j].bs = samec_lines[j].b;
651cc1dc7a3Sopenharmony_ci			}
652cc1dc7a3Sopenharmony_ci
653cc1dc7a3Sopenharmony_ci			float uncor_error = 0.0f;
654cc1dc7a3Sopenharmony_ci			float samec_error = 0.0f;
655cc1dc7a3Sopenharmony_ci
656cc1dc7a3Sopenharmony_ci			compute_error_squared_rgba(pi,
657cc1dc7a3Sopenharmony_ci			                           blk,
658cc1dc7a3Sopenharmony_ci			                           uncor_plines,
659cc1dc7a3Sopenharmony_ci			                           samec_plines,
660cc1dc7a3Sopenharmony_ci			                           line_lengths,
661cc1dc7a3Sopenharmony_ci			                           uncor_error,
662cc1dc7a3Sopenharmony_ci			                           samec_error);
663cc1dc7a3Sopenharmony_ci
664cc1dc7a3Sopenharmony_ci			// Compute an estimate of error introduced by weight quantization imprecision.
665cc1dc7a3Sopenharmony_ci			// This error is computed as follows, for each partition
666cc1dc7a3Sopenharmony_ci			//     1: compute the principal-axis vector (full length) in error-space
667cc1dc7a3Sopenharmony_ci			//     2: convert the principal-axis vector to regular RGB-space
668cc1dc7a3Sopenharmony_ci			//     3: scale the vector by a constant that estimates average quantization error
669cc1dc7a3Sopenharmony_ci			//     4: for each texel, square the vector, then do a dot-product with the texel's
670cc1dc7a3Sopenharmony_ci			//        error weight; sum up the results across all texels.
671cc1dc7a3Sopenharmony_ci			//     4(optimized): square the vector once, then do a dot-product with the average
672cc1dc7a3Sopenharmony_ci			//        texel error, then multiply by the number of texels.
673cc1dc7a3Sopenharmony_ci
674cc1dc7a3Sopenharmony_ci			for (unsigned int j = 0; j < partition_count; j++)
675cc1dc7a3Sopenharmony_ci			{
676cc1dc7a3Sopenharmony_ci				float tpp = static_cast<float>(pi.partition_texel_count[j]);
677cc1dc7a3Sopenharmony_ci				vfloat4 error_weights(tpp * weight_imprecision_estim);
678cc1dc7a3Sopenharmony_ci
679cc1dc7a3Sopenharmony_ci				vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
680cc1dc7a3Sopenharmony_ci				vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
681cc1dc7a3Sopenharmony_ci
682cc1dc7a3Sopenharmony_ci				uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
683cc1dc7a3Sopenharmony_ci				samec_error += dot_s(samec_vector * samec_vector, error_weights);
684cc1dc7a3Sopenharmony_ci			}
685cc1dc7a3Sopenharmony_ci
686cc1dc7a3Sopenharmony_ci			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
687cc1dc7a3Sopenharmony_ci			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
688cc1dc7a3Sopenharmony_ci		}
689cc1dc7a3Sopenharmony_ci	}
690cc1dc7a3Sopenharmony_ci	else
691cc1dc7a3Sopenharmony_ci	{
692cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < partition_search_limit; i++)
693cc1dc7a3Sopenharmony_ci		{
694cc1dc7a3Sopenharmony_ci			unsigned int partition = partition_sequence[i];
695cc1dc7a3Sopenharmony_ci			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
696cc1dc7a3Sopenharmony_ci
697cc1dc7a3Sopenharmony_ci			// Compute weighting to give to each component in each partition
698cc1dc7a3Sopenharmony_ci			partition_metrics pms[BLOCK_MAX_PARTITIONS];
699cc1dc7a3Sopenharmony_ci			compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
700cc1dc7a3Sopenharmony_ci
701cc1dc7a3Sopenharmony_ci			partition_lines3 plines[BLOCK_MAX_PARTITIONS];
702cc1dc7a3Sopenharmony_ci
703cc1dc7a3Sopenharmony_ci			for (unsigned int j = 0; j < partition_count; j++)
704cc1dc7a3Sopenharmony_ci			{
705cc1dc7a3Sopenharmony_ci				partition_metrics& pm = pms[j];
706cc1dc7a3Sopenharmony_ci				partition_lines3& pl = plines[j];
707cc1dc7a3Sopenharmony_ci
708cc1dc7a3Sopenharmony_ci				pl.uncor_line.a = pm.avg;
709cc1dc7a3Sopenharmony_ci				pl.uncor_line.b = normalize_safe(pm.dir, unit3());
710cc1dc7a3Sopenharmony_ci
711cc1dc7a3Sopenharmony_ci				pl.samec_line.a = vfloat4::zero();
712cc1dc7a3Sopenharmony_ci				pl.samec_line.b = normalize_safe(pm.avg, unit3());
713cc1dc7a3Sopenharmony_ci
714cc1dc7a3Sopenharmony_ci				pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
715cc1dc7a3Sopenharmony_ci				pl.uncor_pline.bs   = pl.uncor_line.b;
716cc1dc7a3Sopenharmony_ci
717cc1dc7a3Sopenharmony_ci				pl.samec_pline.amod = vfloat4::zero();
718cc1dc7a3Sopenharmony_ci				pl.samec_pline.bs   = pl.samec_line.b;
719cc1dc7a3Sopenharmony_ci			}
720cc1dc7a3Sopenharmony_ci
721cc1dc7a3Sopenharmony_ci			float uncor_error = 0.0f;
722cc1dc7a3Sopenharmony_ci			float samec_error = 0.0f;
723cc1dc7a3Sopenharmony_ci
724cc1dc7a3Sopenharmony_ci			compute_error_squared_rgb(pi,
725cc1dc7a3Sopenharmony_ci			                          blk,
726cc1dc7a3Sopenharmony_ci			                          plines,
727cc1dc7a3Sopenharmony_ci			                          uncor_error,
728cc1dc7a3Sopenharmony_ci			                          samec_error);
729cc1dc7a3Sopenharmony_ci
730cc1dc7a3Sopenharmony_ci			// Compute an estimate of error introduced by weight quantization imprecision.
731cc1dc7a3Sopenharmony_ci			// This error is computed as follows, for each partition
732cc1dc7a3Sopenharmony_ci			//     1: compute the principal-axis vector (full length) in error-space
733cc1dc7a3Sopenharmony_ci			//     2: convert the principal-axis vector to regular RGB-space
734cc1dc7a3Sopenharmony_ci			//     3: scale the vector by a constant that estimates average quantization error
735cc1dc7a3Sopenharmony_ci			//     4: for each texel, square the vector, then do a dot-product with the texel's
736cc1dc7a3Sopenharmony_ci			//        error weight; sum up the results across all texels.
737cc1dc7a3Sopenharmony_ci			//     4(optimized): square the vector once, then do a dot-product with the average
738cc1dc7a3Sopenharmony_ci			//        texel error, then multiply by the number of texels.
739cc1dc7a3Sopenharmony_ci
740cc1dc7a3Sopenharmony_ci			for (unsigned int j = 0; j < partition_count; j++)
741cc1dc7a3Sopenharmony_ci			{
742cc1dc7a3Sopenharmony_ci				partition_lines3& pl = plines[j];
743cc1dc7a3Sopenharmony_ci
744cc1dc7a3Sopenharmony_ci				float tpp = static_cast<float>(pi.partition_texel_count[j]);
745cc1dc7a3Sopenharmony_ci				vfloat4 error_weights(tpp * weight_imprecision_estim);
746cc1dc7a3Sopenharmony_ci
747cc1dc7a3Sopenharmony_ci				vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
748cc1dc7a3Sopenharmony_ci				vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
749cc1dc7a3Sopenharmony_ci
750cc1dc7a3Sopenharmony_ci				uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
751cc1dc7a3Sopenharmony_ci				samec_error += dot3_s(samec_vector * samec_vector, error_weights);
752cc1dc7a3Sopenharmony_ci			}
753cc1dc7a3Sopenharmony_ci
754cc1dc7a3Sopenharmony_ci			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
755cc1dc7a3Sopenharmony_ci			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
756cc1dc7a3Sopenharmony_ci		}
757cc1dc7a3Sopenharmony_ci	}
758cc1dc7a3Sopenharmony_ci
759cc1dc7a3Sopenharmony_ci	unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
760cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < requested_candidates; i++)
761cc1dc7a3Sopenharmony_ci	{
762cc1dc7a3Sopenharmony_ci		interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
763cc1dc7a3Sopenharmony_ci		interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
764cc1dc7a3Sopenharmony_ci	}
765cc1dc7a3Sopenharmony_ci
766cc1dc7a3Sopenharmony_ci	uint64_t bitmasks[1024/64] { 0 };
767cc1dc7a3Sopenharmony_ci	unsigned int emitted = 0;
768cc1dc7a3Sopenharmony_ci
769cc1dc7a3Sopenharmony_ci	// Deduplicate the first "requested" entries
770cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < requested_candidates * 2;  i++)
771cc1dc7a3Sopenharmony_ci	{
772cc1dc7a3Sopenharmony_ci		unsigned int partition = interleave[i];
773cc1dc7a3Sopenharmony_ci
774cc1dc7a3Sopenharmony_ci		unsigned int word = partition / 64;
775cc1dc7a3Sopenharmony_ci		unsigned int bit = partition % 64;
776cc1dc7a3Sopenharmony_ci
777cc1dc7a3Sopenharmony_ci		bool written = bitmasks[word] & (1ull << bit);
778cc1dc7a3Sopenharmony_ci
779cc1dc7a3Sopenharmony_ci		if (!written)
780cc1dc7a3Sopenharmony_ci		{
781cc1dc7a3Sopenharmony_ci			best_partitions[emitted] = partition;
782cc1dc7a3Sopenharmony_ci			bitmasks[word] |= 1ull << bit;
783cc1dc7a3Sopenharmony_ci			emitted++;
784cc1dc7a3Sopenharmony_ci
785cc1dc7a3Sopenharmony_ci			if (emitted == requested_candidates)
786cc1dc7a3Sopenharmony_ci			{
787cc1dc7a3Sopenharmony_ci				break;
788cc1dc7a3Sopenharmony_ci			}
789cc1dc7a3Sopenharmony_ci		}
790cc1dc7a3Sopenharmony_ci	}
791cc1dc7a3Sopenharmony_ci
792cc1dc7a3Sopenharmony_ci	return emitted;
793cc1dc7a3Sopenharmony_ci}
794cc1dc7a3Sopenharmony_ci
795cc1dc7a3Sopenharmony_ci#endif
796