1// SPDX-License-Identifier: Apache-2.0
2// ----------------------------------------------------------------------------
3// Copyright 2011-2024 Arm Limited
4//
5// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6// use this file except in compliance with the License. You may obtain a copy
7// of the License at:
8//
9//     http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14// License for the specific language governing permissions and limitations
15// under the License.
16// ----------------------------------------------------------------------------
17
18#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20/**
21 * @brief Functions to compress a symbolic block.
22 */
23
24#include "astcenc_internal.h"
25#include "astcenc_diagnostic_trace.h"
26
27#include <cassert>
28#ifdef ASTC_CUSTOMIZED_ENABLE
29AstcCustomizedSoManager g_astcCustomizedSoManager;
30#endif
31
32/**
33 * @brief Merge two planes of endpoints into a single vector.
34 *
35 * @param      ep_plane1          The endpoints for plane 1.
36 * @param      ep_plane2          The endpoints for plane 2.
37 * @param      component_plane2   The color component for plane 2.
38 * @param[out] result             The merged output.
39 */
40static void merge_endpoints(
41	const endpoints& ep_plane1,
42	const endpoints& ep_plane2,
43	unsigned int component_plane2,
44	endpoints& result
45) {
46	unsigned int partition_count = ep_plane1.partition_count;
47	assert(partition_count == 1);
48
49	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
50
51	result.partition_count = partition_count;
52	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
53	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
54}
55
56/**
57 * @brief Attempt to improve weights given a chosen configuration.
58 *
59 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
60 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
61 * down by one quantization step.
62 *
63 * This is a specialized function which only supports operating on undecimated weight grids,
64 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
65 * is needed less often.
66 *
67 * @param      decode_mode   The decode mode (LDR, HDR).
68 * @param      bsd           The block size information.
69 * @param      blk           The image block color data to compress.
70 * @param[out] scb           The symbolic compressed block output.
71 */
72#if ASTCENC_NEON != 0
73static bool realign_weights_undecimated(
74	astcenc_profile decode_mode,
75	const block_size_descriptor& bsd,
76	const image_block& blk,
77	symbolic_compressed_block& scb
78) {
79	// Get the partition descriptor
80	unsigned int partition_count = scb.partition_count;
81	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
82
83	// Get the quantization table
84	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
85	unsigned int weight_quant_level = bm.quant_mode;
86	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
87
88	unsigned int max_plane = bm.is_dual_plane;
89	int plane2_component = scb.plane2_component;
90	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
91
92	// Decode the color endpoints
93	bool rgb_hdr;
94	bool alpha_hdr;
95	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
96	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
97	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
98	vfloat4 offset[BLOCK_MAX_PARTITIONS];
99
100	promise(partition_count > 0);
101
102	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
103	{
104		unpack_color_endpoints(decode_mode,
105		                       scb.color_formats[pa_idx],
106		                       scb.color_values[pa_idx],
107		                       rgb_hdr, alpha_hdr,
108		                       endpnt0[pa_idx],
109		                       endpnt1[pa_idx]);
110	}
111
112	uint8_t* dec_weights_uquant = scb.weights;
113	bool adjustments = false;
114
115	// For each plane and partition ...
116	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
117	{
118		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
119		{
120			// Compute the endpoint delta for all components in current plane
121			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
122			epd = select(epd, vint4::zero(), plane_mask);
123
124			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
125			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
126		}
127
128		// For each weight compute previous, current, and next errors
129		promise(bsd.texel_count > 0);
130
131		unsigned int texel = 0;
132		for (; texel + ASTCENC_SIMD_WIDTH <= bsd.texel_count; texel += ASTCENC_SIMD_WIDTH)
133		{
134			int uqw0 = dec_weights_uquant[texel];
135			int uqw1 = dec_weights_uquant[texel + 1];
136			int uqw2 = dec_weights_uquant[texel + 2];
137			int uqw3 = dec_weights_uquant[texel + 3];
138
139			vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3);
140			vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1],
141							qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]);
142
143			vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF);
144			vint4 uqw_down_vec = prev_and_next_vec & mask;
145			vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask;
146
147			vfloat4 weight_base_vec = int_to_float(uqw_vec);
148			vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec;
149			vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec;
150
151			unsigned int partition0 = pi.partition_of_texel[texel];
152			unsigned int partition1 = pi.partition_of_texel[texel + 1];
153			unsigned int partition2 = pi.partition_of_texel[texel + 2];
154			unsigned int partition3 = pi.partition_of_texel[texel + 3];
155
156			vfloat4 color_offset0 = offset[partition0];
157			vfloat4 color_offset1 = offset[partition1];
158			vfloat4 color_offset2 = offset[partition2];
159			vfloat4 color_offset3 = offset[partition3];
160
161			vfloat4 color_base0 = endpnt0f[partition0];
162			vfloat4 color_base1 = endpnt0f[partition1];
163			vfloat4 color_base2 = endpnt0f[partition2];
164			vfloat4 color_base3 = endpnt0f[partition3];
165
166			vfloat4 color0 = color_base0 + color_offset0 * weight_base_vec.lane<0>();
167			vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>();
168			vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>();
169			vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>();
170
171			vfloat4 orig_color0 = blk.texel(texel);
172			vfloat4 orig_color1 = blk.texel(texel + 1);
173			vfloat4 orig_color2 = blk.texel(texel + 2);
174			vfloat4 orig_color3 = blk.texel(texel + 3);
175
176			vfloat4 error_weight = blk.channel_weight;
177
178			vfloat4 color_diff0 = color0 - orig_color0;
179			vfloat4 color_diff1 = color1 - orig_color1;
180			vfloat4 color_diff2 = color2 - orig_color2;
181			vfloat4 color_diff3 = color3 - orig_color3;
182
183			vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>();
184			vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>();
185			vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>();
186			vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>();
187
188			vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>();
189			vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>();
190			vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>();
191			vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>();
192
193			float error_base0 = dot_s(color_diff0 * color_diff0, error_weight);
194			float error_base1 = dot_s(color_diff1 * color_diff1, error_weight);
195			float error_base2 = dot_s(color_diff2 * color_diff2, error_weight);
196			float error_base3 = dot_s(color_diff3 * color_diff3, error_weight);
197
198			float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight);
199			float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight);
200			float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight);
201			float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight);
202
203			float error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight);
204			float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight);
205			float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight);
206			float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight);
207
208			vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3);
209			vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3);
210			vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3);
211
212			vmask4 check_result_up = (error_up_vec < error_base_vec) &
213			        (error_up_vec < error_down_vec) & (uqw_vec < vint4(64));
214
215			vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero());
216			check_result_down = check_result_down & (~check_result_up);
217
218			if (popcount(check_result_up | check_result_down) != 0)
219			{
220				uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up);
221				uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down);
222
223				dec_weights_uquant[texel] = uqw_vec.lane<0>();
224				dec_weights_uquant[texel + 1] = uqw_vec.lane<1>();
225				dec_weights_uquant[texel + 2] = uqw_vec.lane<2>();    // channel 2
226				dec_weights_uquant[texel + 3] = uqw_vec.lane<3>();    // channel 3
227				adjustments = true;
228			}
229		};
230
231		for (; texel < bsd.texel_count; texel++)
232		{
233			int uqw = dec_weights_uquant[texel];
234
235			uint32_t prev_and_next = qat.prev_next_values[uqw];
236			int uqw_down = prev_and_next & 0xFF;
237			int uqw_up = (prev_and_next >> 8) & 0xFF;
238
239			// Interpolate the colors to create the diffs
240			float weight_base = static_cast<float>(uqw);
241			float weight_down = static_cast<float>(uqw_down - uqw);
242			float weight_up = static_cast<float>(uqw_up - uqw);
243
244			unsigned int partition = pi.partition_of_texel[texel];
245			vfloat4 color_offset = offset[partition];
246			vfloat4 color_base   = endpnt0f[partition];
247
248			vfloat4 color = color_base + color_offset * weight_base;
249			vfloat4 orig_color   = blk.texel(texel);
250			vfloat4 error_weight = blk.channel_weight;
251
252			vfloat4 color_diff      = color - orig_color;
253			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
254			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
255
256			float error_base = dot_s(color_diff      * color_diff,      error_weight);
257			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
258			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
259
260			// Check if the prev or next error is better, and if so use it
261			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
262			{
263				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
264				adjustments = true;
265			}
266			else if ((error_down < error_base) && (uqw > 0))
267			{
268				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
269				adjustments = true;
270			}
271		}
272
273		// Prepare iteration for plane 2
274		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
275		plane_mask = ~plane_mask;
276	}
277	return adjustments;
278}
279#else
280static bool realign_weights_undecimated(
281	astcenc_profile decode_mode,
282	const block_size_descriptor& bsd,
283	const image_block& blk,
284	symbolic_compressed_block& scb
285) {
286	// Get the partition descriptor
287	unsigned int partition_count = scb.partition_count;
288	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
289
290	// Get the quantization table
291	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
292	unsigned int weight_quant_level = bm.quant_mode;
293	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
294
295	unsigned int max_plane = bm.is_dual_plane;
296	int plane2_component = scb.plane2_component;
297	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
298
299	// Decode the color endpoints
300	bool rgb_hdr;
301	bool alpha_hdr;
302	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
303	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
304	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
305	vfloat4 offset[BLOCK_MAX_PARTITIONS];
306
307	promise(partition_count > 0);
308
309	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
310	{
311		unpack_color_endpoints(decode_mode,
312		                       scb.color_formats[pa_idx],
313		                       scb.color_values[pa_idx],
314		                       rgb_hdr, alpha_hdr,
315		                       endpnt0[pa_idx],
316		                       endpnt1[pa_idx]);
317	}
318
319	uint8_t* dec_weights_uquant = scb.weights;
320	bool adjustments = false;
321
322	// For each plane and partition ...
323	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
324	{
325		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
326		{
327			// Compute the endpoint delta for all components in current plane
328			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
329			epd = select(epd, vint4::zero(), plane_mask);
330
331			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
332			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
333		}
334
335		// For each weight compute previous, current, and next errors
336		promise(bsd.texel_count > 0);
337		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
338		{
339			int uqw = dec_weights_uquant[texel];
340
341			uint32_t prev_and_next = qat.prev_next_values[uqw];
342			int uqw_down = prev_and_next & 0xFF;
343			int uqw_up = (prev_and_next >> 8) & 0xFF;
344
345			// Interpolate the colors to create the diffs
346			float weight_base = static_cast<float>(uqw);
347			float weight_down = static_cast<float>(uqw_down - uqw);
348			float weight_up = static_cast<float>(uqw_up - uqw);
349
350			unsigned int partition = pi.partition_of_texel[texel];
351			vfloat4 color_offset = offset[partition];
352			vfloat4 color_base   = endpnt0f[partition];
353
354			vfloat4 color = color_base + color_offset * weight_base;
355			vfloat4 orig_color   = blk.texel(texel);
356			vfloat4 error_weight = blk.channel_weight;
357
358			vfloat4 color_diff      = color - orig_color;
359			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
360			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
361
362			float error_base = dot_s(color_diff      * color_diff,      error_weight);
363			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
364			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
365
366			// Check if the prev or next error is better, and if so use it
367			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
368			{
369				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
370				adjustments = true;
371			}
372			else if ((error_down < error_base) && (uqw > 0))
373			{
374				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
375				adjustments = true;
376			}
377		}
378
379		// Prepare iteration for plane 2
380		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
381		plane_mask = ~plane_mask;
382	}
383
384	return adjustments;
385}
386#endif
387
388/**
389 * @brief Attempt to improve weights given a chosen configuration.
390 *
391 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
392 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
393 * down by one quantization step.
394 *
395 * @param      decode_mode   The decode mode (LDR, HDR).
396 * @param      bsd           The block size information.
397 * @param      blk           The image block color data to compress.
398 * @param[out] scb           The symbolic compressed block output.
399 */
400static bool realign_weights_decimated(
401	astcenc_profile decode_mode,
402	const block_size_descriptor& bsd,
403	const image_block& blk,
404	symbolic_compressed_block& scb
405) {
406	// Get the partition descriptor
407	unsigned int partition_count = scb.partition_count;
408	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
409
410	// Get the quantization table
411	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
412	unsigned int weight_quant_level = bm.quant_mode;
413	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
414
415	// Get the decimation table
416	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
417	unsigned int weight_count = di.weight_count;
418	assert(weight_count != bsd.texel_count);
419
420	unsigned int max_plane = bm.is_dual_plane;
421	int plane2_component = scb.plane2_component;
422	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
423
424	// Decode the color endpoints
425	bool rgb_hdr;
426	bool alpha_hdr;
427	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
428	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
429	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
430	vfloat4 offset[BLOCK_MAX_PARTITIONS];
431
432	promise(partition_count > 0);
433	promise(weight_count > 0);
434
435	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
436	{
437		unpack_color_endpoints(decode_mode,
438		                       scb.color_formats[pa_idx],
439		                       scb.color_values[pa_idx],
440		                       rgb_hdr, alpha_hdr,
441		                       endpnt0[pa_idx],
442		                       endpnt1[pa_idx]);
443	}
444
445	uint8_t* dec_weights_uquant = scb.weights;
446	bool adjustments = false;
447
448	// For each plane and partition ...
449	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
450	{
451		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
452		{
453			// Compute the endpoint delta for all components in current plane
454			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
455			epd = select(epd, vint4::zero(), plane_mask);
456
457			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
458			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
459		}
460
461		// Create an unquantized weight grid for this decimation level
462		ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
463		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
464		{
465			vint unquant_value(dec_weights_uquant + we_idx);
466			vfloat unquant_valuef = int_to_float(unquant_value);
467			storea(unquant_valuef, uq_weightsf + we_idx);
468		}
469
470		// For each weight compute previous, current, and next errors
471		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
472		{
473			int uqw = dec_weights_uquant[we_idx];
474			uint32_t prev_and_next = qat.prev_next_values[uqw];
475
476			float uqw_base = uq_weightsf[we_idx];
477			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
478			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
479
480			float uqw_diff_down = uqw_down - uqw_base;
481			float uqw_diff_up = uqw_up - uqw_base;
482
483			vfloat4 error_basev = vfloat4::zero();
484			vfloat4 error_downv = vfloat4::zero();
485			vfloat4 error_upv = vfloat4::zero();
486
487			// Interpolate the colors to create the diffs
488			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
489			promise(texels_to_evaluate > 0);
490			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
491			{
492				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
493
494				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
495
496				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
497				                   + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
498					              + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
499				                   + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
500
501				// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
502				// float weight = astc::flt_rd(weight_base + 0.5f);
503				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
504				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
505				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
506				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
507
508				unsigned int partition = pi.partition_of_texel[texel];
509				vfloat4 color_offset = offset[partition];
510				vfloat4 color_base   = endpnt0f[partition];
511
512				vfloat4 color = color_base + color_offset * weight_base;
513				vfloat4 orig_color = blk.texel(texel);
514
515				vfloat4 color_diff      = color - orig_color;
516				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
517				vfloat4 color_up_diff   = color_diff + color_offset * weight_up;
518
519				error_basev += color_diff * color_diff;
520				error_downv += color_down_diff * color_down_diff;
521				error_upv   += color_up_diff * color_up_diff;
522			}
523
524			vfloat4 error_weight = blk.channel_weight;
525			float error_base = hadd_s(error_basev * error_weight);
526			float error_down = hadd_s(error_downv * error_weight);
527			float error_up   = hadd_s(error_upv   * error_weight);
528
529			// Check if the prev or next error is better, and if so use it
530			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
531			{
532				uq_weightsf[we_idx] = uqw_up;
533				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
534				adjustments = true;
535			}
536			else if ((error_down < error_base) && (uqw > 0))
537			{
538				uq_weightsf[we_idx] = uqw_down;
539				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
540				adjustments = true;
541			}
542		}
543
544		// Prepare iteration for plane 2
545		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
546		plane_mask = ~plane_mask;
547	}
548
549	return adjustments;
550}
551
552/**
553 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
554 *
555 * @param      config                    The compressor configuration.
556 * @param      bsd                       The block size information.
557 * @param      blk                       The image block color data to compress.
558 * @param      only_always               True if we only use "always" percentile block modes.
559 * @param      tune_errorval_threshold   The error value threshold.
560 * @param      partition_count           The partition count.
561 * @param      partition_index           The partition index if @c partition_count is 2-4.
562 * @param[out] scb                       The symbolic compressed block output.
563 * @param[out] tmpbuf                    The quantized weights for plane 1.
564 */
565static float compress_symbolic_block_for_partition_1plane(
566	QualityProfile privateProfile,
567	const astcenc_config& config,
568	const block_size_descriptor& bsd,
569	const image_block& blk,
570	bool only_always,
571	float tune_errorval_threshold,
572	unsigned int partition_count,
573	unsigned int partition_index,
574	symbolic_compressed_block& scb,
575	compression_working_buffers& tmpbuf,
576	int quant_limit
577) {
578	promise(partition_count > 0);
579	promise(config.tune_candidate_limit > 0);
580	promise(config.tune_refinement_limit > 0);
581
582	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
583
584	auto compute_difference = &compute_symbolic_block_difference_1plane;
585	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
586	{
587		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
588	}
589
590	const auto& pi = bsd.get_partition_info(partition_count, partition_index);
591
592	// Compute ideal weights and endpoint colors, with no quantization or decimation
593	endpoints_and_weights& ei = tmpbuf.ei1;
594	compute_ideal_colors_and_weights_1plane(blk, pi, ei);
595
596	// Compute ideal weights and endpoint colors for every decimation
597	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
598	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
599
600	// For each decimation mode, compute an ideal set of weights with no quantization
601	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
602	                                                : bsd.decimation_mode_count_selected;
603	promise(max_decimation_modes > 0);
604	for (unsigned int i = 0; i < max_decimation_modes; i++)
605	{
606		const auto& dm = bsd.get_decimation_mode(i);
607		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
608		{
609			continue;
610		}
611
612		const auto& di = bsd.get_decimation_info(i);
613
614		compute_ideal_weights_for_decimation(
615		    ei,
616		    di,
617		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
618	}
619
620	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
621	// weight pair, compute the smallest weight that will result in a color value greater than 1
622	vfloat4 min_ep(10.0f);
623	for (unsigned int i = 0; i < partition_count; i++)
624	{
625		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
626
627		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
628		min_ep = select(min_ep, ep, use_ep);
629	}
630
631	float min_wt_cutoff = hmin_s(min_ep);
632
633	// For each mode, use the angular method to compute a shift
634	compute_angular_endpoints_1plane(
635	    privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
636
637	float* weight_low_value = tmpbuf.weight_low_value1;
638	float* weight_high_value = tmpbuf.weight_high_value1;
639	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
640	float* qwt_errors = tmpbuf.qwt_errors;
641
642	// For each mode (which specifies a decimation and a quantization):
643	//     * Compute number of bits needed for the quantized weights
644	//     * Generate an optimized set of quantized weights
645	//     * Compute quantization errors for the mode
646
647
648	static const int8_t free_bits_for_partition_count[4] {
649		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
650	};
651
652	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
653	                                           : bsd.block_mode_count_1plane_selected;
654	promise(max_block_modes > 0);
655	for (unsigned int i = 0; i < max_block_modes; i++)
656	{
657		const block_mode& bm = bsd.block_modes[i];
658
659		if (bm.quant_mode > max_weight_quant)
660		{
661			qwt_errors[i] = 1e38f;
662			continue;
663		}
664
665		assert(!bm.is_dual_plane);
666		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
667		if (bitcount <= 0)
668		{
669			qwt_errors[i] = 1e38f;
670			continue;
671		}
672
673		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
674		{
675			weight_high_value[i] = 1.0f;
676		}
677
678		int decimation_mode = bm.decimation_mode;
679		const auto& di = bsd.get_decimation_info(decimation_mode);
680
681		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
682
683		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
684
685		// Generate the optimized set of weights for the weight mode
686		compute_quantized_weights_for_decimation(
687		    di,
688		    weight_low_value[i], weight_high_value[i],
689		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
690		    dec_weights_uquantf,
691		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
692		    bm.get_weight_quant_mode());
693
694		// Compute weight quantization errors for the block mode
695		qwt_errors[i] = compute_error_of_weight_set_1plane(
696		    ei,
697		    di,
698		    dec_weights_uquantf);
699	}
700
701	// Decide the optimal combination of color endpoint encodings and weight encodings
702	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
703	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
704
705	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
706	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
707
708	unsigned int candidate_count = compute_ideal_endpoint_formats(
709	    privateProfile,
710	    pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
711	    config.tune_candidate_limit, 0, max_block_modes,
712	    partition_format_specifiers, block_mode_index,
713	    color_quant_level, color_quant_level_mod, tmpbuf);
714
715	// Iterate over the N believed-to-be-best modes to find out which one is actually best
716	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
717	float best_errorval_in_scb = scb.errorval;
718
719	for (unsigned int i = 0; i < candidate_count; i++)
720	{
721		TRACE_NODE(node0, "candidate");
722
723		const int bm_packed_index = block_mode_index[i];
724		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
725		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
726
727		int decimation_mode = qw_bm.decimation_mode;
728		const auto& di = bsd.get_decimation_info(decimation_mode);
729		promise(di.weight_count > 0);
730
731		trace_add_data("weight_x", di.weight_x);
732		trace_add_data("weight_y", di.weight_y);
733		trace_add_data("weight_z", di.weight_z);
734		trace_add_data("weight_quant", qw_bm.quant_mode);
735
736		// Recompute the ideal color endpoints before storing them
737		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
738		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
739
740		symbolic_compressed_block workscb;
741		endpoints workep = ei.ep;
742
743		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
744
745		for (unsigned int j = 0; j < di.weight_count; j++)
746		{
747			workscb.weights[j] = u8_weight_src[j];
748		}
749
750		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
751		{
752			recompute_ideal_colors_1plane(
753			    blk, pi, di, workscb.weights,
754			    workep, rgbs_colors, rgbo_colors);
755
756			// Quantize the chosen color, tracking if worth trying the mod value
757			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
758			for (unsigned int j = 0; j < partition_count; j++)
759			{
760				workscb.color_formats[j] = pack_color_endpoints(
761				    privateProfile,
762				    workep.endpt0[j],
763				    workep.endpt1[j],
764				    rgbs_colors[j],
765				    rgbo_colors[j],
766				    partition_format_specifiers[i][j],
767				    workscb.color_values[j],
768				    color_quant_level[i]);
769
770				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
771			}
772
773			// If all the color endpoint modes are the same, we get a few more bits to store colors;
774			// let's see if we can take advantage of this: requantize all the colors and see if the
775			// endpoint modes remain the same.
776			workscb.color_formats_matched = 0;
777			if (partition_count >= 2 && all_same)
778			{
779				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
780				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
781				bool all_same_mod = true;
782				for (unsigned int j = 0; j < partition_count; j++)
783				{
784					color_formats_mod[j] = pack_color_endpoints(
785					    privateProfile,
786					    workep.endpt0[j],
787					    workep.endpt1[j],
788					    rgbs_colors[j],
789					    rgbo_colors[j],
790					    partition_format_specifiers[i][j],
791					    colorvals[j],
792					    color_quant_level_mod[i]);
793
794					// Early out as soon as it's no longer possible to use mod
795					if (color_formats_mod[j] != color_formats_mod[0])
796					{
797						all_same_mod = false;
798						break;
799					}
800				}
801
802				if (all_same_mod)
803				{
804					workscb.color_formats_matched = 1;
805					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
806					{
807						for (unsigned int k = 0; k < 8; k++)
808						{
809							workscb.color_values[j][k] = colorvals[j][k];
810						}
811
812						workscb.color_formats[j] = color_formats_mod[j];
813					}
814				}
815			}
816
817			// Store header fields
818			workscb.partition_count = static_cast<uint8_t>(partition_count);
819			workscb.partition_index = static_cast<uint16_t>(partition_index);
820			workscb.plane2_component = -1;
821			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
822			workscb.block_mode = qw_bm.mode_index;
823			workscb.block_type = SYM_BTYPE_NONCONST;
824			if (privateProfile == HIGH_SPEED_PROFILE)
825			{
826				workscb.errorval = 0;
827				scb = workscb;
828				break;
829			}
830			// Pre-realign test
831			if (l == 0)
832			{
833				float errorval = compute_difference(config, bsd, workscb, blk);
834				if (errorval == -ERROR_CALC_DEFAULT)
835				{
836					errorval = -errorval;
837					workscb.block_type = SYM_BTYPE_ERROR;
838				}
839
840				trace_add_data("error_prerealign", errorval);
841				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
842
843				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
844				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
845				// drive a heuristic to skip blocks that are unlikely to catch up with the best
846				// block we have already.
847				unsigned int iters_remaining = config.tune_refinement_limit - l;
848				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
849				if (errorval > (threshold * best_errorval_in_scb))
850				{
851					break;
852				}
853
854				if (errorval < best_errorval_in_scb)
855				{
856					best_errorval_in_scb = errorval;
857					workscb.errorval = errorval;
858					scb = workscb;
859
860					if (errorval < tune_errorval_threshold)
861					{
862						// Skip remaining candidates - this is "good enough"
863						i = candidate_count;
864						break;
865					}
866				}
867			}
868
869			bool adjustments;
870			if (di.weight_count != bsd.texel_count)
871			{
872				adjustments = realign_weights_decimated(
873					config.profile, bsd, blk, workscb);
874			}
875			else
876			{
877				adjustments = realign_weights_undecimated(
878					config.profile, bsd, blk, workscb);
879			}
880
881			// Post-realign test
882			float errorval = compute_difference(config, bsd, workscb, blk);
883			if (errorval == -ERROR_CALC_DEFAULT)
884			{
885				errorval = -errorval;
886				workscb.block_type = SYM_BTYPE_ERROR;
887			}
888
889			trace_add_data("error_postrealign", errorval);
890			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
891
892			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
893			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
894			// give benefit of the doubt ...
895			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
896			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
897			if (errorval > (threshold * best_errorval_in_scb))
898			{
899				break;
900			}
901
902			if (errorval < best_errorval_in_scb)
903			{
904				best_errorval_in_scb = errorval;
905				workscb.errorval = errorval;
906				scb = workscb;
907
908				if (errorval < tune_errorval_threshold)
909				{
910					// Skip remaining candidates - this is "good enough"
911					i = candidate_count;
912					break;
913				}
914			}
915
916			if (!adjustments)
917			{
918				break;
919			}
920		}
921	}
922
923	return best_errorval_in_mode;
924}
925
926/**
927 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
928 *
929 * @param      config                    The compressor configuration.
930 * @param      bsd                       The block size information.
931 * @param      blk                       The image block color data to compress.
932 * @param      tune_errorval_threshold   The error value threshold.
933 * @param      plane2_component          The component index for the second plane of weights.
934 * @param[out] scb                       The symbolic compressed block output.
935 * @param[out] tmpbuf                    The quantized weights for plane 1.
936 */
937static float compress_symbolic_block_for_partition_2planes(
938	QualityProfile privateProfile,
939	const astcenc_config& config,
940	const block_size_descriptor& bsd,
941	const image_block& blk,
942	float tune_errorval_threshold,
943	unsigned int plane2_component,
944	symbolic_compressed_block& scb,
945	compression_working_buffers& tmpbuf,
946	int quant_limit
947) {
948	promise(config.tune_candidate_limit > 0);
949	promise(config.tune_refinement_limit > 0);
950	promise(bsd.decimation_mode_count_selected > 0);
951
952	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
953
954	// Compute ideal weights and endpoint colors, with no quantization or decimation
955	endpoints_and_weights& ei1 = tmpbuf.ei1;
956	endpoints_and_weights& ei2 = tmpbuf.ei2;
957
958	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
959
960	// Compute ideal weights and endpoint colors for every decimation
961	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
962	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
963
964	// For each decimation mode, compute an ideal set of weights with no quantization
965	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
966	{
967		const auto& dm = bsd.get_decimation_mode(i);
968		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
969		{
970			continue;
971		}
972
973		const auto& di = bsd.get_decimation_info(i);
974
975		compute_ideal_weights_for_decimation(
976		    ei1,
977		    di,
978		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
979
980		compute_ideal_weights_for_decimation(
981		    ei2,
982		    di,
983		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
984	}
985
986	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
987	// weight pair, compute the smallest weight that will result in a color value greater than 1
988	vfloat4 min_ep1(10.0f);
989	vfloat4 min_ep2(10.0f);
990
991	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
992	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
993	min_ep1 = select(min_ep1, ep1, use_ep1);
994
995	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
996	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
997	min_ep2 = select(min_ep2, ep2, use_ep2);
998
999	vfloat4 err_max(ERROR_CALC_DEFAULT);
1000	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
1001
1002	// Set the plane2 component to max error in ep1
1003	min_ep1 = select(min_ep1, err_max, err_mask);
1004
1005	float min_wt_cutoff1 = hmin_s(min_ep1);
1006
1007	// Set the minwt2 to the plane2 component min in ep2
1008	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
1009
1010	compute_angular_endpoints_2planes(
1011	    privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
1012
1013	// For each mode (which specifies a decimation and a quantization):
1014	//     * Compute number of bits needed for the quantized weights
1015	//     * Generate an optimized set of quantized weights
1016	//     * Compute quantization errors for the mode
1017
1018	float* weight_low_value1 = tmpbuf.weight_low_value1;
1019	float* weight_high_value1 = tmpbuf.weight_high_value1;
1020	float* weight_low_value2 = tmpbuf.weight_low_value2;
1021	float* weight_high_value2 = tmpbuf.weight_high_value2;
1022
1023	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
1024	float* qwt_errors = tmpbuf.qwt_errors;
1025
1026	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
1027	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
1028
1029	for (unsigned int i = start_2plane; i < end_2plane; i++)
1030	{
1031		const block_mode& bm = bsd.block_modes[i];
1032		assert(bm.is_dual_plane);
1033
1034		if (bm.quant_mode > max_weight_quant)
1035		{
1036			qwt_errors[i] = 1e38f;
1037			continue;
1038		}
1039
1040		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
1041
1042		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
1043		{
1044			weight_high_value1[i] = 1.0f;
1045		}
1046
1047		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
1048		{
1049			weight_high_value2[i] = 1.0f;
1050		}
1051
1052		unsigned int decimation_mode = bm.decimation_mode;
1053		const auto& di = bsd.get_decimation_info(decimation_mode);
1054
1055		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
1056
1057		// Generate the optimized set of weights for the mode
1058		compute_quantized_weights_for_decimation(
1059		    di,
1060		    weight_low_value1[i],
1061		    weight_high_value1[i],
1062		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
1063		    dec_weights_uquantf,
1064		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
1065		    bm.get_weight_quant_mode());
1066
1067		compute_quantized_weights_for_decimation(
1068		    di,
1069		    weight_low_value2[i],
1070		    weight_high_value2[i],
1071		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
1072		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
1073		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
1074		    bm.get_weight_quant_mode());
1075
1076		// Compute weight quantization errors for the block mode
1077		qwt_errors[i] = compute_error_of_weight_set_2planes(
1078		    ei1,
1079		    ei2,
1080		    di,
1081		    dec_weights_uquantf,
1082		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
1083	}
1084
1085	// Decide the optimal combination of color endpoint encodings and weight encodings
1086	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
1087	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
1088
1089	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
1090	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
1091
1092	endpoints epm;
1093	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
1094
1095	const auto& pi = bsd.get_partition_info(1, 0);
1096	unsigned int candidate_count = compute_ideal_endpoint_formats(
1097	    config.privateProfile,
1098	    pi, blk, epm, qwt_bitcounts, qwt_errors,
1099	    config.tune_candidate_limit,
1100		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
1101	    partition_format_specifiers, block_mode_index,
1102	    color_quant_level, color_quant_level_mod, tmpbuf);
1103
1104	// Iterate over the N believed-to-be-best modes to find out which one is actually best
1105	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
1106	float best_errorval_in_scb = scb.errorval;
1107
1108	for (unsigned int i = 0; i < candidate_count; i++)
1109	{
1110		TRACE_NODE(node0, "candidate");
1111
1112		const int bm_packed_index = block_mode_index[i];
1113		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
1114		       bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
1115		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
1116
1117		int decimation_mode = qw_bm.decimation_mode;
1118		const auto& di = bsd.get_decimation_info(decimation_mode);
1119		promise(di.weight_count > 0);
1120
1121		trace_add_data("weight_x", di.weight_x);
1122		trace_add_data("weight_y", di.weight_y);
1123		trace_add_data("weight_z", di.weight_z);
1124		trace_add_data("weight_quant", qw_bm.quant_mode);
1125
1126		vfloat4 rgbs_color;
1127		vfloat4 rgbo_color;
1128
1129		symbolic_compressed_block workscb;
1130		endpoints workep = epm;
1131
1132		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
1133		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
1134
1135		for (int j = 0; j < di.weight_count; j++)
1136		{
1137			workscb.weights[j] = u8_weight1_src[j];
1138			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
1139		}
1140
1141		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
1142		{
1143			recompute_ideal_colors_2planes(
1144			    blk, bsd, di,
1145			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
1146			    workep, rgbs_color, rgbo_color, plane2_component);
1147
1148			// Quantize the chosen color
1149			workscb.color_formats[0] = pack_color_endpoints(
1150			                               privateProfile,
1151			                               workep.endpt0[0],
1152			                               workep.endpt1[0],
1153			                               rgbs_color, rgbo_color,
1154			                               partition_format_specifiers[i][0],
1155			                               workscb.color_values[0],
1156			                               color_quant_level[i]);
1157
1158			// Store header fields
1159			workscb.partition_count = 1;
1160			workscb.partition_index = 0;
1161			workscb.quant_mode = color_quant_level[i];
1162			workscb.color_formats_matched = 0;
1163			workscb.block_mode = qw_bm.mode_index;
1164			workscb.plane2_component = static_cast<int8_t>(plane2_component);
1165			workscb.block_type = SYM_BTYPE_NONCONST;
1166
1167			// Pre-realign test
1168			if (l == 0)
1169			{
1170				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1171				if (errorval == -ERROR_CALC_DEFAULT)
1172				{
1173					errorval = -errorval;
1174					workscb.block_type = SYM_BTYPE_ERROR;
1175				}
1176
1177				trace_add_data("error_prerealign", errorval);
1178				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1179
1180				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
1181				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
1182				// drive a heuristic to skip blocks that are unlikely to catch up with the best
1183				// block we have already.
1184				unsigned int iters_remaining = config.tune_refinement_limit - l;
1185				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
1186				if (errorval > (threshold * best_errorval_in_scb))
1187				{
1188					break;
1189				}
1190
1191				if (errorval < best_errorval_in_scb)
1192				{
1193					best_errorval_in_scb = errorval;
1194					workscb.errorval = errorval;
1195					scb = workscb;
1196
1197					if (errorval < tune_errorval_threshold)
1198					{
1199						// Skip remaining candidates - this is "good enough"
1200						i = candidate_count;
1201						break;
1202					}
1203				}
1204			}
1205
1206			// Perform a final pass over the weights to try to improve them.
1207			bool adjustments;
1208			if (di.weight_count != bsd.texel_count)
1209			{
1210				adjustments = realign_weights_decimated(
1211					config.profile, bsd, blk, workscb);
1212			}
1213			else
1214			{
1215				adjustments = realign_weights_undecimated(
1216					config.profile, bsd, blk, workscb);
1217			}
1218
1219			// Post-realign test
1220			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1221			if (errorval == -ERROR_CALC_DEFAULT)
1222			{
1223				errorval = -errorval;
1224				workscb.block_type = SYM_BTYPE_ERROR;
1225			}
1226
1227			trace_add_data("error_postrealign", errorval);
1228			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1229
1230			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
1231			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1232			// give benefit of the doubt ...
1233			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1234			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1235			if (errorval > (threshold * best_errorval_in_scb))
1236			{
1237				break;
1238			}
1239
1240			if (errorval < best_errorval_in_scb)
1241			{
1242				best_errorval_in_scb = errorval;
1243				workscb.errorval = errorval;
1244				scb = workscb;
1245
1246				if (errorval < tune_errorval_threshold)
1247				{
1248					// Skip remaining candidates - this is "good enough"
1249					i = candidate_count;
1250					break;
1251				}
1252			}
1253
1254			if (!adjustments)
1255			{
1256				break;
1257			}
1258		}
1259	}
1260
1261	return best_errorval_in_mode;
1262}
1263
1264/**
1265 * @brief Determine the lowest cross-channel correlation factor.
1266 *
1267 * @param texels_per_block   The number of texels in a block.
1268 * @param blk                The image block color data to compress.
1269 *
1270 * @return Return the lowest correlation factor.
1271 */
1272static float prepare_block_statistics(
1273	int texels_per_block,
1274	const image_block& blk
1275) {
1276	// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1277	// of the matrix. The matrix is symmetric, so this is all we need for this use case.
1278	float rs = 0.0f;
1279	float gs = 0.0f;
1280	float bs = 0.0f;
1281	float as = 0.0f;
1282	float rr_var = 0.0f;
1283	float gg_var = 0.0f;
1284	float bb_var = 0.0f;
1285	float aa_var = 0.0f;
1286	float rg_cov = 0.0f;
1287	float rb_cov = 0.0f;
1288	float ra_cov = 0.0f;
1289	float gb_cov = 0.0f;
1290	float ga_cov = 0.0f;
1291	float ba_cov = 0.0f;
1292
1293	float weight_sum = 0.0f;
1294
1295	promise(texels_per_block > 0);
1296	for (int i = 0; i < texels_per_block; i++)
1297	{
1298		float weight = hadd_s(blk.channel_weight) / 4.0f;
1299		assert(weight >= 0.0f);
1300		weight_sum += weight;
1301
1302		float r = blk.data_r[i];
1303		float g = blk.data_g[i];
1304		float b = blk.data_b[i];
1305		float a = blk.data_a[i];
1306
1307		float rw = r * weight;
1308		rs += rw;
1309		rr_var += r * rw;
1310		rg_cov += g * rw;
1311		rb_cov += b * rw;
1312		ra_cov += a * rw;
1313
1314		float gw = g * weight;
1315		gs += gw;
1316		gg_var += g * gw;
1317		gb_cov += b * gw;
1318		ga_cov += a * gw;
1319
1320		float bw = b * weight;
1321		bs += bw;
1322		bb_var += b * bw;
1323		ba_cov += a * bw;
1324
1325		float aw = a * weight;
1326		as += aw;
1327		aa_var += a * aw;
1328	}
1329
1330	float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1331
1332	rr_var -= rs * (rs * rpt);
1333	rg_cov -= gs * (rs * rpt);
1334	rb_cov -= bs * (rs * rpt);
1335	ra_cov -= as * (rs * rpt);
1336
1337	gg_var -= gs * (gs * rpt);
1338	gb_cov -= bs * (gs * rpt);
1339	ga_cov -= as * (gs * rpt);
1340
1341	bb_var -= bs * (bs * rpt);
1342	ba_cov -= as * (bs * rpt);
1343
1344	aa_var -= as * (as * rpt);
1345
1346	// These will give a NaN if a channel is constant - these are fixed up in the next step
1347	rg_cov *= astc::rsqrt(rr_var * gg_var);
1348	rb_cov *= astc::rsqrt(rr_var * bb_var);
1349	ra_cov *= astc::rsqrt(rr_var * aa_var);
1350	gb_cov *= astc::rsqrt(gg_var * bb_var);
1351	ga_cov *= astc::rsqrt(gg_var * aa_var);
1352	ba_cov *= astc::rsqrt(bb_var * aa_var);
1353
1354	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1355	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1356	if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1357	if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1358	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1359	if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1360
1361	float lowest_correlation = astc::min(fabsf(rg_cov),      fabsf(rb_cov));
1362	lowest_correlation       = astc::min(lowest_correlation, fabsf(ra_cov));
1363	lowest_correlation       = astc::min(lowest_correlation, fabsf(gb_cov));
1364	lowest_correlation       = astc::min(lowest_correlation, fabsf(ga_cov));
1365	lowest_correlation       = astc::min(lowest_correlation, fabsf(ba_cov));
1366
1367	// Diagnostic trace points
1368	trace_add_data("min_r", blk.data_min.lane<0>());
1369	trace_add_data("max_r", blk.data_max.lane<0>());
1370	trace_add_data("min_g", blk.data_min.lane<1>());
1371	trace_add_data("max_g", blk.data_max.lane<1>());
1372	trace_add_data("min_b", blk.data_min.lane<2>());
1373	trace_add_data("max_b", blk.data_max.lane<2>());
1374	trace_add_data("min_a", blk.data_min.lane<3>());
1375	trace_add_data("max_a", blk.data_max.lane<3>());
1376	trace_add_data("cov_rg", fabsf(rg_cov));
1377	trace_add_data("cov_rb", fabsf(rb_cov));
1378	trace_add_data("cov_ra", fabsf(ra_cov));
1379	trace_add_data("cov_gb", fabsf(gb_cov));
1380	trace_add_data("cov_ga", fabsf(ga_cov));
1381	trace_add_data("cov_ba", fabsf(ba_cov));
1382
1383	return lowest_correlation;
1384}
1385
1386/* See header for documentation. */
1387void compress_block(
1388	const astcenc_contexti& ctx,
1389	const image_block& blk,
1390	uint8_t pcb[16],
1391#if QUALITY_CONTROL
1392	compression_working_buffers& tmpbuf,
1393	bool calQualityEnable,
1394	int32_t *mseBlock[RGBA_COM]
1395#else
1396	compression_working_buffers& tmpbuf
1397#endif
1398	)
1399{
1400	astcenc_profile decode_mode = ctx.config.profile;
1401	symbolic_compressed_block scb;
1402	const block_size_descriptor& bsd = *ctx.bsd;
1403	float lowest_correl;
1404
1405	TRACE_NODE(node0, "block");
1406	trace_add_data("pos_x", blk.xpos);
1407	trace_add_data("pos_y", blk.ypos);
1408	trace_add_data("pos_z", blk.zpos);
1409
1410	// Set stricter block targets for luminance data as we have more bits to play with
1411	bool block_is_l = blk.is_luminance();
1412	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1413
1414	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
1415	bool block_is_la = blk.is_luminancealpha();
1416	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1417
1418	bool block_skip_two_plane = false;
1419	int max_partitions;
1420	if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1421	{
1422		max_partitions = 1;
1423	}
1424#ifdef ASTC_CUSTOMIZED_ENABLE
1425	else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1426	{
1427		if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1428			g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr)
1429		{
1430			printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n");
1431			return;
1432		}
1433		max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_();
1434	}
1435#endif
1436	else
1437	{
1438		max_partitions = ctx.config.tune_partition_count_limit;
1439	}
1440
1441	unsigned int requested_partition_indices[3] {
1442		ctx.config.tune_2partition_index_limit,
1443		ctx.config.tune_3partition_index_limit,
1444		ctx.config.tune_4partition_index_limit
1445	};
1446
1447	unsigned int requested_partition_trials[3] {
1448		ctx.config.tune_2partitioning_candidate_limit,
1449		ctx.config.tune_3partitioning_candidate_limit,
1450		ctx.config.tune_4partitioning_candidate_limit
1451	};
1452
1453#if defined(ASTCENC_DIAGNOSTICS)
1454	// Do this early in diagnostic builds so we can dump uniform metrics
1455	// for every block. Do it later in release builds to avoid redundant work!
1456	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1457	float error_threshold = ctx.config.tune_db_limit
1458	                      * error_weight_sum
1459	                      * block_is_l_scale
1460	                      * block_is_la_scale;
1461
1462	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1463	trace_add_data("lowest_correl", lowest_correl);
1464	trace_add_data("tune_error_threshold", error_threshold);
1465#endif
1466
1467	// Detected a constant-color block
1468	if (all(blk.data_min == blk.data_max))
1469	{
1470		TRACE_NODE(node1, "pass");
1471		trace_add_data("partition_count", 0);
1472		trace_add_data("plane_count", 1);
1473
1474		scb.partition_count = 0;
1475
1476		// Encode as FP16 if using HDR
1477		if ((decode_mode == ASTCENC_PRF_HDR) ||
1478		    (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1479		{
1480			scb.block_type = SYM_BTYPE_CONST_F16;
1481			vint4 color_f16 = float_to_float16(blk.origin_texel);
1482			store(color_f16, scb.constant_color);
1483		}
1484		// Encode as UNORM16 if NOT using HDR
1485		else
1486		{
1487			scb.block_type = SYM_BTYPE_CONST_U16;
1488			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1489			vint4 color_u16 = float_to_int_rtn(color_f32);
1490			store(color_u16, scb.constant_color);
1491		}
1492
1493		trace_add_data("exit", "quality hit");
1494		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1495		{
1496			scb.block_type = SYM_BTYPE_NONCONST;
1497			scb.partition_count = 1;
1498			scb.color_formats_matched = 0;
1499			scb.plane2_component = -1;
1500			if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1501			{
1502				scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE;
1503			}
1504#ifdef ASTC_CUSTOMIZED_ENABLE
1505			else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1506			{
1507				if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1508					g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr)
1509				{
1510					printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n");
1511					return;
1512				}
1513				scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_();
1514			}
1515#endif
1516			scb.partition_index = 0;
1517			scb.quant_mode = QUANT_256;
1518			scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1519			for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1520				scb.weights[w] = 0;
1521			}
1522			for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
1523				scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
1524				scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
1525			}
1526		}
1527		scb.privateProfile = ctx.config.privateProfile;
1528		symbolic_to_physical(bsd, scb, pcb);
1529#if QUALITY_CONTROL
1530	if (calQualityEnable) {
1531		*mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0;
1532	}
1533#endif
1534		return;
1535	}
1536
1537#if !defined(ASTCENC_DIAGNOSTICS)
1538	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1539	float error_threshold = ctx.config.tune_db_limit
1540	                      * error_weight_sum
1541	                      * block_is_l_scale
1542	                      * block_is_la_scale;
1543#endif
1544
1545	// Set SCB and mode errors to a very high error value
1546	scb.errorval = ERROR_CALC_DEFAULT;
1547	scb.block_type = SYM_BTYPE_ERROR;
1548
1549	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1550		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1551	};
1552
1553	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1554		0.0f,
1555		ctx.config.tune_2partition_early_out_limit_factor,
1556		ctx.config.tune_3partition_early_out_limit_factor,
1557		0.0f
1558	};
1559
1560	// Trial using 1 plane of weights and 1 partition.
1561
1562	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1563	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1564	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1565	// compression and slightly reduces image quality.
1566
1567	float errorval_mult[2] {
1568		1.0f / ctx.config.tune_mse_overshoot,
1569		1.0f
1570	};
1571
1572	static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1573
1574	// Only enable MODE0 fast path if enabled
1575	// Never enable for 3D blocks as no "always" block modes are available
1576	int start_trial = 1;
1577 	if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1578	{
1579		start_trial = 0;
1580	}
1581
1582	int quant_limit = QUANT_32;
1583	for (int i = start_trial; i < 2; i++)
1584	{
1585		TRACE_NODE(node1, "pass");
1586		trace_add_data("partition_count", 1);
1587		trace_add_data("plane_count", 1);
1588		trace_add_data("search_mode", i);
1589
1590		float errorval = compress_symbolic_block_for_partition_1plane(
1591		    ctx.config.privateProfile,
1592		    ctx.config, bsd, blk, i == 0,
1593		    error_threshold * errorval_mult[i] * errorval_overshoot,
1594		    1, 0,  scb, tmpbuf, QUANT_32);
1595
1596		// Record the quant level so we can use the filter later searches
1597		const auto& bm = bsd.get_block_mode(scb.block_mode);
1598		quant_limit = bm.get_weight_quant_mode();
1599
1600		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1601		if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i])))
1602		{
1603			trace_add_data("exit", "quality hit");
1604			goto END_OF_TESTS;
1605		}
1606	}
1607
1608#if !defined(ASTCENC_DIAGNOSTICS)
1609	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1610#endif
1611
1612	block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1613
1614	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1615	// alpha is the most likely to be non-correlated if it is present in the data.
1616	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1617	{
1618		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1619		{
1620			break;
1621		}
1622		TRACE_NODE(node1, "pass");
1623		trace_add_data("partition_count", 1);
1624		trace_add_data("plane_count", 2);
1625		trace_add_data("plane_component", i);
1626
1627		if (block_skip_two_plane)
1628		{
1629			trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1630			continue;
1631		}
1632
1633		if (blk.grayscale && i != 3)
1634		{
1635			trace_add_data("skip", "grayscale block");
1636			continue;
1637		}
1638
1639		if (blk.is_constant_channel(i))
1640		{
1641			trace_add_data("skip", "constant component");
1642			continue;
1643		}
1644
1645		float errorval = compress_symbolic_block_for_partition_2planes(
1646		    ctx.config.privateProfile,
1647		    ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1648		    i, scb, tmpbuf, quant_limit);
1649
1650		// If attempting two planes is much worse than the best one plane result
1651		// then further two plane searches are unlikely to help so move on ...
1652		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1653		{
1654			break;
1655		}
1656
1657		if (errorval < error_threshold)
1658		{
1659			trace_add_data("exit", "quality hit");
1660			goto END_OF_TESTS;
1661		}
1662	}
1663
1664	// Find best blocks for 2, 3 and 4 partitions
1665	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1666	{
1667		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1668
1669		unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1670
1671		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1672		requested_trials = astc::min(requested_trials, requested_indices);
1673
1674		unsigned int actual_trials = find_best_partition_candidates(
1675		    bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1676
1677		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1678
1679		for (unsigned int i = 0; i < actual_trials; i++)
1680		{
1681			TRACE_NODE(node1, "pass");
1682			trace_add_data("partition_count", partition_count);
1683			trace_add_data("partition_index", partition_indices[i]);
1684			trace_add_data("plane_count", 1);
1685			trace_add_data("search_mode", i);
1686
1687			float errorval = compress_symbolic_block_for_partition_1plane(
1688			    ctx.config.privateProfile,
1689			    ctx.config, bsd, blk, false,
1690			    error_threshold * errorval_overshoot,
1691			    partition_count, partition_indices[i],
1692			    scb, tmpbuf, quant_limit);
1693
1694			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1695
1696			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
1697			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
1698			// aligns with a partitioning that suits that encoding, so for this inner loop check add
1699			// a large error scale because the "other" trial could be a lot better.
1700			float best_error = best_errorvals_for_pcount[partition_count - 1];
1701			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1702			if (best_error > (best_error_in_prev * best_error_scale))
1703			{
1704				trace_add_data("skip", "tune_partition_early_out_limit_factor");
1705				goto END_OF_TESTS;
1706			}
1707
1708			if (errorval < error_threshold)
1709			{
1710				trace_add_data("exit", "quality hit");
1711				goto END_OF_TESTS;
1712			}
1713		}
1714
1715		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1716		float best_error = best_errorvals_for_pcount[partition_count - 1];
1717		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1718		if (best_error > (best_error_in_prev * best_error_scale))
1719		{
1720			trace_add_data("skip", "tune_partition_early_out_limit_factor");
1721			goto END_OF_TESTS;
1722		}
1723	}
1724
1725	trace_add_data("exit", "quality not hit");
1726
1727END_OF_TESTS:
1728	// If we still have an error block then convert to something we can encode
1729	// TODO: Do something more sensible here, such as average color block
1730	if (scb.block_type == SYM_BTYPE_ERROR)
1731	{
1732#if defined(ASTCENC_DIAGNOSTICS)
1733		static bool printed_once = false;
1734		if (!printed_once)
1735		{
1736			printed_once = true;
1737			printf("WARN: At least one block failed to find a valid encoding.\n"
1738			       "      Try increasing compression quality settings.\n\n");
1739		}
1740#endif
1741
1742		scb.block_type = SYM_BTYPE_CONST_U16;
1743		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1744		vint4 color_u16 = float_to_int_rtn(color_f32);
1745		store(color_u16, scb.constant_color);
1746	}
1747
1748	// Compress to a physical block
1749	scb.privateProfile = ctx.config.privateProfile;
1750	symbolic_to_physical(bsd, scb, pcb);
1751#if QUALITY_CONTROL
1752	if (calQualityEnable) {
1753		image_block decBlk = blk;
1754		decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk);
1755		vint4 colorSumDiff = vint4::zero();
1756		for (size_t ii = 0; ii < bsd.texel_count; ii++) {
1757			vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f);
1758			vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f);
1759			vint4 colorDiff = colorRef - colorTest;
1760			colorSumDiff += colorDiff * colorDiff;
1761		}
1762		*mseBlock[R_COM] = colorSumDiff.lane<0>();
1763		*mseBlock[G_COM] = colorSumDiff.lane<1>();
1764		*mseBlock[B_COM] = colorSumDiff.lane<2>();
1765		*mseBlock[A_COM] = colorSumDiff.lane<3>();
1766    }
1767#endif
1768}
1769
1770#endif
1771