1// SPDX-License-Identifier: Apache-2.0
2// ----------------------------------------------------------------------------
3// Copyright 2011-2024 Arm Limited
4//
5// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6// use this file except in compliance with the License. You may obtain a copy
7// of the License at:
8//
9//     http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14// License for the specific language governing permissions and limitations
15// under the License.
16// ----------------------------------------------------------------------------
17
18/**
19 * @brief Functions for the library entrypoint.
20 */
21
22#include <array>
23#include <cstring>
24#include <new>
25
26#include "astcenc.h"
27#include "astcenc_internal_entry.h"
28#include "astcenc_diagnostic_trace.h"
29
/**
 * @brief Record of the quality tuning parameter values.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Each row of the preset tables in this file is one of these records. The tables are filled using
 * positional aggregate initialization, so the order of the fields here must match the column order
 * used in those tables.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
 * for the more thorough search presets because the underlying db_limit is so much higher.
 */
struct astcenc_preset_config
{
	// Quality level this record describes; compared against the requested quality to pick a
	// record, or a pair of records to interpolate between
	float quality;

	// Preset values for the astcenc_config tuning fields of the same name
	unsigned int tune_partition_count_limit;
	unsigned int tune_2partition_index_limit;
	unsigned int tune_3partition_index_limit;
	unsigned int tune_4partition_index_limit;
	unsigned int tune_block_mode_limit;
	unsigned int tune_refinement_limit;
	unsigned int tune_candidate_limit;
	unsigned int tune_2partitioning_candidate_limit;
	unsigned int tune_3partitioning_candidate_limit;
	unsigned int tune_4partitioning_candidate_limit;

	// Base values for the two candidate dB limits; each is reduced by a multiple of log10 of the
	// block texel count, and the larger result becomes astcenc_config::tune_db_limit
	float tune_db_limit_a_base;
	float tune_db_limit_b_base;

	// Preset values for the astcenc_config tuning fields of the same name
	float tune_mse_overshoot;
	float tune_2partition_early_out_limit_factor;
	float tune_3partition_early_out_limit_factor;
	float tune_2plane_early_out_limit_correlation;

	// Held as a float so that it can be interpolated in the same way as the other values
	float tune_search_mode0_enable;
};
60
/**
 * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
 *
 * Each entry is the preset quality followed by the remaining @c astcenc_preset_config fields in
 * declaration order:
 *
 *     partition_count_limit, 2/3/4partition_index_limit, block_mode_limit, refinement_limit,
 *     candidate_limit, 2/3/4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base,
 *     mse_overshoot, 2/3partition_early_out_limit_factor, 2plane_early_out_limit_correlation,
 *     search_mode0_enable
 *
 * Entries must stay sorted by ascending quality; the preset lookup scans for the first entry whose
 * quality is not below the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
	}, {
		ASTCENC_PRE_FAST,
		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
	}
}};
85
/**
 * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
 *
 * Each entry is the preset quality followed by the remaining @c astcenc_preset_config fields in
 * declaration order:
 *
 *     partition_count_limit, 2/3/4partition_index_limit, block_mode_limit, refinement_limit,
 *     candidate_limit, 2/3/4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base,
 *     mse_overshoot, 2/3partition_early_out_limit_factor, 2plane_early_out_limit_correlation,
 *     search_mode0_enable
 *
 * Entries must stay sorted by ascending quality; the preset lookup scans for the first entry whose
 * quality is not below the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
	}, {
		ASTCENC_PRE_FAST,
		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
	}
}};
110
/**
 * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
 *
 * Each entry is the preset quality followed by the remaining @c astcenc_preset_config fields in
 * declaration order:
 *
 *     partition_count_limit, 2/3/4partition_index_limit, block_mode_limit, refinement_limit,
 *     candidate_limit, 2/3/4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base,
 *     mse_overshoot, 2/3partition_early_out_limit_factor, 2plane_early_out_limit_correlation,
 *     search_mode0_enable
 *
 * Entries must stay sorted by ascending quality; the preset lookup scans for the first entry whose
 * quality is not below the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
	}, {
		ASTCENC_PRE_FAST,
		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
	}
}};
135
136/**
137 * @brief Validate CPU floating point meets assumptions made in the codec.
138 *
139 * The codec is written with the assumption that a float threaded through the @c if32 union will be
140 * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
141 * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
142 * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
143 *
144 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
145 */
146static astcenc_error validate_cpu_float()
147{
148	if32 p;
149	volatile float xprec_testval = 2.51f;
150	p.f = xprec_testval + 12582912.0f;
151	float q = p.f - 12582912.0f;
152
153	if (q != 3.0f)
154	{
155		return ASTCENC_ERR_BAD_CPU_FLOAT;
156	}
157
158	return ASTCENC_SUCCESS;
159}
160
161/**
162 * @brief Validate config profile.
163 *
164 * @param profile   The profile to check.
165 *
166 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
167 */
168static astcenc_error validate_profile(
169	astcenc_profile profile
170) {
171	// Values in this enum are from an external user, so not guaranteed to be
172	// bounded to the enum values
173	switch (static_cast<int>(profile))
174	{
175	case ASTCENC_PRF_LDR_SRGB:
176	case ASTCENC_PRF_LDR:
177	case ASTCENC_PRF_HDR_RGB_LDR_A:
178	case ASTCENC_PRF_HDR:
179		return ASTCENC_SUCCESS;
180	default:
181		return ASTCENC_ERR_BAD_PROFILE;
182	}
183}
184
185/**
186 * @brief Validate block size.
187 *
188 * @param block_x   The block x dimensions.
189 * @param block_y   The block y dimensions.
190 * @param block_z   The block z dimensions.
191 *
192 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
193 */
194static astcenc_error validate_block_size(
195	unsigned int block_x,
196	unsigned int block_y,
197	unsigned int block_z
198) {
199	// Test if this is a legal block size at all
200	bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
201	                 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
202	if (!is_legal)
203	{
204		return ASTCENC_ERR_BAD_BLOCK_SIZE;
205	}
206
207	// Test if this build has sufficient capacity for this block size
208	bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
209	if (!have_capacity)
210	{
211		return ASTCENC_ERR_NOT_IMPLEMENTED;
212	}
213
214	return ASTCENC_SUCCESS;
215}
216
217/**
218 * @brief Validate flags.
219 *
220 * @param profile   The profile to check.
221 * @param flags     The flags to check.
222 *
223 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
224 */
225static astcenc_error validate_flags(
226	astcenc_profile profile,
227	unsigned int flags
228) {
229	// Flags field must not contain any unknown flag bits
230	unsigned int exMask = ~ASTCENC_ALL_FLAGS;
231	if (popcount(flags & exMask) != 0)
232	{
233		return ASTCENC_ERR_BAD_FLAGS;
234	}
235
236	// Flags field must only contain at most a single map type
237	exMask = ASTCENC_FLG_MAP_NORMAL
238	       | ASTCENC_FLG_MAP_RGBM;
239	if (popcount(flags & exMask) > 1)
240	{
241		return ASTCENC_ERR_BAD_FLAGS;
242	}
243
244	// Decode_unorm8 must only be used with an LDR profile
245	bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
246	bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
247	if (is_unorm8 && is_hdr)
248	{
249		return ASTCENC_ERR_BAD_DECODE_MODE;
250	}
251
252	return ASTCENC_SUCCESS;
253}
254
255#if !defined(ASTCENC_DECOMPRESS_ONLY)
256
257/**
258 * @brief Validate single channel compression swizzle.
259 *
260 * @param swizzle   The swizzle to check.
261 *
262 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
263 */
264static astcenc_error validate_compression_swz(
265	astcenc_swz swizzle
266) {
267	// Not all enum values are handled; SWZ_Z is invalid for compression
268	switch (static_cast<int>(swizzle))
269	{
270	case ASTCENC_SWZ_R:
271	case ASTCENC_SWZ_G:
272	case ASTCENC_SWZ_B:
273	case ASTCENC_SWZ_A:
274	case ASTCENC_SWZ_0:
275	case ASTCENC_SWZ_1:
276		return ASTCENC_SUCCESS;
277	default:
278		return ASTCENC_ERR_BAD_SWIZZLE;
279	}
280}
281
282/**
283 * @brief Validate overall compression swizzle.
284 *
285 * @param swizzle   The swizzle to check.
286 *
287 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
288 */
289static astcenc_error validate_compression_swizzle(
290	const astcenc_swizzle& swizzle
291) {
292	if (validate_compression_swz(swizzle.r) ||
293	    validate_compression_swz(swizzle.g) ||
294	    validate_compression_swz(swizzle.b) ||
295	    validate_compression_swz(swizzle.a))
296	{
297		return ASTCENC_ERR_BAD_SWIZZLE;
298	}
299
300	return ASTCENC_SUCCESS;
301}
302#endif
303
304/**
305 * @brief Validate single channel decompression swizzle.
306 *
307 * @param swizzle   The swizzle to check.
308 *
309 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
310 */
311static astcenc_error validate_decompression_swz(
312	astcenc_swz swizzle
313) {
314	// Values in this enum are from an external user, so not guaranteed to be
315	// bounded to the enum values
316	switch (static_cast<int>(swizzle))
317	{
318	case ASTCENC_SWZ_R:
319	case ASTCENC_SWZ_G:
320	case ASTCENC_SWZ_B:
321	case ASTCENC_SWZ_A:
322	case ASTCENC_SWZ_0:
323	case ASTCENC_SWZ_1:
324	case ASTCENC_SWZ_Z:
325		return ASTCENC_SUCCESS;
326	default:
327		return ASTCENC_ERR_BAD_SWIZZLE;
328	}
329}
330
331/**
332 * @brief Validate overall decompression swizzle.
333 *
334 * @param swizzle   The swizzle to check.
335 *
336 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
337 */
338static astcenc_error validate_decompression_swizzle(
339	const astcenc_swizzle& swizzle
340) {
341	if (validate_decompression_swz(swizzle.r) ||
342	    validate_decompression_swz(swizzle.g) ||
343	    validate_decompression_swz(swizzle.b) ||
344	    validate_decompression_swz(swizzle.a))
345	{
346		return ASTCENC_ERR_BAD_SWIZZLE;
347	}
348
349	return ASTCENC_SUCCESS;
350}
351
352/**
353 * Validate that an incoming configuration is in-spec.
354 *
355 * This function can respond in two ways:
356 *
357 *   * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
358 *     for out-of-range inputs in this case.
359 *   * Numerical inputs and logic inputs are are logically invalid and which make no sense
360 *     algorithmically will return an error.
361 *
362 * @param[in,out] config   The input compressor configuration.
363 *
364 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
365 */
366static astcenc_error validate_config(
367	astcenc_config &config
368) {
369	astcenc_error status;
370
371	status = validate_profile(config.profile);
372	if (status != ASTCENC_SUCCESS)
373	{
374		return status;
375	}
376
377	status = validate_flags(config.profile, config.flags);
378	if (status != ASTCENC_SUCCESS)
379	{
380		return status;
381	}
382
383	status = validate_block_size(config.block_x, config.block_y, config.block_z);
384	if (status != ASTCENC_SUCCESS)
385	{
386		return status;
387	}
388
389#if defined(ASTCENC_DECOMPRESS_ONLY)
390	// Decompress-only builds only support decompress-only contexts
391	if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
392	{
393		return ASTCENC_ERR_BAD_PARAM;
394	}
395#endif
396
397	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
398
399	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
400	config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
401	config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
402	config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
403	config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
404	config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
405	config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
406	config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
407	config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
408	config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
409	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
410	config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
411	config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f);
412	config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
413	config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
414
415	// Specifying a zero weight color component is not allowed; force to small value
416	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
417	                             astc::max(config.cw_b_weight, config.cw_a_weight));
418	if (max_weight > 0.0f)
419	{
420		max_weight /= 1000.0f;
421		config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
422		config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
423		config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
424		config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
425	}
426	// If all color components error weights are zero then return an error
427	else
428	{
429		return ASTCENC_ERR_BAD_PARAM;
430	}
431
432	return ASTCENC_SUCCESS;
433}
434
435/* See header for documentation. */
436astcenc_error astcenc_config_init(
437	astcenc_profile profile,
438	unsigned int block_x,
439	unsigned int block_y,
440	unsigned int block_z,
441	float quality,
442	unsigned int flags,
443	astcenc_config* configp
444) {
445	astcenc_error status;
446
447	status = validate_cpu_float();
448	if (status != ASTCENC_SUCCESS)
449	{
450		return status;
451	}
452
453	// Zero init all config fields; although most of will be over written
454	astcenc_config& config = *configp;
455	std::memset(&config, 0, sizeof(config));
456
457	// Process the block size
458	block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
459	status = validate_block_size(block_x, block_y, block_z);
460	if (status != ASTCENC_SUCCESS)
461	{
462		return status;
463	}
464
465	config.block_x = block_x;
466	config.block_y = block_y;
467	config.block_z = block_z;
468
469	float texels = static_cast<float>(block_x * block_y * block_z);
470	float ltexels = logf(texels) / logf(10.0f);
471
472	// Process the performance quality level or preset; note that this must be done before we
473	// process any additional settings, such as color profile and flags, which may replace some of
474	// these settings with more use case tuned values
475	if (quality < ASTCENC_PRE_FASTEST ||
476	    quality > ASTCENC_PRE_EXHAUSTIVE)
477	{
478		return ASTCENC_ERR_BAD_QUALITY;
479	}
480
481	static const std::array<astcenc_preset_config, 6>* preset_configs;
482	int texels_int = block_x * block_y * block_z;
483	if (texels_int < 25)
484	{
485		preset_configs = &preset_configs_high;
486	}
487	else if (texels_int < 64)
488	{
489		preset_configs = &preset_configs_mid;
490	}
491	else
492	{
493		preset_configs = &preset_configs_low;
494	}
495
496	// Determine which preset to use, or which pair to interpolate
497	size_t start;
498	size_t end;
499	for (end = 0; end < preset_configs->size(); end++)
500	{
501		if ((*preset_configs)[end].quality >= quality)
502		{
503			break;
504		}
505	}
506
507	start = end == 0 ? 0 : end - 1;
508
509	// Start and end node are the same - so just transfer the values.
510	if (start == end)
511	{
512		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
513		config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
514		config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
515		config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
516		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
517		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
518		config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
519		config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
520		config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
521		config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
522		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
523		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
524
525		config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
526
527		config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
528		config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
529		config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
530		config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
531	}
532	// Start and end node are not the same - so interpolate between them
533	else
534	{
535		auto& node_a = (*preset_configs)[start];
536		auto& node_b = (*preset_configs)[end];
537
538		float wt_range = node_b.quality - node_a.quality;
539		assert(wt_range > 0);
540
541		// Compute interpolation factors
542		float wt_node_a = (node_b.quality - quality) / wt_range;
543		float wt_node_b = (quality - node_a.quality) / wt_range;
544
545		#define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
546		#define LERPI(param) astc::flt2int_rtn(\
547		                         (static_cast<float>(node_a.param) * wt_node_a) + \
548		                         (static_cast<float>(node_b.param) * wt_node_b))
549		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
550
551		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
552		config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
553		config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
554		config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
555		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
556		config.tune_refinement_limit = LERPI(tune_refinement_limit);
557		config.tune_candidate_limit = LERPUI(tune_candidate_limit);
558		config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
559		config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
560		config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
561		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
562		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);
563
564		config.tune_mse_overshoot = LERP(tune_mse_overshoot);
565
566		config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
567		config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
568		config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
569		config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
570		#undef LERP
571		#undef LERPI
572		#undef LERPUI
573	}
574
575	// Set heuristics to the defaults for each color profile
576	config.cw_r_weight = 1.0f;
577	config.cw_g_weight = 1.0f;
578	config.cw_b_weight = 1.0f;
579	config.cw_a_weight = 1.0f;
580
581	config.a_scale_radius = 0;
582
583	config.rgbm_m_scale = 0.0f;
584
585	config.profile = profile;
586
587	// Values in this enum are from an external user, so not guaranteed to be
588	// bounded to the enum values
589	switch (static_cast<int>(profile))
590	{
591	case ASTCENC_PRF_LDR:
592	case ASTCENC_PRF_LDR_SRGB:
593		break;
594	case ASTCENC_PRF_HDR_RGB_LDR_A:
595	case ASTCENC_PRF_HDR:
596		config.tune_db_limit = 999.0f;
597		config.tune_search_mode0_enable = 0.0f;
598		break;
599	default:
600		return ASTCENC_ERR_BAD_PROFILE;
601	}
602
603	// Flags field must not contain any unknown flag bits
604	status = validate_flags(profile, flags);
605	if (status != ASTCENC_SUCCESS)
606	{
607		return status;
608	}
609
610	if (flags & ASTCENC_FLG_MAP_NORMAL)
611	{
612		// Normal map encoding uses L+A blocks, so allow one more partitioning
613		// than normal. We need need fewer bits for endpoints, so more likely
614		// to be able to use more partitions than an RGB/RGBA block
615		config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
616
617		config.cw_g_weight = 0.0f;
618		config.cw_b_weight = 0.0f;
619		config.tune_2partition_early_out_limit_factor *= 1.5f;
620		config.tune_3partition_early_out_limit_factor *= 1.5f;
621		config.tune_2plane_early_out_limit_correlation = 0.99f;
622
623		// Normals are prone to blocking artifacts on smooth curves
624		// so force compressor to try harder here ...
625		config.tune_db_limit *= 1.03f;
626	}
627	else if (flags & ASTCENC_FLG_MAP_RGBM)
628	{
629		config.rgbm_m_scale = 5.0f;
630		config.cw_a_weight = 2.0f * config.rgbm_m_scale;
631	}
632	else // (This is color data)
633	{
634		// This is a very basic perceptual metric for RGB color data, which weights error
635		// significance by the perceptual luminance contribution of each color channel. For
636		// luminance the usual weights to compute luminance from a linear RGB value are as
637		// follows:
638		//
639		//     l = r * 0.3 + g * 0.59 + b * 0.11
640		//
641		// ... but we scale these up to keep a better balance between color and alpha. Note
642		// that if the content is using alpha we'd recommend using the -a option to weight
643		// the color contribution by the alpha transparency.
644		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
645		{
646			config.cw_r_weight = 0.30f * 2.25f;
647			config.cw_g_weight = 0.59f * 2.25f;
648			config.cw_b_weight = 0.11f * 2.25f;
649		}
650	}
651	config.flags = flags;
652
653	return ASTCENC_SUCCESS;
654}
655
656/* See header for documentation. */
657astcenc_error astcenc_context_alloc(
658	const astcenc_config* configp,
659	unsigned int thread_count,
660	astcenc_context** context
661) {
662	astcenc_error status;
663	const astcenc_config& config = *configp;
664
665	status = validate_cpu_float();
666	if (status != ASTCENC_SUCCESS)
667	{
668		return status;
669	}
670
671	if (thread_count == 0)
672	{
673		return ASTCENC_ERR_BAD_PARAM;
674	}
675
676#if defined(ASTCENC_DIAGNOSTICS)
677	// Force single threaded compressor use in diagnostic mode.
678	if (thread_count != 1)
679	{
680		return ASTCENC_ERR_BAD_PARAM;
681	}
682#endif
683
684#ifndef ASTC_CUSTOMIZED_ENABLE
685	if (config.privateProfile == CUSTOMIZED_PROFILE)
686	{
687		return ASTCENC_ERR_BAD_PARAM;
688	}
689#endif
690
691	astcenc_context* ctxo = new astcenc_context;
692	astcenc_contexti* ctx = &ctxo->context;
693	ctx->thread_count = thread_count;
694	ctx->config = config;
695	ctx->working_buffers = nullptr;
696
697	// These are allocated per-compress, as they depend on image size
698	ctx->input_alpha_averages = nullptr;
699
700	// Copy the config first and validate the copy (we may modify it)
701	status = validate_config(ctx->config);
702	if (status != ASTCENC_SUCCESS)
703	{
704		delete ctxo;
705		return status;
706	}
707
708	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
709	if (!ctx->bsd)
710	{
711		delete ctxo;
712		return ASTCENC_ERR_OUT_OF_MEM;
713	}
714
715	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
716#ifdef ASTC_CUSTOMIZED_ENABLE
717	if (!init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
718	                           can_omit_modes,
719	                           config.tune_partition_count_limit,
720	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
721	                           *ctx->bsd))
722	{
723		aligned_free<block_size_descriptor>(ctx->bsd);
724		delete ctxo;
725		*context = nullptr;
726		return ASTCENC_ERR_DLOPEN_FAILED;
727	}
728#else
729	init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
730	                           can_omit_modes,
731	                           config.tune_partition_count_limit,
732	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
733	                           *ctx->bsd);
734#endif
735
736#if !defined(ASTCENC_DECOMPRESS_ONLY)
737	// Do setup only needed by compression
738	if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
739	{
740		// Turn a dB limit into a per-texel error for faster use later
741		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
742		{
743			ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
744		}
745		else
746		{
747			ctx->config.tune_db_limit = 0.0f;
748		}
749
750		size_t worksize = sizeof(compression_working_buffers) * thread_count;
751		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
752		static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
753		              "compression_working_buffers size must be multiple of vector alignment");
754		if (!ctx->working_buffers)
755		{
756			aligned_free<block_size_descriptor>(ctx->bsd);
757			delete ctxo;
758			*context = nullptr;
759			return ASTCENC_ERR_OUT_OF_MEM;
760		}
761	}
762#endif
763
764#if defined(ASTCENC_DIAGNOSTICS)
765	ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
766	if (!ctx->trace_log->m_file)
767	{
768		return ASTCENC_ERR_DTRACE_FAILURE;
769	}
770
771	trace_add_data("block_x", config.block_x);
772	trace_add_data("block_y", config.block_y);
773	trace_add_data("block_z", config.block_z);
774#endif
775
776	*context = ctxo;
777
778#if !defined(ASTCENC_DECOMPRESS_ONLY)
779	prepare_angular_tables();
780#endif
781
782	return ASTCENC_SUCCESS;
783}
784
785/* See header dor documentation. */
786void astcenc_context_free(
787	astcenc_context* ctxo
788) {
789	if (ctxo)
790	{
791		astcenc_contexti* ctx = &ctxo->context;
792		if (ctx->working_buffers)
793		{
794			aligned_free<compression_working_buffers>(ctx->working_buffers);
795		}
796		else
797		{
798			printf("ctx->working_buffers is nullptr !!\n");
799		}
800		if (ctx->bsd)
801		{
802			aligned_free<block_size_descriptor>(ctx->bsd);
803		}
804		else
805		{
806			printf("ctx->bsd is nullptr !!\n");
807		}
808#if defined(ASTCENC_DIAGNOSTICS)
809		delete ctx->trace_log;
810#endif
811		delete ctxo;
812	}
813}
814
815#if !defined(ASTCENC_DECOMPRESS_ONLY)
816
817/**
818 * @brief Compress an image, after any preflight has completed.
819 *
820 * @param[out] ctxo           The compressor context.
821 * @param      thread_index   The thread index.
 * @param      image          The input image.
823 * @param      swizzle        The input swizzle.
824 * @param[out] buffer         The output array for the compressed data.
825 */
826static void compress_image(
827	astcenc_context& ctxo,
828	unsigned int thread_index,
829	const astcenc_image& image,
830	const astcenc_swizzle& swizzle,
831#if QUALITY_CONTROL
832	uint8_t* buffer,
833	bool calQualityEnable,
834	int32_t *mse[RGBA_COM]
835#else
836	uint8_t* buffer
837#endif
838) {
839	astcenc_contexti& ctx = ctxo.context;
840	const block_size_descriptor& bsd = *ctx.bsd;
841	astcenc_profile decode_mode = ctx.config.profile;
842
843	image_block blk;
844
845	int block_x = bsd.xdim;
846	int block_y = bsd.ydim;
847	int block_z = bsd.zdim;
848	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
849
850	int dim_x = image.dim_x;
851	int dim_y = image.dim_y;
852	int dim_z = image.dim_z;
853
854	int xblocks = (dim_x + block_x - 1) / block_x;
855	int yblocks = (dim_y + block_y - 1) / block_y;
856	int zblocks = (dim_z + block_z - 1) / block_z;
857	int block_count = zblocks * yblocks * xblocks;
858
859	int row_blocks = xblocks;
860	int plane_blocks = xblocks * yblocks;
861
862	blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
863
864	// Populate the block channel weights
865	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
866	                             ctx.config.cw_g_weight,
867	                             ctx.config.cw_b_weight,
868	                             ctx.config.cw_a_weight);
869
870	// Use preallocated scratch buffer
871	auto& temp_buffers = ctx.working_buffers[thread_index];
872
873	// Only the first thread actually runs the initializer
874	ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
875
876	// Determine if we can use an optimized load function
877	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
878	                 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
879
880	bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
881	                 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
882
883	bool use_fast_load = !needs_swz && !needs_hdr &&
884	                     block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
885
886	auto load_func = load_image_block;
887	if (use_fast_load)
888	{
889		load_func = load_image_block_fast_ldr;
890	}
891
892	// All threads run this processing loop until there is no work remaining
893	while (true)
894	{
895		unsigned int count;
896		unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
897		if (!count)
898		{
899			break;
900		}
901
902		for (unsigned int i = base; i < base + count; i++)
903		{
904			// Decode i into x, y, z block indices
905			int z = i / plane_blocks;
906			unsigned int rem = i - (z * plane_blocks);
907			int y = rem / row_blocks;
908			int x = rem - (y * row_blocks);
909
910			// Test if we can apply some basic alpha-scale RDO
911			bool use_full_block = true;
912			if (ctx.config.a_scale_radius != 0 && block_z == 1)
913			{
914				int start_x = x * block_x;
915				int end_x = astc::min(dim_x, start_x + block_x);
916
917				int start_y = y * block_y;
918				int end_y = astc::min(dim_y, start_y + block_y);
919
920				// SATs accumulate error, so don't test exactly zero. Test for
921				// less than 1 alpha in the expanded block footprint that
922				// includes the alpha radius.
923				int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
924
925				int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
926
927				float footprint = static_cast<float>(x_footprint * y_footprint);
928				float threshold = 0.9f / (255.0f * footprint);
929
930				// Do we have any alpha values?
931				use_full_block = false;
932				for (int ay = start_y; ay < end_y; ay++)
933				{
934					for (int ax = start_x; ax < end_x; ax++)
935					{
936						float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
937						if (a_avg > threshold)
938						{
939							use_full_block = true;
940							ax = end_x;
941							ay = end_y;
942						}
943					}
944				}
945			}
946
947			// Fetch the full block for compression
948			if (use_full_block)
949			{
950				load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
951
952				// Scale RGB error contribution by the maximum alpha in the block
953				// This encourages preserving alpha accuracy in regions with high
954				// transparency, and can buy up to 0.5 dB PSNR.
955				if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
956				{
957					float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
958					blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
959					                             ctx.config.cw_g_weight * alpha_scale,
960					                             ctx.config.cw_b_weight * alpha_scale,
961					                             ctx.config.cw_a_weight);
962				}
963			}
964			// Apply alpha scale RDO - substitute constant color block
965			else
966			{
967				blk.origin_texel = vfloat4::zero();
968				blk.data_min = vfloat4::zero();
969				blk.data_mean = vfloat4::zero();
970				blk.data_max = vfloat4::zero();
971				blk.grayscale = true;
972			}
973
974			int offset = ((z * yblocks + y) * xblocks + x) * 16;
975			uint8_t *bp = buffer + offset;
976#if QUALITY_CONTROL
977			int32_t *mseBlock[RGBA_COM] = {nullptr, nullptr, nullptr, nullptr};
978			if (calQualityEnable) {
979				offset = (z * yblocks + y) * xblocks + x;
980				mseBlock[R_COM] = mse[R_COM] + offset;
981				mseBlock[G_COM] = mse[G_COM] + offset;
982				mseBlock[B_COM] = mse[B_COM] + offset;
983				mseBlock[A_COM] = mse[A_COM] + offset;
984			}
985			compress_block(ctx, blk, bp, temp_buffers, calQualityEnable, mseBlock);
986#else
987			compress_block(ctx, blk, bp, temp_buffers);
988#endif
989		}
990
991		ctxo.manage_compress.complete_task_assignment(count);
992	}
993}
994
995/**
996 * @brief Compute regional averages in an image.
997 *
998 * This function can be called by multiple threads, but only after a single
999 * thread calls the setup function @c init_compute_averages().
1000 *
1001 * Results are written back into @c img->input_alpha_averages.
1002 *
1003 * @param[out] ctx   The context.
1004 * @param      ag    The average and variance arguments created during setup.
1005 */
1006static void compute_averages(
1007	astcenc_context& ctx,
1008	const avg_args &ag
1009) {
1010	pixel_region_args arg = ag.arg;
1011	arg.work_memory = new vfloat4[ag.work_memory_size];
1012
1013	int size_x = ag.img_size_x;
1014	int size_y = ag.img_size_y;
1015	int size_z = ag.img_size_z;
1016
1017	int step_xy = ag.blk_size_xy;
1018	int step_z = ag.blk_size_z;
1019
1020	int y_tasks = (size_y + step_xy - 1) / step_xy;
1021
1022	// All threads run this processing loop until there is no work remaining
1023	while (true)
1024	{
1025		unsigned int count;
1026		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
1027		if (!count)
1028		{
1029			break;
1030		}
1031
1032		for (unsigned int i = base; i < base + count; i++)
1033		{
1034			int z = (i / (y_tasks)) * step_z;
1035			int y = (i - (z * y_tasks)) * step_xy;
1036
1037			arg.size_z = astc::min(step_z, size_z - z);
1038			arg.offset_z = z;
1039
1040			arg.size_y = astc::min(step_xy, size_y - y);
1041			arg.offset_y = y;
1042
1043			for (int x = 0; x < size_x; x += step_xy)
1044			{
1045				arg.size_x = astc::min(step_xy, size_x - x);
1046				arg.offset_x = x;
1047				compute_pixel_region_variance(ctx.context, arg);
1048			}
1049		}
1050
1051		ctx.manage_avg.complete_task_assignment(count);
1052	}
1053
1054	delete[] arg.work_memory;
1055}
1056
1057#endif
1058
1059/* See header for documentation. */
1060astcenc_error astcenc_compress_image(
1061	astcenc_context* ctxo,
1062	astcenc_image* imagep,
1063	const astcenc_swizzle* swizzle,
1064	uint8_t* data_out,
1065	size_t data_len,
1066#if QUALITY_CONTROL
1067	bool calQualityEnable,
1068	int32_t *mse[RGBA_COM],
1069#endif
1070	unsigned int thread_index
1071) {
1072#if defined(ASTCENC_DECOMPRESS_ONLY)
1073	(void)ctxo;
1074	(void)imagep;
1075	(void)swizzle;
1076	(void)data_out;
1077	(void)data_len;
1078	(void)thread_index;
1079	return ASTCENC_ERR_BAD_CONTEXT;
1080#else
1081	astcenc_contexti* ctx = &ctxo->context;
1082	astcenc_error status;
1083	astcenc_image& image = *imagep;
1084
1085	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1086	{
1087		return ASTCENC_ERR_BAD_CONTEXT;
1088	}
1089
1090	status = validate_compression_swizzle(*swizzle);
1091	if (status != ASTCENC_SUCCESS)
1092	{
1093		return status;
1094	}
1095
1096	if (thread_index >= ctx->thread_count)
1097	{
1098		return ASTCENC_ERR_BAD_PARAM;
1099	}
1100
1101	unsigned int block_x = ctx->config.block_x;
1102	unsigned int block_y = ctx->config.block_y;
1103	unsigned int block_z = ctx->config.block_z;
1104
1105	unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
1106	unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
1107	unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;
1108
1109	// Check we have enough output space (16 bytes per block)
1110	size_t size_needed = xblocks * yblocks * zblocks * 16;
1111	if (data_len < size_needed)
1112	{
1113		return ASTCENC_ERR_OUT_OF_MEM;
1114	}
1115
1116	// If context thread count is one then implicitly reset
1117	if (ctx->thread_count == 1)
1118	{
1119		astcenc_compress_reset(ctxo);
1120	}
1121
1122	if (ctx->config.a_scale_radius != 0)
1123	{
1124		// First thread to enter will do setup, other threads will subsequently
1125		// enter the critical section but simply skip over the initialization
1126		auto init_avg = [ctx, &image, swizzle]() {
1127			// Perform memory allocations for the destination buffers
1128			size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
1129			ctx->input_alpha_averages = new float[texel_count];
1130
1131			return init_compute_averages(
1132				image, ctx->config.a_scale_radius, *swizzle,
1133				ctx->avg_preprocess_args);
1134		};
1135
1136		// Only the first thread actually runs the initializer
1137		ctxo->manage_avg.init(init_avg);
1138
1139		// All threads will enter this function and dynamically grab work
1140		compute_averages(*ctxo, ctx->avg_preprocess_args);
1141	}
1142
1143	// Wait for compute_averages to complete before compressing
1144	ctxo->manage_avg.wait();
1145#if QUALITY_CONTROL
1146	compress_image(*ctxo, thread_index, image, *swizzle, data_out, calQualityEnable, mse);
1147#else
1148	compress_image(*ctxo, thread_index, image, *swizzle, data_out);
1149#endif
1150	// Wait for compress to complete before freeing memory
1151	ctxo->manage_compress.wait();
1152
1153	auto term_compress = [ctx]() {
1154		delete[] ctx->input_alpha_averages;
1155		ctx->input_alpha_averages = nullptr;
1156	};
1157
1158	// Only the first thread to arrive actually runs the term
1159	ctxo->manage_compress.term(term_compress);
1160
1161	return ASTCENC_SUCCESS;
1162#endif
1163}
1164
1165/* See header for documentation. */
1166astcenc_error astcenc_compress_reset(
1167	astcenc_context* ctxo
1168) {
1169#if defined(ASTCENC_DECOMPRESS_ONLY)
1170	(void)ctxo;
1171	return ASTCENC_ERR_BAD_CONTEXT;
1172#else
1173	astcenc_contexti* ctx = &ctxo->context;
1174	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1175	{
1176		return ASTCENC_ERR_BAD_CONTEXT;
1177	}
1178
1179	ctxo->manage_avg.reset();
1180	ctxo->manage_compress.reset();
1181	return ASTCENC_SUCCESS;
1182#endif
1183}
1184
1185/* See header for documentation. */
1186astcenc_error astcenc_decompress_image(
1187	astcenc_context* ctxo,
1188	const uint8_t* data,
1189	size_t data_len,
1190	astcenc_image* image_outp,
1191	const astcenc_swizzle* swizzle,
1192	unsigned int thread_index
1193) {
1194	astcenc_error status;
1195	astcenc_image& image_out = *image_outp;
1196	astcenc_contexti* ctx = &ctxo->context;
1197
1198	// Today this doesn't matter (working set on stack) but might in future ...
1199	if (thread_index >= ctx->thread_count)
1200	{
1201		return ASTCENC_ERR_BAD_PARAM;
1202	}
1203
1204	status = validate_decompression_swizzle(*swizzle);
1205	if (status != ASTCENC_SUCCESS)
1206	{
1207		return status;
1208	}
1209
1210	unsigned int block_x = ctx->config.block_x;
1211	unsigned int block_y = ctx->config.block_y;
1212	unsigned int block_z = ctx->config.block_z;
1213
1214	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
1215	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
1216	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
1217	unsigned int block_count = zblocks * yblocks * xblocks;
1218
1219	int row_blocks = xblocks;
1220	int plane_blocks = xblocks * yblocks;
1221
1222	// Check we have enough output space (16 bytes per block)
1223	size_t size_needed = xblocks * yblocks * zblocks * 16;
1224	if (data_len < size_needed)
1225	{
1226		return ASTCENC_ERR_OUT_OF_MEM;
1227	}
1228
1229	image_block blk;
1230	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
1231
1232	// Decode mode inferred from the output data type
1233	blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
1234
1235	// If context thread count is one then implicitly reset
1236	if (ctx->thread_count == 1)
1237	{
1238		astcenc_decompress_reset(ctxo);
1239	}
1240
1241	// Only the first thread actually runs the initializer
1242	ctxo->manage_decompress.init(block_count, nullptr);
1243
1244	// All threads run this processing loop until there is no work remaining
1245	while (true)
1246	{
1247		unsigned int count;
1248		unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
1249		if (!count)
1250		{
1251			break;
1252		}
1253
1254		for (unsigned int i = base; i < base + count; i++)
1255		{
1256			// Decode i into x, y, z block indices
1257			int z = i / plane_blocks;
1258			unsigned int rem = i - (z * plane_blocks);
1259			int y = rem / row_blocks;
1260			int x = rem - (y * row_blocks);
1261
1262			unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
1263			const uint8_t* bp = data + offset;
1264
1265			symbolic_compressed_block scb;
1266
1267			physical_to_symbolic(*ctx->bsd, bp, scb);
1268
1269			decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
1270			                          x * block_x, y * block_y, z * block_z,
1271			                          scb, blk);
1272
1273			store_image_block(image_out, blk, *ctx->bsd,
1274			                  x * block_x, y * block_y, z * block_z, *swizzle);
1275		}
1276
1277		ctxo->manage_decompress.complete_task_assignment(count);
1278	}
1279
1280	return ASTCENC_SUCCESS;
1281}
1282
1283/* See header for documentation. */
1284astcenc_error astcenc_decompress_reset(
1285	astcenc_context* ctxo
1286) {
1287	ctxo->manage_decompress.reset();
1288	return ASTCENC_SUCCESS;
1289}
1290
1291/* See header for documentation. */
1292astcenc_error astcenc_get_block_info(
1293	astcenc_context* ctxo,
1294	const uint8_t data[16],
1295	astcenc_block_info* info
1296) {
1297#if defined(ASTCENC_DECOMPRESS_ONLY)
1298	(void)ctxo;
1299	(void)data;
1300	(void)info;
1301	return ASTCENC_ERR_BAD_CONTEXT;
1302#else
1303	astcenc_contexti* ctx = &ctxo->context;
1304
1305	// Decode the compressed data into a symbolic form
1306	symbolic_compressed_block scb;
1307	physical_to_symbolic(*ctx->bsd, data, scb);
1308
1309	// Fetch the appropriate partition and decimation tables
1310	block_size_descriptor& bsd = *ctx->bsd;
1311
1312	// Start from a clean slate
1313	memset(info, 0, sizeof(*info));
1314
1315	// Basic info we can always populate
1316	info->profile = ctx->config.profile;
1317
1318	info->block_x = ctx->config.block_x;
1319	info->block_y = ctx->config.block_y;
1320	info->block_z = ctx->config.block_z;
1321	info->texel_count = bsd.texel_count;
1322
1323	// Check for error blocks first
1324	info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
1325	if (info->is_error_block)
1326	{
1327		return ASTCENC_SUCCESS;
1328	}
1329
1330	// Check for constant color blocks second
1331	info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
1332	                          scb.block_type == SYM_BTYPE_CONST_U16;
1333	if (info->is_constant_block)
1334	{
1335		return ASTCENC_SUCCESS;
1336	}
1337
1338	// Otherwise handle a full block ; known to be valid after conditions above have been checked
1339	int partition_count = scb.partition_count;
1340	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
1341
1342	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
1343	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
1344
1345	info->weight_x = di.weight_x;
1346	info->weight_y = di.weight_y;
1347	info->weight_z = di.weight_z;
1348
1349	info->is_dual_plane_block = bm.is_dual_plane != 0;
1350
1351	info->partition_count = scb.partition_count;
1352	info->partition_index = scb.partition_index;
1353	info->dual_plane_component = scb.plane2_component;
1354
1355	info->color_level_count = get_quant_level(scb.get_color_quant_mode());
1356	info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
1357
1358	// Unpack color endpoints for each active partition
1359	for (unsigned int i = 0; i < scb.partition_count; i++)
1360	{
1361		bool rgb_hdr;
1362		bool a_hdr;
1363		vint4 endpnt[2];
1364
1365		unpack_color_endpoints(ctx->config.profile,
1366		                       scb.color_formats[i],
1367		                       scb.color_values[i],
1368		                       rgb_hdr, a_hdr,
1369		                       endpnt[0], endpnt[1]);
1370
1371		// Store the color endpoint mode info
1372		info->color_endpoint_modes[i] = scb.color_formats[i];
1373		info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
1374
1375		// Store the unpacked and decoded color endpoint
1376		vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
1377		for (int j = 0; j < 2; j++)
1378		{
1379			vint4 color_lns = lns_to_sf16(endpnt[j]);
1380			vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
1381			vint4 datai = select(color_unorm, color_lns, hdr_mask);
1382			store(float16_to_float(datai), info->color_endpoints[i][j]);
1383		}
1384	}
1385
1386	// Unpack weights for each texel
1387	int weight_plane1[BLOCK_MAX_TEXELS];
1388	int weight_plane2[BLOCK_MAX_TEXELS];
1389
1390	unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
1391	for (unsigned int i = 0; i < bsd.texel_count; i++)
1392	{
1393		info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1394		if (info->is_dual_plane_block)
1395		{
1396			info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1397		}
1398	}
1399
1400	// Unpack partition assignments for each texel
1401	for (unsigned int i = 0; i < bsd.texel_count; i++)
1402	{
1403		info->partition_assignment[i] = pi.partition_of_texel[i];
1404	}
1405
1406	return ASTCENC_SUCCESS;
1407#endif
1408}
1409
1410/* See header for documentation. */
1411const char* astcenc_get_error_string(
1412	astcenc_error status
1413) {
1414	// Values in this enum are from an external user, so not guaranteed to be
1415	// bounded to the enum values
1416	switch (static_cast<int>(status))
1417	{
1418	case ASTCENC_SUCCESS:
1419		return "ASTCENC_SUCCESS";
1420	case ASTCENC_ERR_OUT_OF_MEM:
1421		return "ASTCENC_ERR_OUT_OF_MEM";
1422	case ASTCENC_ERR_BAD_CPU_FLOAT:
1423		return "ASTCENC_ERR_BAD_CPU_FLOAT";
1424	case ASTCENC_ERR_BAD_PARAM:
1425		return "ASTCENC_ERR_BAD_PARAM";
1426	case ASTCENC_ERR_BAD_BLOCK_SIZE:
1427		return "ASTCENC_ERR_BAD_BLOCK_SIZE";
1428	case ASTCENC_ERR_BAD_PROFILE:
1429		return "ASTCENC_ERR_BAD_PROFILE";
1430	case ASTCENC_ERR_BAD_QUALITY:
1431		return "ASTCENC_ERR_BAD_QUALITY";
1432	case ASTCENC_ERR_BAD_FLAGS:
1433		return "ASTCENC_ERR_BAD_FLAGS";
1434	case ASTCENC_ERR_BAD_SWIZZLE:
1435		return "ASTCENC_ERR_BAD_SWIZZLE";
1436	case ASTCENC_ERR_BAD_CONTEXT:
1437		return "ASTCENC_ERR_BAD_CONTEXT";
1438	case ASTCENC_ERR_NOT_IMPLEMENTED:
1439		return "ASTCENC_ERR_NOT_IMPLEMENTED";
1440	case ASTCENC_ERR_BAD_DECODE_MODE:
1441		return "ASTCENC_ERR_BAD_DECODE_MODE";
1442#if defined(ASTCENC_DIAGNOSTICS)
1443	case ASTCENC_ERR_DTRACE_FAILURE:
1444		return "ASTCENC_ERR_DTRACE_FAILURE";
1445#endif
1446	default:
1447		return nullptr;
1448	}
1449}
1450