1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions for the library entrypoint.
20  */
21 
22 #include <array>
23 #include <cstring>
24 #include <new>
25 
26 #include "astcenc.h"
27 #include "astcenc_internal_entry.h"
28 #include "astcenc_diagnostic_trace.h"
29 
/**
 * @brief Record of the quality tuning parameter values.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
 * for the more thorough search presets because the underlying db_limit is so much higher.
 *
 * The preset tables in this file use positional aggregate initialization, so the member order
 * here must stay in step with the column order of those tables.
 */
struct astcenc_preset_config
{
	// Quality level of this row; astcenc_config_init() compares it against the requested quality
	// to pick a row, or a pair of adjacent rows to interpolate between
	float quality;

	// Integer settings; copied (or interpolated and rounded) into the astcenc_config fields of
	// the same name
	unsigned int tune_partition_count_limit;
	unsigned int tune_2partition_index_limit;
	unsigned int tune_3partition_index_limit;
	unsigned int tune_4partition_index_limit;
	unsigned int tune_block_mode_limit;
	unsigned int tune_refinement_limit;
	unsigned int tune_candidate_limit;
	unsigned int tune_2partitioning_candidate_limit;
	unsigned int tune_3partitioning_candidate_limit;
	unsigned int tune_4partitioning_candidate_limit;

	// The two bases combine into a single astcenc_config tune_db_limit value, computed as
	// max(a_base - 35 * log10(texels), b_base - 19 * log10(texels))
	float tune_db_limit_a_base;
	float tune_db_limit_b_base;

	// Floating point settings; copied (or interpolated) into the astcenc_config fields of the
	// same name
	float tune_mse_overshoot;
	float tune_2partition_early_out_limit_factor;
	float tune_3partition_early_out_limit_factor;
	float tune_2plane_early_out_limit_correlation;
	float tune_search_mode0_enable;
};
60 
/**
 * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
 *
 * Each entry is one @c astcenc_preset_config written positionally. The first line of an entry is
 * the quality level, and the second line lists the remaining members in declaration order:
 *
 *     partition_count_limit, 2partition_index_limit, 3partition_index_limit,
 *     4partition_index_limit, block_mode_limit, refinement_limit, candidate_limit,
 *     2partitioning_candidate_limit, 3partitioning_candidate_limit,
 *     4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base, mse_overshoot,
 *     2partition_early_out_limit_factor, 3partition_early_out_limit_factor,
 *     2plane_early_out_limit_correlation, search_mode0_enable
 *
 * astcenc_config_init() scans the entries front to back for the first one whose quality is not
 * below the requested quality, so entries must be kept in ascending quality order.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
	}, {
		ASTCENC_PRE_FAST,
		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
	}
}};
85 
/**
 * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
 *
 * Each entry is one @c astcenc_preset_config written positionally. The first line of an entry is
 * the quality level, and the second line lists the remaining members in declaration order:
 *
 *     partition_count_limit, 2partition_index_limit, 3partition_index_limit,
 *     4partition_index_limit, block_mode_limit, refinement_limit, candidate_limit,
 *     2partitioning_candidate_limit, 3partitioning_candidate_limit,
 *     4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base, mse_overshoot,
 *     2partition_early_out_limit_factor, 3partition_early_out_limit_factor,
 *     2plane_early_out_limit_correlation, search_mode0_enable
 *
 * astcenc_config_init() scans the entries front to back for the first one whose quality is not
 * below the requested quality, so entries must be kept in ascending quality order.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
	}, {
		ASTCENC_PRE_FAST,
		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
	}
}};
110 
/**
 * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
 *
 * Each entry is one @c astcenc_preset_config written positionally. The first line of an entry is
 * the quality level, and the second line lists the remaining members in declaration order:
 *
 *     partition_count_limit, 2partition_index_limit, 3partition_index_limit,
 *     4partition_index_limit, block_mode_limit, refinement_limit, candidate_limit,
 *     2partitioning_candidate_limit, 3partitioning_candidate_limit,
 *     4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base, mse_overshoot,
 *     2partition_early_out_limit_factor, 3partition_early_out_limit_factor,
 *     2plane_early_out_limit_correlation, search_mode0_enable
 *
 * astcenc_config_init() scans the entries front to back for the first one whose quality is not
 * below the requested quality, so entries must be kept in ascending quality order.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
	}, {
		ASTCENC_PRE_FAST,
		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
	}
}};
135 
136 /**
137  * @brief Validate CPU floating point meets assumptions made in the codec.
138  *
139  * The codec is written with the assumption that a float threaded through the @c if32 union will be
140  * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
141  * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
142  * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
143  *
144  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
145  */
validate_cpu_float()146 static astcenc_error validate_cpu_float()
147 {
148 	if32 p;
149 	volatile float xprec_testval = 2.51f;
150 	p.f = xprec_testval + 12582912.0f;
151 	float q = p.f - 12582912.0f;
152 
153 	if (q != 3.0f)
154 	{
155 		return ASTCENC_ERR_BAD_CPU_FLOAT;
156 	}
157 
158 	return ASTCENC_SUCCESS;
159 }
160 
161 /**
162  * @brief Validate config profile.
163  *
164  * @param profile   The profile to check.
165  *
166  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
167  */
validate_profile( astcenc_profile profile )168 static astcenc_error validate_profile(
169 	astcenc_profile profile
170 ) {
171 	// Values in this enum are from an external user, so not guaranteed to be
172 	// bounded to the enum values
173 	switch (static_cast<int>(profile))
174 	{
175 	case ASTCENC_PRF_LDR_SRGB:
176 	case ASTCENC_PRF_LDR:
177 	case ASTCENC_PRF_HDR_RGB_LDR_A:
178 	case ASTCENC_PRF_HDR:
179 		return ASTCENC_SUCCESS;
180 	default:
181 		return ASTCENC_ERR_BAD_PROFILE;
182 	}
183 }
184 
185 /**
186  * @brief Validate block size.
187  *
188  * @param block_x   The block x dimensions.
189  * @param block_y   The block y dimensions.
190  * @param block_z   The block z dimensions.
191  *
192  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
193  */
validate_block_size( unsigned int block_x, unsigned int block_y, unsigned int block_z )194 static astcenc_error validate_block_size(
195 	unsigned int block_x,
196 	unsigned int block_y,
197 	unsigned int block_z
198 ) {
199 	// Test if this is a legal block size at all
200 	bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
201 	                 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
202 	if (!is_legal)
203 	{
204 		return ASTCENC_ERR_BAD_BLOCK_SIZE;
205 	}
206 
207 	// Test if this build has sufficient capacity for this block size
208 	bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
209 	if (!have_capacity)
210 	{
211 		return ASTCENC_ERR_NOT_IMPLEMENTED;
212 	}
213 
214 	return ASTCENC_SUCCESS;
215 }
216 
217 /**
218  * @brief Validate flags.
219  *
220  * @param profile   The profile to check.
221  * @param flags     The flags to check.
222  *
223  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
224  */
validate_flags( astcenc_profile profile, unsigned int flags )225 static astcenc_error validate_flags(
226 	astcenc_profile profile,
227 	unsigned int flags
228 ) {
229 	// Flags field must not contain any unknown flag bits
230 	unsigned int exMask = ~ASTCENC_ALL_FLAGS;
231 	if (popcount(flags & exMask) != 0)
232 	{
233 		return ASTCENC_ERR_BAD_FLAGS;
234 	}
235 
236 	// Flags field must only contain at most a single map type
237 	exMask = ASTCENC_FLG_MAP_NORMAL
238 	       | ASTCENC_FLG_MAP_RGBM;
239 	if (popcount(flags & exMask) > 1)
240 	{
241 		return ASTCENC_ERR_BAD_FLAGS;
242 	}
243 
244 	// Decode_unorm8 must only be used with an LDR profile
245 	bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
246 	bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
247 	if (is_unorm8 && is_hdr)
248 	{
249 		return ASTCENC_ERR_BAD_DECODE_MODE;
250 	}
251 
252 	return ASTCENC_SUCCESS;
253 }
254 
255 #if !defined(ASTCENC_DECOMPRESS_ONLY)
256 
257 /**
258  * @brief Validate single channel compression swizzle.
259  *
260  * @param swizzle   The swizzle to check.
261  *
262  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
263  */
validate_compression_swz( astcenc_swz swizzle )264 static astcenc_error validate_compression_swz(
265 	astcenc_swz swizzle
266 ) {
267 	// Not all enum values are handled; SWZ_Z is invalid for compression
268 	switch (static_cast<int>(swizzle))
269 	{
270 	case ASTCENC_SWZ_R:
271 	case ASTCENC_SWZ_G:
272 	case ASTCENC_SWZ_B:
273 	case ASTCENC_SWZ_A:
274 	case ASTCENC_SWZ_0:
275 	case ASTCENC_SWZ_1:
276 		return ASTCENC_SUCCESS;
277 	default:
278 		return ASTCENC_ERR_BAD_SWIZZLE;
279 	}
280 }
281 
282 /**
283  * @brief Validate overall compression swizzle.
284  *
285  * @param swizzle   The swizzle to check.
286  *
287  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
288  */
validate_compression_swizzle( const astcenc_swizzle& swizzle )289 static astcenc_error validate_compression_swizzle(
290 	const astcenc_swizzle& swizzle
291 ) {
292 	if (validate_compression_swz(swizzle.r) ||
293 	    validate_compression_swz(swizzle.g) ||
294 	    validate_compression_swz(swizzle.b) ||
295 	    validate_compression_swz(swizzle.a))
296 	{
297 		return ASTCENC_ERR_BAD_SWIZZLE;
298 	}
299 
300 	return ASTCENC_SUCCESS;
301 }
302 #endif
303 
304 /**
305  * @brief Validate single channel decompression swizzle.
306  *
307  * @param swizzle   The swizzle to check.
308  *
309  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
310  */
validate_decompression_swz( astcenc_swz swizzle )311 static astcenc_error validate_decompression_swz(
312 	astcenc_swz swizzle
313 ) {
314 	// Values in this enum are from an external user, so not guaranteed to be
315 	// bounded to the enum values
316 	switch (static_cast<int>(swizzle))
317 	{
318 	case ASTCENC_SWZ_R:
319 	case ASTCENC_SWZ_G:
320 	case ASTCENC_SWZ_B:
321 	case ASTCENC_SWZ_A:
322 	case ASTCENC_SWZ_0:
323 	case ASTCENC_SWZ_1:
324 	case ASTCENC_SWZ_Z:
325 		return ASTCENC_SUCCESS;
326 	default:
327 		return ASTCENC_ERR_BAD_SWIZZLE;
328 	}
329 }
330 
331 /**
332  * @brief Validate overall decompression swizzle.
333  *
334  * @param swizzle   The swizzle to check.
335  *
336  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
337  */
validate_decompression_swizzle( const astcenc_swizzle& swizzle )338 static astcenc_error validate_decompression_swizzle(
339 	const astcenc_swizzle& swizzle
340 ) {
341 	if (validate_decompression_swz(swizzle.r) ||
342 	    validate_decompression_swz(swizzle.g) ||
343 	    validate_decompression_swz(swizzle.b) ||
344 	    validate_decompression_swz(swizzle.a))
345 	{
346 		return ASTCENC_ERR_BAD_SWIZZLE;
347 	}
348 
349 	return ASTCENC_SUCCESS;
350 }
351 
352 /**
353  * Validate that an incoming configuration is in-spec.
354  *
355  * This function can respond in two ways:
356  *
357  *   * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
358  *     for out-of-range inputs in this case.
359  *   * Numerical inputs and logic inputs are are logically invalid and which make no sense
360  *     algorithmically will return an error.
361  *
362  * @param[in,out] config   The input compressor configuration.
363  *
364  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
365  */
validate_config( astcenc_config &config )366 static astcenc_error validate_config(
367 	astcenc_config &config
368 ) {
369 	astcenc_error status;
370 
371 	status = validate_profile(config.profile);
372 	if (status != ASTCENC_SUCCESS)
373 	{
374 		return status;
375 	}
376 
377 	status = validate_flags(config.profile, config.flags);
378 	if (status != ASTCENC_SUCCESS)
379 	{
380 		return status;
381 	}
382 
383 	status = validate_block_size(config.block_x, config.block_y, config.block_z);
384 	if (status != ASTCENC_SUCCESS)
385 	{
386 		return status;
387 	}
388 
389 #if defined(ASTCENC_DECOMPRESS_ONLY)
390 	// Decompress-only builds only support decompress-only contexts
391 	if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
392 	{
393 		return ASTCENC_ERR_BAD_PARAM;
394 	}
395 #endif
396 
397 	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
398 
399 	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
400 	config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
401 	config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
402 	config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
403 	config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
404 	config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
405 	config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
406 	config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
407 	config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
408 	config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
409 	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
410 	config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
411 	config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f);
412 	config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
413 	config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
414 
415 	// Specifying a zero weight color component is not allowed; force to small value
416 	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
417 	                             astc::max(config.cw_b_weight, config.cw_a_weight));
418 	if (max_weight > 0.0f)
419 	{
420 		max_weight /= 1000.0f;
421 		config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
422 		config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
423 		config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
424 		config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
425 	}
426 	// If all color components error weights are zero then return an error
427 	else
428 	{
429 		return ASTCENC_ERR_BAD_PARAM;
430 	}
431 
432 	return ASTCENC_SUCCESS;
433 }
434 
435 /* See header for documentation. */
astcenc_config_init( astcenc_profile profile, unsigned int block_x, unsigned int block_y, unsigned int block_z, float quality, unsigned int flags, astcenc_config* configp )436 astcenc_error astcenc_config_init(
437 	astcenc_profile profile,
438 	unsigned int block_x,
439 	unsigned int block_y,
440 	unsigned int block_z,
441 	float quality,
442 	unsigned int flags,
443 	astcenc_config* configp
444 ) {
445 	astcenc_error status;
446 
447 	status = validate_cpu_float();
448 	if (status != ASTCENC_SUCCESS)
449 	{
450 		return status;
451 	}
452 
453 	// Zero init all config fields; although most of will be over written
454 	astcenc_config& config = *configp;
455 	std::memset(&config, 0, sizeof(config));
456 
457 	// Process the block size
458 	block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
459 	status = validate_block_size(block_x, block_y, block_z);
460 	if (status != ASTCENC_SUCCESS)
461 	{
462 		return status;
463 	}
464 
465 	config.block_x = block_x;
466 	config.block_y = block_y;
467 	config.block_z = block_z;
468 
469 	float texels = static_cast<float>(block_x * block_y * block_z);
470 	float ltexels = logf(texels) / logf(10.0f);
471 
472 	// Process the performance quality level or preset; note that this must be done before we
473 	// process any additional settings, such as color profile and flags, which may replace some of
474 	// these settings with more use case tuned values
475 	if (quality < ASTCENC_PRE_FASTEST ||
476 	    quality > ASTCENC_PRE_EXHAUSTIVE)
477 	{
478 		return ASTCENC_ERR_BAD_QUALITY;
479 	}
480 
481 	static const std::array<astcenc_preset_config, 6>* preset_configs;
482 	int texels_int = block_x * block_y * block_z;
483 	if (texels_int < 25)
484 	{
485 		preset_configs = &preset_configs_high;
486 	}
487 	else if (texels_int < 64)
488 	{
489 		preset_configs = &preset_configs_mid;
490 	}
491 	else
492 	{
493 		preset_configs = &preset_configs_low;
494 	}
495 
496 	// Determine which preset to use, or which pair to interpolate
497 	size_t start;
498 	size_t end;
499 	for (end = 0; end < preset_configs->size(); end++)
500 	{
501 		if ((*preset_configs)[end].quality >= quality)
502 		{
503 			break;
504 		}
505 	}
506 
507 	start = end == 0 ? 0 : end - 1;
508 
509 	// Start and end node are the same - so just transfer the values.
510 	if (start == end)
511 	{
512 		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
513 		config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
514 		config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
515 		config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
516 		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
517 		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
518 		config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
519 		config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
520 		config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
521 		config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
522 		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
523 		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
524 
525 		config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
526 
527 		config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
528 		config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
529 		config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
530 		config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
531 	}
532 	// Start and end node are not the same - so interpolate between them
533 	else
534 	{
535 		auto& node_a = (*preset_configs)[start];
536 		auto& node_b = (*preset_configs)[end];
537 
538 		float wt_range = node_b.quality - node_a.quality;
539 		assert(wt_range > 0);
540 
541 		// Compute interpolation factors
542 		float wt_node_a = (node_b.quality - quality) / wt_range;
543 		float wt_node_b = (quality - node_a.quality) / wt_range;
544 
545 		#define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
546 		#define LERPI(param) astc::flt2int_rtn(\
547 		                         (static_cast<float>(node_a.param) * wt_node_a) + \
548 		                         (static_cast<float>(node_b.param) * wt_node_b))
549 		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
550 
551 		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
552 		config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
553 		config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
554 		config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
555 		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
556 		config.tune_refinement_limit = LERPI(tune_refinement_limit);
557 		config.tune_candidate_limit = LERPUI(tune_candidate_limit);
558 		config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
559 		config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
560 		config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
561 		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
562 		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);
563 
564 		config.tune_mse_overshoot = LERP(tune_mse_overshoot);
565 
566 		config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
567 		config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
568 		config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
569 		config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
570 		#undef LERP
571 		#undef LERPI
572 		#undef LERPUI
573 	}
574 
575 	// Set heuristics to the defaults for each color profile
576 	config.cw_r_weight = 1.0f;
577 	config.cw_g_weight = 1.0f;
578 	config.cw_b_weight = 1.0f;
579 	config.cw_a_weight = 1.0f;
580 
581 	config.a_scale_radius = 0;
582 
583 	config.rgbm_m_scale = 0.0f;
584 
585 	config.profile = profile;
586 
587 	// Values in this enum are from an external user, so not guaranteed to be
588 	// bounded to the enum values
589 	switch (static_cast<int>(profile))
590 	{
591 	case ASTCENC_PRF_LDR:
592 	case ASTCENC_PRF_LDR_SRGB:
593 		break;
594 	case ASTCENC_PRF_HDR_RGB_LDR_A:
595 	case ASTCENC_PRF_HDR:
596 		config.tune_db_limit = 999.0f;
597 		config.tune_search_mode0_enable = 0.0f;
598 		break;
599 	default:
600 		return ASTCENC_ERR_BAD_PROFILE;
601 	}
602 
603 	// Flags field must not contain any unknown flag bits
604 	status = validate_flags(profile, flags);
605 	if (status != ASTCENC_SUCCESS)
606 	{
607 		return status;
608 	}
609 
610 	if (flags & ASTCENC_FLG_MAP_NORMAL)
611 	{
612 		// Normal map encoding uses L+A blocks, so allow one more partitioning
613 		// than normal. We need need fewer bits for endpoints, so more likely
614 		// to be able to use more partitions than an RGB/RGBA block
615 		config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
616 
617 		config.cw_g_weight = 0.0f;
618 		config.cw_b_weight = 0.0f;
619 		config.tune_2partition_early_out_limit_factor *= 1.5f;
620 		config.tune_3partition_early_out_limit_factor *= 1.5f;
621 		config.tune_2plane_early_out_limit_correlation = 0.99f;
622 
623 		// Normals are prone to blocking artifacts on smooth curves
624 		// so force compressor to try harder here ...
625 		config.tune_db_limit *= 1.03f;
626 	}
627 	else if (flags & ASTCENC_FLG_MAP_RGBM)
628 	{
629 		config.rgbm_m_scale = 5.0f;
630 		config.cw_a_weight = 2.0f * config.rgbm_m_scale;
631 	}
632 	else // (This is color data)
633 	{
634 		// This is a very basic perceptual metric for RGB color data, which weights error
635 		// significance by the perceptual luminance contribution of each color channel. For
636 		// luminance the usual weights to compute luminance from a linear RGB value are as
637 		// follows:
638 		//
639 		//     l = r * 0.3 + g * 0.59 + b * 0.11
640 		//
641 		// ... but we scale these up to keep a better balance between color and alpha. Note
642 		// that if the content is using alpha we'd recommend using the -a option to weight
643 		// the color contribution by the alpha transparency.
644 		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
645 		{
646 			config.cw_r_weight = 0.30f * 2.25f;
647 			config.cw_g_weight = 0.59f * 2.25f;
648 			config.cw_b_weight = 0.11f * 2.25f;
649 		}
650 	}
651 	config.flags = flags;
652 
653 	return ASTCENC_SUCCESS;
654 }
655 
656 /* See header for documentation. */
astcenc_context_alloc( const astcenc_config* configp, unsigned int thread_count, astcenc_context** context )657 astcenc_error astcenc_context_alloc(
658 	const astcenc_config* configp,
659 	unsigned int thread_count,
660 	astcenc_context** context
661 ) {
662 	astcenc_error status;
663 	const astcenc_config& config = *configp;
664 
665 	status = validate_cpu_float();
666 	if (status != ASTCENC_SUCCESS)
667 	{
668 		return status;
669 	}
670 
671 	if (thread_count == 0)
672 	{
673 		return ASTCENC_ERR_BAD_PARAM;
674 	}
675 
676 #if defined(ASTCENC_DIAGNOSTICS)
677 	// Force single threaded compressor use in diagnostic mode.
678 	if (thread_count != 1)
679 	{
680 		return ASTCENC_ERR_BAD_PARAM;
681 	}
682 #endif
683 
684 #ifndef ASTC_CUSTOMIZED_ENABLE
685 	if (config.privateProfile == CUSTOMIZED_PROFILE)
686 	{
687 		return ASTCENC_ERR_BAD_PARAM;
688 	}
689 #endif
690 
691 	astcenc_context* ctxo = new astcenc_context;
692 	astcenc_contexti* ctx = &ctxo->context;
693 	ctx->thread_count = thread_count;
694 	ctx->config = config;
695 	ctx->working_buffers = nullptr;
696 
697 	// These are allocated per-compress, as they depend on image size
698 	ctx->input_alpha_averages = nullptr;
699 
700 	// Copy the config first and validate the copy (we may modify it)
701 	status = validate_config(ctx->config);
702 	if (status != ASTCENC_SUCCESS)
703 	{
704 		delete ctxo;
705 		return status;
706 	}
707 
708 	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
709 	if (!ctx->bsd)
710 	{
711 		delete ctxo;
712 		return ASTCENC_ERR_OUT_OF_MEM;
713 	}
714 
715 	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
716 #ifdef ASTC_CUSTOMIZED_ENABLE
717 	if (!init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
718 	                           can_omit_modes,
719 	                           config.tune_partition_count_limit,
720 	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
721 	                           *ctx->bsd))
722 	{
723 		aligned_free<block_size_descriptor>(ctx->bsd);
724 		delete ctxo;
725 		*context = nullptr;
726 		return ASTCENC_ERR_DLOPEN_FAILED;
727 	}
728 #else
729 	init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
730 	                           can_omit_modes,
731 	                           config.tune_partition_count_limit,
732 	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
733 	                           *ctx->bsd);
734 #endif
735 
736 #if !defined(ASTCENC_DECOMPRESS_ONLY)
737 	// Do setup only needed by compression
738 	if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
739 	{
740 		// Turn a dB limit into a per-texel error for faster use later
741 		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
742 		{
743 			ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
744 		}
745 		else
746 		{
747 			ctx->config.tune_db_limit = 0.0f;
748 		}
749 
750 		size_t worksize = sizeof(compression_working_buffers) * thread_count;
751 		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
752 		static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
753 		              "compression_working_buffers size must be multiple of vector alignment");
754 		if (!ctx->working_buffers)
755 		{
756 			aligned_free<block_size_descriptor>(ctx->bsd);
757 			delete ctxo;
758 			*context = nullptr;
759 			return ASTCENC_ERR_OUT_OF_MEM;
760 		}
761 	}
762 #endif
763 
764 #if defined(ASTCENC_DIAGNOSTICS)
765 	ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
766 	if (!ctx->trace_log->m_file)
767 	{
768 		return ASTCENC_ERR_DTRACE_FAILURE;
769 	}
770 
771 	trace_add_data("block_x", config.block_x);
772 	trace_add_data("block_y", config.block_y);
773 	trace_add_data("block_z", config.block_z);
774 #endif
775 
776 	*context = ctxo;
777 
778 #if !defined(ASTCENC_DECOMPRESS_ONLY)
779 	prepare_angular_tables();
780 #endif
781 
782 	return ASTCENC_SUCCESS;
783 }
784 
785 /* See header dor documentation. */
astcenc_context_free( astcenc_context* ctxo )786 void astcenc_context_free(
787 	astcenc_context* ctxo
788 ) {
789 	if (ctxo)
790 	{
791 		astcenc_contexti* ctx = &ctxo->context;
792 		if (ctx->working_buffers)
793 		{
794 			aligned_free<compression_working_buffers>(ctx->working_buffers);
795 		}
796 		else
797 		{
798 			printf("ctx->working_buffers is nullptr !!\n");
799 		}
800 		if (ctx->bsd)
801 		{
802 			aligned_free<block_size_descriptor>(ctx->bsd);
803 		}
804 		else
805 		{
806 			printf("ctx->bsd is nullptr !!\n");
807 		}
808 #if defined(ASTCENC_DIAGNOSTICS)
809 		delete ctx->trace_log;
810 #endif
811 		delete ctxo;
812 	}
813 }
814 
815 #if !defined(ASTCENC_DECOMPRESS_ONLY)
816 
817 /**
818  * @brief Compress an image, after any preflight has completed.
819  *
820  * @param[out] ctxo           The compressor context.
821  * @param      thread_index   The thread index.
 * @param      image          The input image.
823  * @param      swizzle        The input swizzle.
824  * @param[out] buffer         The output array for the compressed data.
825  */
compress_image( astcenc_context& ctxo, unsigned int thread_index, const astcenc_image& image, const astcenc_swizzle& swizzle, uint8_t* buffer, bool calQualityEnable, int32_t *mse[RGBA_COM] )826 static void compress_image(
827 	astcenc_context& ctxo,
828 	unsigned int thread_index,
829 	const astcenc_image& image,
830 	const astcenc_swizzle& swizzle,
831 #if QUALITY_CONTROL
832 	uint8_t* buffer,
833 	bool calQualityEnable,
834 	int32_t *mse[RGBA_COM]
835 #else
836 	uint8_t* buffer
837 #endif
838 ) {
839 	astcenc_contexti& ctx = ctxo.context;
840 	const block_size_descriptor& bsd = *ctx.bsd;
841 	astcenc_profile decode_mode = ctx.config.profile;
842 
843 	image_block blk;
844 
845 	int block_x = bsd.xdim;
846 	int block_y = bsd.ydim;
847 	int block_z = bsd.zdim;
848 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
849 
850 	int dim_x = image.dim_x;
851 	int dim_y = image.dim_y;
852 	int dim_z = image.dim_z;
853 
854 	int xblocks = (dim_x + block_x - 1) / block_x;
855 	int yblocks = (dim_y + block_y - 1) / block_y;
856 	int zblocks = (dim_z + block_z - 1) / block_z;
857 	int block_count = zblocks * yblocks * xblocks;
858 
859 	int row_blocks = xblocks;
860 	int plane_blocks = xblocks * yblocks;
861 
862 	blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
863 
864 	// Populate the block channel weights
865 	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
866 	                             ctx.config.cw_g_weight,
867 	                             ctx.config.cw_b_weight,
868 	                             ctx.config.cw_a_weight);
869 
870 	// Use preallocated scratch buffer
871 	auto& temp_buffers = ctx.working_buffers[thread_index];
872 
873 	// Only the first thread actually runs the initializer
874 	ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
875 
876 	// Determine if we can use an optimized load function
877 	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
878 	                 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
879 
880 	bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
881 	                 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
882 
883 	bool use_fast_load = !needs_swz && !needs_hdr &&
884 	                     block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
885 
886 	auto load_func = load_image_block;
887 	if (use_fast_load)
888 	{
889 		load_func = load_image_block_fast_ldr;
890 	}
891 
892 	// All threads run this processing loop until there is no work remaining
893 	while (true)
894 	{
895 		unsigned int count;
896 		unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
897 		if (!count)
898 		{
899 			break;
900 		}
901 
902 		for (unsigned int i = base; i < base + count; i++)
903 		{
904 			// Decode i into x, y, z block indices
905 			int z = i / plane_blocks;
906 			unsigned int rem = i - (z * plane_blocks);
907 			int y = rem / row_blocks;
908 			int x = rem - (y * row_blocks);
909 
910 			// Test if we can apply some basic alpha-scale RDO
911 			bool use_full_block = true;
912 			if (ctx.config.a_scale_radius != 0 && block_z == 1)
913 			{
914 				int start_x = x * block_x;
915 				int end_x = astc::min(dim_x, start_x + block_x);
916 
917 				int start_y = y * block_y;
918 				int end_y = astc::min(dim_y, start_y + block_y);
919 
920 				// SATs accumulate error, so don't test exactly zero. Test for
921 				// less than 1 alpha in the expanded block footprint that
922 				// includes the alpha radius.
923 				int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
924 
925 				int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
926 
927 				float footprint = static_cast<float>(x_footprint * y_footprint);
928 				float threshold = 0.9f / (255.0f * footprint);
929 
930 				// Do we have any alpha values?
931 				use_full_block = false;
932 				for (int ay = start_y; ay < end_y; ay++)
933 				{
934 					for (int ax = start_x; ax < end_x; ax++)
935 					{
936 						float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
937 						if (a_avg > threshold)
938 						{
939 							use_full_block = true;
940 							ax = end_x;
941 							ay = end_y;
942 						}
943 					}
944 				}
945 			}
946 
947 			// Fetch the full block for compression
948 			if (use_full_block)
949 			{
950 				load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
951 
952 				// Scale RGB error contribution by the maximum alpha in the block
953 				// This encourages preserving alpha accuracy in regions with high
954 				// transparency, and can buy up to 0.5 dB PSNR.
955 				if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
956 				{
957 					float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
958 					blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
959 					                             ctx.config.cw_g_weight * alpha_scale,
960 					                             ctx.config.cw_b_weight * alpha_scale,
961 					                             ctx.config.cw_a_weight);
962 				}
963 			}
964 			// Apply alpha scale RDO - substitute constant color block
965 			else
966 			{
967 				blk.origin_texel = vfloat4::zero();
968 				blk.data_min = vfloat4::zero();
969 				blk.data_mean = vfloat4::zero();
970 				blk.data_max = vfloat4::zero();
971 				blk.grayscale = true;
972 			}
973 
974 			int offset = ((z * yblocks + y) * xblocks + x) * 16;
975 			uint8_t *bp = buffer + offset;
976 #if QUALITY_CONTROL
977 			int32_t *mseBlock[RGBA_COM] = {nullptr, nullptr, nullptr, nullptr};
978 			if (calQualityEnable) {
979 				offset = (z * yblocks + y) * xblocks + x;
980 				mseBlock[R_COM] = mse[R_COM] + offset;
981 				mseBlock[G_COM] = mse[G_COM] + offset;
982 				mseBlock[B_COM] = mse[B_COM] + offset;
983 				mseBlock[A_COM] = mse[A_COM] + offset;
984 			}
985 			compress_block(ctx, blk, bp, temp_buffers, calQualityEnable, mseBlock);
986 #else
987 			compress_block(ctx, blk, bp, temp_buffers);
988 #endif
989 		}
990 
991 		ctxo.manage_compress.complete_task_assignment(count);
992 	}
993 }
994 
995 /**
996  * @brief Compute regional averages in an image.
997  *
998  * This function can be called by multiple threads, but only after a single
999  * thread calls the setup function @c init_compute_averages().
1000  *
1001  * Results are written back into @c img->input_alpha_averages.
1002  *
1003  * @param[out] ctx   The context.
1004  * @param      ag    The average and variance arguments created during setup.
1005  */
compute_averages( astcenc_context& ctx, const avg_args &ag )1006 static void compute_averages(
1007 	astcenc_context& ctx,
1008 	const avg_args &ag
1009 ) {
1010 	pixel_region_args arg = ag.arg;
1011 	arg.work_memory = new vfloat4[ag.work_memory_size];
1012 
1013 	int size_x = ag.img_size_x;
1014 	int size_y = ag.img_size_y;
1015 	int size_z = ag.img_size_z;
1016 
1017 	int step_xy = ag.blk_size_xy;
1018 	int step_z = ag.blk_size_z;
1019 
1020 	int y_tasks = (size_y + step_xy - 1) / step_xy;
1021 
1022 	// All threads run this processing loop until there is no work remaining
1023 	while (true)
1024 	{
1025 		unsigned int count;
1026 		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
1027 		if (!count)
1028 		{
1029 			break;
1030 		}
1031 
1032 		for (unsigned int i = base; i < base + count; i++)
1033 		{
1034 			int z = (i / (y_tasks)) * step_z;
1035 			int y = (i - (z * y_tasks)) * step_xy;
1036 
1037 			arg.size_z = astc::min(step_z, size_z - z);
1038 			arg.offset_z = z;
1039 
1040 			arg.size_y = astc::min(step_xy, size_y - y);
1041 			arg.offset_y = y;
1042 
1043 			for (int x = 0; x < size_x; x += step_xy)
1044 			{
1045 				arg.size_x = astc::min(step_xy, size_x - x);
1046 				arg.offset_x = x;
1047 				compute_pixel_region_variance(ctx.context, arg);
1048 			}
1049 		}
1050 
1051 		ctx.manage_avg.complete_task_assignment(count);
1052 	}
1053 
1054 	delete[] arg.work_memory;
1055 }
1056 
1057 #endif
1058 
1059 /* See header for documentation. */
astcenc_compress_image( astcenc_context* ctxo, astcenc_image* imagep, const astcenc_swizzle* swizzle, uint8_t* data_out, size_t data_len, bool calQualityEnable, int32_t *mse[RGBA_COM], unsigned int thread_index )1060 astcenc_error astcenc_compress_image(
1061 	astcenc_context* ctxo,
1062 	astcenc_image* imagep,
1063 	const astcenc_swizzle* swizzle,
1064 	uint8_t* data_out,
1065 	size_t data_len,
1066 #if QUALITY_CONTROL
1067 	bool calQualityEnable,
1068 	int32_t *mse[RGBA_COM],
1069 #endif
1070 	unsigned int thread_index
1071 ) {
1072 #if defined(ASTCENC_DECOMPRESS_ONLY)
1073 	(void)ctxo;
1074 	(void)imagep;
1075 	(void)swizzle;
1076 	(void)data_out;
1077 	(void)data_len;
1078 	(void)thread_index;
1079 	return ASTCENC_ERR_BAD_CONTEXT;
1080 #else
1081 	astcenc_contexti* ctx = &ctxo->context;
1082 	astcenc_error status;
1083 	astcenc_image& image = *imagep;
1084 
1085 	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1086 	{
1087 		return ASTCENC_ERR_BAD_CONTEXT;
1088 	}
1089 
1090 	status = validate_compression_swizzle(*swizzle);
1091 	if (status != ASTCENC_SUCCESS)
1092 	{
1093 		return status;
1094 	}
1095 
1096 	if (thread_index >= ctx->thread_count)
1097 	{
1098 		return ASTCENC_ERR_BAD_PARAM;
1099 	}
1100 
1101 	unsigned int block_x = ctx->config.block_x;
1102 	unsigned int block_y = ctx->config.block_y;
1103 	unsigned int block_z = ctx->config.block_z;
1104 
1105 	unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
1106 	unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
1107 	unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;
1108 
1109 	// Check we have enough output space (16 bytes per block)
1110 	size_t size_needed = xblocks * yblocks * zblocks * 16;
1111 	if (data_len < size_needed)
1112 	{
1113 		return ASTCENC_ERR_OUT_OF_MEM;
1114 	}
1115 
1116 	// If context thread count is one then implicitly reset
1117 	if (ctx->thread_count == 1)
1118 	{
1119 		astcenc_compress_reset(ctxo);
1120 	}
1121 
1122 	if (ctx->config.a_scale_radius != 0)
1123 	{
1124 		// First thread to enter will do setup, other threads will subsequently
1125 		// enter the critical section but simply skip over the initialization
1126 		auto init_avg = [ctx, &image, swizzle]() {
1127 			// Perform memory allocations for the destination buffers
1128 			size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
1129 			ctx->input_alpha_averages = new float[texel_count];
1130 
1131 			return init_compute_averages(
1132 				image, ctx->config.a_scale_radius, *swizzle,
1133 				ctx->avg_preprocess_args);
1134 		};
1135 
1136 		// Only the first thread actually runs the initializer
1137 		ctxo->manage_avg.init(init_avg);
1138 
1139 		// All threads will enter this function and dynamically grab work
1140 		compute_averages(*ctxo, ctx->avg_preprocess_args);
1141 	}
1142 
1143 	// Wait for compute_averages to complete before compressing
1144 	ctxo->manage_avg.wait();
1145 #if QUALITY_CONTROL
1146 	compress_image(*ctxo, thread_index, image, *swizzle, data_out, calQualityEnable, mse);
1147 #else
1148 	compress_image(*ctxo, thread_index, image, *swizzle, data_out);
1149 #endif
1150 	// Wait for compress to complete before freeing memory
1151 	ctxo->manage_compress.wait();
1152 
1153 	auto term_compress = [ctx]() {
1154 		delete[] ctx->input_alpha_averages;
1155 		ctx->input_alpha_averages = nullptr;
1156 	};
1157 
1158 	// Only the first thread to arrive actually runs the term
1159 	ctxo->manage_compress.term(term_compress);
1160 
1161 	return ASTCENC_SUCCESS;
1162 #endif
1163 }
1164 
1165 /* See header for documentation. */
astcenc_compress_reset( astcenc_context* ctxo )1166 astcenc_error astcenc_compress_reset(
1167 	astcenc_context* ctxo
1168 ) {
1169 #if defined(ASTCENC_DECOMPRESS_ONLY)
1170 	(void)ctxo;
1171 	return ASTCENC_ERR_BAD_CONTEXT;
1172 #else
1173 	astcenc_contexti* ctx = &ctxo->context;
1174 	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1175 	{
1176 		return ASTCENC_ERR_BAD_CONTEXT;
1177 	}
1178 
1179 	ctxo->manage_avg.reset();
1180 	ctxo->manage_compress.reset();
1181 	return ASTCENC_SUCCESS;
1182 #endif
1183 }
1184 
1185 /* See header for documentation. */
astcenc_decompress_image( astcenc_context* ctxo, const uint8_t* data, size_t data_len, astcenc_image* image_outp, const astcenc_swizzle* swizzle, unsigned int thread_index )1186 astcenc_error astcenc_decompress_image(
1187 	astcenc_context* ctxo,
1188 	const uint8_t* data,
1189 	size_t data_len,
1190 	astcenc_image* image_outp,
1191 	const astcenc_swizzle* swizzle,
1192 	unsigned int thread_index
1193 ) {
1194 	astcenc_error status;
1195 	astcenc_image& image_out = *image_outp;
1196 	astcenc_contexti* ctx = &ctxo->context;
1197 
1198 	// Today this doesn't matter (working set on stack) but might in future ...
1199 	if (thread_index >= ctx->thread_count)
1200 	{
1201 		return ASTCENC_ERR_BAD_PARAM;
1202 	}
1203 
1204 	status = validate_decompression_swizzle(*swizzle);
1205 	if (status != ASTCENC_SUCCESS)
1206 	{
1207 		return status;
1208 	}
1209 
1210 	unsigned int block_x = ctx->config.block_x;
1211 	unsigned int block_y = ctx->config.block_y;
1212 	unsigned int block_z = ctx->config.block_z;
1213 
1214 	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
1215 	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
1216 	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
1217 	unsigned int block_count = zblocks * yblocks * xblocks;
1218 
1219 	int row_blocks = xblocks;
1220 	int plane_blocks = xblocks * yblocks;
1221 
1222 	// Check we have enough output space (16 bytes per block)
1223 	size_t size_needed = xblocks * yblocks * zblocks * 16;
1224 	if (data_len < size_needed)
1225 	{
1226 		return ASTCENC_ERR_OUT_OF_MEM;
1227 	}
1228 
1229 	image_block blk;
1230 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
1231 
1232 	// Decode mode inferred from the output data type
1233 	blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
1234 
1235 	// If context thread count is one then implicitly reset
1236 	if (ctx->thread_count == 1)
1237 	{
1238 		astcenc_decompress_reset(ctxo);
1239 	}
1240 
1241 	// Only the first thread actually runs the initializer
1242 	ctxo->manage_decompress.init(block_count, nullptr);
1243 
1244 	// All threads run this processing loop until there is no work remaining
1245 	while (true)
1246 	{
1247 		unsigned int count;
1248 		unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
1249 		if (!count)
1250 		{
1251 			break;
1252 		}
1253 
1254 		for (unsigned int i = base; i < base + count; i++)
1255 		{
1256 			// Decode i into x, y, z block indices
1257 			int z = i / plane_blocks;
1258 			unsigned int rem = i - (z * plane_blocks);
1259 			int y = rem / row_blocks;
1260 			int x = rem - (y * row_blocks);
1261 
1262 			unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
1263 			const uint8_t* bp = data + offset;
1264 
1265 			symbolic_compressed_block scb;
1266 
1267 			physical_to_symbolic(*ctx->bsd, bp, scb);
1268 
1269 			decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
1270 			                          x * block_x, y * block_y, z * block_z,
1271 			                          scb, blk);
1272 
1273 			store_image_block(image_out, blk, *ctx->bsd,
1274 			                  x * block_x, y * block_y, z * block_z, *swizzle);
1275 		}
1276 
1277 		ctxo->manage_decompress.complete_task_assignment(count);
1278 	}
1279 
1280 	return ASTCENC_SUCCESS;
1281 }
1282 
1283 /* See header for documentation. */
astcenc_decompress_reset( astcenc_context* ctxo )1284 astcenc_error astcenc_decompress_reset(
1285 	astcenc_context* ctxo
1286 ) {
1287 	ctxo->manage_decompress.reset();
1288 	return ASTCENC_SUCCESS;
1289 }
1290 
1291 /* See header for documentation. */
astcenc_get_block_info( astcenc_context* ctxo, const uint8_t data[16], astcenc_block_info* info )1292 astcenc_error astcenc_get_block_info(
1293 	astcenc_context* ctxo,
1294 	const uint8_t data[16],
1295 	astcenc_block_info* info
1296 ) {
1297 #if defined(ASTCENC_DECOMPRESS_ONLY)
1298 	(void)ctxo;
1299 	(void)data;
1300 	(void)info;
1301 	return ASTCENC_ERR_BAD_CONTEXT;
1302 #else
1303 	astcenc_contexti* ctx = &ctxo->context;
1304 
1305 	// Decode the compressed data into a symbolic form
1306 	symbolic_compressed_block scb;
1307 	physical_to_symbolic(*ctx->bsd, data, scb);
1308 
1309 	// Fetch the appropriate partition and decimation tables
1310 	block_size_descriptor& bsd = *ctx->bsd;
1311 
1312 	// Start from a clean slate
1313 	memset(info, 0, sizeof(*info));
1314 
1315 	// Basic info we can always populate
1316 	info->profile = ctx->config.profile;
1317 
1318 	info->block_x = ctx->config.block_x;
1319 	info->block_y = ctx->config.block_y;
1320 	info->block_z = ctx->config.block_z;
1321 	info->texel_count = bsd.texel_count;
1322 
1323 	// Check for error blocks first
1324 	info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
1325 	if (info->is_error_block)
1326 	{
1327 		return ASTCENC_SUCCESS;
1328 	}
1329 
1330 	// Check for constant color blocks second
1331 	info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
1332 	                          scb.block_type == SYM_BTYPE_CONST_U16;
1333 	if (info->is_constant_block)
1334 	{
1335 		return ASTCENC_SUCCESS;
1336 	}
1337 
1338 	// Otherwise handle a full block ; known to be valid after conditions above have been checked
1339 	int partition_count = scb.partition_count;
1340 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
1341 
1342 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
1343 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
1344 
1345 	info->weight_x = di.weight_x;
1346 	info->weight_y = di.weight_y;
1347 	info->weight_z = di.weight_z;
1348 
1349 	info->is_dual_plane_block = bm.is_dual_plane != 0;
1350 
1351 	info->partition_count = scb.partition_count;
1352 	info->partition_index = scb.partition_index;
1353 	info->dual_plane_component = scb.plane2_component;
1354 
1355 	info->color_level_count = get_quant_level(scb.get_color_quant_mode());
1356 	info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
1357 
1358 	// Unpack color endpoints for each active partition
1359 	for (unsigned int i = 0; i < scb.partition_count; i++)
1360 	{
1361 		bool rgb_hdr;
1362 		bool a_hdr;
1363 		vint4 endpnt[2];
1364 
1365 		unpack_color_endpoints(ctx->config.profile,
1366 		                       scb.color_formats[i],
1367 		                       scb.color_values[i],
1368 		                       rgb_hdr, a_hdr,
1369 		                       endpnt[0], endpnt[1]);
1370 
1371 		// Store the color endpoint mode info
1372 		info->color_endpoint_modes[i] = scb.color_formats[i];
1373 		info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
1374 
1375 		// Store the unpacked and decoded color endpoint
1376 		vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
1377 		for (int j = 0; j < 2; j++)
1378 		{
1379 			vint4 color_lns = lns_to_sf16(endpnt[j]);
1380 			vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
1381 			vint4 datai = select(color_unorm, color_lns, hdr_mask);
1382 			store(float16_to_float(datai), info->color_endpoints[i][j]);
1383 		}
1384 	}
1385 
1386 	// Unpack weights for each texel
1387 	int weight_plane1[BLOCK_MAX_TEXELS];
1388 	int weight_plane2[BLOCK_MAX_TEXELS];
1389 
1390 	unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
1391 	for (unsigned int i = 0; i < bsd.texel_count; i++)
1392 	{
1393 		info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1394 		if (info->is_dual_plane_block)
1395 		{
1396 			info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1397 		}
1398 	}
1399 
1400 	// Unpack partition assignments for each texel
1401 	for (unsigned int i = 0; i < bsd.texel_count; i++)
1402 	{
1403 		info->partition_assignment[i] = pi.partition_of_texel[i];
1404 	}
1405 
1406 	return ASTCENC_SUCCESS;
1407 #endif
1408 }
1409 
1410 /* See header for documentation. */
astcenc_get_error_string( astcenc_error status )1411 const char* astcenc_get_error_string(
1412 	astcenc_error status
1413 ) {
1414 	// Values in this enum are from an external user, so not guaranteed to be
1415 	// bounded to the enum values
1416 	switch (static_cast<int>(status))
1417 	{
1418 	case ASTCENC_SUCCESS:
1419 		return "ASTCENC_SUCCESS";
1420 	case ASTCENC_ERR_OUT_OF_MEM:
1421 		return "ASTCENC_ERR_OUT_OF_MEM";
1422 	case ASTCENC_ERR_BAD_CPU_FLOAT:
1423 		return "ASTCENC_ERR_BAD_CPU_FLOAT";
1424 	case ASTCENC_ERR_BAD_PARAM:
1425 		return "ASTCENC_ERR_BAD_PARAM";
1426 	case ASTCENC_ERR_BAD_BLOCK_SIZE:
1427 		return "ASTCENC_ERR_BAD_BLOCK_SIZE";
1428 	case ASTCENC_ERR_BAD_PROFILE:
1429 		return "ASTCENC_ERR_BAD_PROFILE";
1430 	case ASTCENC_ERR_BAD_QUALITY:
1431 		return "ASTCENC_ERR_BAD_QUALITY";
1432 	case ASTCENC_ERR_BAD_FLAGS:
1433 		return "ASTCENC_ERR_BAD_FLAGS";
1434 	case ASTCENC_ERR_BAD_SWIZZLE:
1435 		return "ASTCENC_ERR_BAD_SWIZZLE";
1436 	case ASTCENC_ERR_BAD_CONTEXT:
1437 		return "ASTCENC_ERR_BAD_CONTEXT";
1438 	case ASTCENC_ERR_NOT_IMPLEMENTED:
1439 		return "ASTCENC_ERR_NOT_IMPLEMENTED";
1440 	case ASTCENC_ERR_BAD_DECODE_MODE:
1441 		return "ASTCENC_ERR_BAD_DECODE_MODE";
1442 #if defined(ASTCENC_DIAGNOSTICS)
1443 	case ASTCENC_ERR_DTRACE_FAILURE:
1444 		return "ASTCENC_ERR_DTRACE_FAILURE";
1445 #endif
1446 	default:
1447 		return nullptr;
1448 	}
1449 }
1450