1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 /**
19 * @brief Functions for the library entrypoint.
20 */
21
22 #include <array>
23 #include <cstring>
24 #include <new>
25
26 #include "astcenc.h"
27 #include "astcenc_internal_entry.h"
28 #include "astcenc_diagnostic_trace.h"
29
/**
 * @brief Record of the quality tuning parameter values.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
 * for the more thorough search presets because the underlying db_limit is so much higher.
 *
 * The member order is significant: the static preset tables in this file populate this structure
 * positionally using aggregate initialization, so members must not be reordered.
 */
struct astcenc_preset_config
{
    /** @brief The quality level of this preset; compared against the requested quality. */
    float quality;

    // Integer search limits; copied (or interpolated and rounded) into the config fields of
    // the same name by astcenc_config_init()
    unsigned int tune_partition_count_limit;
    unsigned int tune_2partition_index_limit;
    unsigned int tune_3partition_index_limit;
    unsigned int tune_4partition_index_limit;
    unsigned int tune_block_mode_limit;
    unsigned int tune_refinement_limit;
    unsigned int tune_candidate_limit;
    unsigned int tune_2partitioning_candidate_limit;
    unsigned int tune_3partitioning_candidate_limit;
    unsigned int tune_4partitioning_candidate_limit;

    // The two dB limit bases; astcenc_config_init() derives the config tune_db_limit as
    // max(a_base - 35 * log10(texels), b_base - 19 * log10(texels))
    float tune_db_limit_a_base;
    float tune_db_limit_b_base;

    // Floating point tuning factors; copied (or interpolated) into the config fields of the
    // same name by astcenc_config_init()
    float tune_mse_overshoot;
    float tune_2partition_early_out_limit_factor;
    float tune_3partition_early_out_limit_factor;
    float tune_2plane_early_out_limit_correlation;
    float tune_search_mode0_enable;
};
60
/**
 * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
 *
 * Each row is an @c astcenc_preset_config filled positionally. After the quality level, the
 * columns are, in order:
 *
 *   partition_count_limit, 2partition_index_limit, 3partition_index_limit,
 *   4partition_index_limit, block_mode_limit, refinement_limit, candidate_limit,
 *   2partitioning_candidate_limit, 3partitioning_candidate_limit,
 *   4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base, mse_overshoot,
 *   2partition_early_out_limit_factor, 3partition_early_out_limit_factor,
 *   2plane_early_out_limit_correlation, search_mode0_enable
 *
 * Rows must stay sorted by ascending quality; astcenc_config_init() scans them in order to
 * find the pair of presets to interpolate between.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
    }, {
        ASTCENC_PRE_FAST,
        3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
    }, {
        ASTCENC_PRE_MEDIUM,
        4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
    }
}};
85
/**
 * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
 *
 * Each row is an @c astcenc_preset_config filled positionally. After the quality level, the
 * columns are, in order:
 *
 *   partition_count_limit, 2partition_index_limit, 3partition_index_limit,
 *   4partition_index_limit, block_mode_limit, refinement_limit, candidate_limit,
 *   2partitioning_candidate_limit, 3partitioning_candidate_limit,
 *   4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base, mse_overshoot,
 *   2partition_early_out_limit_factor, 3partition_early_out_limit_factor,
 *   2plane_early_out_limit_correlation, search_mode0_enable
 *
 * Rows must stay sorted by ascending quality; astcenc_config_init() scans them in order to
 * find the pair of presets to interpolate between.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
    }, {
        ASTCENC_PRE_FAST,
        3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
    }, {
        ASTCENC_PRE_MEDIUM,
        3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
    }
}};
110
/**
 * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
 *
 * Each row is an @c astcenc_preset_config filled positionally. After the quality level, the
 * columns are, in order:
 *
 *   partition_count_limit, 2partition_index_limit, 3partition_index_limit,
 *   4partition_index_limit, block_mode_limit, refinement_limit, candidate_limit,
 *   2partitioning_candidate_limit, 3partitioning_candidate_limit,
 *   4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base, mse_overshoot,
 *   2partition_early_out_limit_factor, 3partition_early_out_limit_factor,
 *   2plane_early_out_limit_correlation, search_mode0_enable
 *
 * Rows must stay sorted by ascending quality; astcenc_config_init() scans them in order to
 * find the pair of presets to interpolate between.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
    }, {
        ASTCENC_PRE_FAST,
        2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
    }, {
        ASTCENC_PRE_MEDIUM,
        3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
    }
}};
135
136 /**
137 * @brief Validate CPU floating point meets assumptions made in the codec.
138 *
139 * The codec is written with the assumption that a float threaded through the @c if32 union will be
140 * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
141 * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
142 * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
143 *
144 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
145 */
validate_cpu_float()146 static astcenc_error validate_cpu_float()
147 {
148 if32 p;
149 volatile float xprec_testval = 2.51f;
150 p.f = xprec_testval + 12582912.0f;
151 float q = p.f - 12582912.0f;
152
153 if (q != 3.0f)
154 {
155 return ASTCENC_ERR_BAD_CPU_FLOAT;
156 }
157
158 return ASTCENC_SUCCESS;
159 }
160
161 /**
162 * @brief Validate config profile.
163 *
164 * @param profile The profile to check.
165 *
166 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
167 */
validate_profile( astcenc_profile profile )168 static astcenc_error validate_profile(
169 astcenc_profile profile
170 ) {
171 // Values in this enum are from an external user, so not guaranteed to be
172 // bounded to the enum values
173 switch (static_cast<int>(profile))
174 {
175 case ASTCENC_PRF_LDR_SRGB:
176 case ASTCENC_PRF_LDR:
177 case ASTCENC_PRF_HDR_RGB_LDR_A:
178 case ASTCENC_PRF_HDR:
179 return ASTCENC_SUCCESS;
180 default:
181 return ASTCENC_ERR_BAD_PROFILE;
182 }
183 }
184
185 /**
186 * @brief Validate block size.
187 *
188 * @param block_x The block x dimensions.
189 * @param block_y The block y dimensions.
190 * @param block_z The block z dimensions.
191 *
192 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
193 */
validate_block_size( unsigned int block_x, unsigned int block_y, unsigned int block_z )194 static astcenc_error validate_block_size(
195 unsigned int block_x,
196 unsigned int block_y,
197 unsigned int block_z
198 ) {
199 // Test if this is a legal block size at all
200 bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
201 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
202 if (!is_legal)
203 {
204 return ASTCENC_ERR_BAD_BLOCK_SIZE;
205 }
206
207 // Test if this build has sufficient capacity for this block size
208 bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
209 if (!have_capacity)
210 {
211 return ASTCENC_ERR_NOT_IMPLEMENTED;
212 }
213
214 return ASTCENC_SUCCESS;
215 }
216
217 /**
218 * @brief Validate flags.
219 *
220 * @param profile The profile to check.
221 * @param flags The flags to check.
222 *
223 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
224 */
validate_flags( astcenc_profile profile, unsigned int flags )225 static astcenc_error validate_flags(
226 astcenc_profile profile,
227 unsigned int flags
228 ) {
229 // Flags field must not contain any unknown flag bits
230 unsigned int exMask = ~ASTCENC_ALL_FLAGS;
231 if (popcount(flags & exMask) != 0)
232 {
233 return ASTCENC_ERR_BAD_FLAGS;
234 }
235
236 // Flags field must only contain at most a single map type
237 exMask = ASTCENC_FLG_MAP_NORMAL
238 | ASTCENC_FLG_MAP_RGBM;
239 if (popcount(flags & exMask) > 1)
240 {
241 return ASTCENC_ERR_BAD_FLAGS;
242 }
243
244 // Decode_unorm8 must only be used with an LDR profile
245 bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
246 bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
247 if (is_unorm8 && is_hdr)
248 {
249 return ASTCENC_ERR_BAD_DECODE_MODE;
250 }
251
252 return ASTCENC_SUCCESS;
253 }
254
255 #if !defined(ASTCENC_DECOMPRESS_ONLY)
256
257 /**
258 * @brief Validate single channel compression swizzle.
259 *
260 * @param swizzle The swizzle to check.
261 *
262 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
263 */
validate_compression_swz( astcenc_swz swizzle )264 static astcenc_error validate_compression_swz(
265 astcenc_swz swizzle
266 ) {
267 // Not all enum values are handled; SWZ_Z is invalid for compression
268 switch (static_cast<int>(swizzle))
269 {
270 case ASTCENC_SWZ_R:
271 case ASTCENC_SWZ_G:
272 case ASTCENC_SWZ_B:
273 case ASTCENC_SWZ_A:
274 case ASTCENC_SWZ_0:
275 case ASTCENC_SWZ_1:
276 return ASTCENC_SUCCESS;
277 default:
278 return ASTCENC_ERR_BAD_SWIZZLE;
279 }
280 }
281
282 /**
283 * @brief Validate overall compression swizzle.
284 *
285 * @param swizzle The swizzle to check.
286 *
287 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
288 */
validate_compression_swizzle( const astcenc_swizzle& swizzle )289 static astcenc_error validate_compression_swizzle(
290 const astcenc_swizzle& swizzle
291 ) {
292 if (validate_compression_swz(swizzle.r) ||
293 validate_compression_swz(swizzle.g) ||
294 validate_compression_swz(swizzle.b) ||
295 validate_compression_swz(swizzle.a))
296 {
297 return ASTCENC_ERR_BAD_SWIZZLE;
298 }
299
300 return ASTCENC_SUCCESS;
301 }
302 #endif
303
304 /**
305 * @brief Validate single channel decompression swizzle.
306 *
307 * @param swizzle The swizzle to check.
308 *
309 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
310 */
validate_decompression_swz( astcenc_swz swizzle )311 static astcenc_error validate_decompression_swz(
312 astcenc_swz swizzle
313 ) {
314 // Values in this enum are from an external user, so not guaranteed to be
315 // bounded to the enum values
316 switch (static_cast<int>(swizzle))
317 {
318 case ASTCENC_SWZ_R:
319 case ASTCENC_SWZ_G:
320 case ASTCENC_SWZ_B:
321 case ASTCENC_SWZ_A:
322 case ASTCENC_SWZ_0:
323 case ASTCENC_SWZ_1:
324 case ASTCENC_SWZ_Z:
325 return ASTCENC_SUCCESS;
326 default:
327 return ASTCENC_ERR_BAD_SWIZZLE;
328 }
329 }
330
331 /**
332 * @brief Validate overall decompression swizzle.
333 *
334 * @param swizzle The swizzle to check.
335 *
336 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
337 */
validate_decompression_swizzle( const astcenc_swizzle& swizzle )338 static astcenc_error validate_decompression_swizzle(
339 const astcenc_swizzle& swizzle
340 ) {
341 if (validate_decompression_swz(swizzle.r) ||
342 validate_decompression_swz(swizzle.g) ||
343 validate_decompression_swz(swizzle.b) ||
344 validate_decompression_swz(swizzle.a))
345 {
346 return ASTCENC_ERR_BAD_SWIZZLE;
347 }
348
349 return ASTCENC_SUCCESS;
350 }
351
352 /**
353 * Validate that an incoming configuration is in-spec.
354 *
355 * This function can respond in two ways:
356 *
357 * * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
358 * for out-of-range inputs in this case.
359 * * Numerical inputs and logic inputs are are logically invalid and which make no sense
360 * algorithmically will return an error.
361 *
362 * @param[in,out] config The input compressor configuration.
363 *
364 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
365 */
validate_config( astcenc_config &config )366 static astcenc_error validate_config(
367 astcenc_config &config
368 ) {
369 astcenc_error status;
370
371 status = validate_profile(config.profile);
372 if (status != ASTCENC_SUCCESS)
373 {
374 return status;
375 }
376
377 status = validate_flags(config.profile, config.flags);
378 if (status != ASTCENC_SUCCESS)
379 {
380 return status;
381 }
382
383 status = validate_block_size(config.block_x, config.block_y, config.block_z);
384 if (status != ASTCENC_SUCCESS)
385 {
386 return status;
387 }
388
389 #if defined(ASTCENC_DECOMPRESS_ONLY)
390 // Decompress-only builds only support decompress-only contexts
391 if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
392 {
393 return ASTCENC_ERR_BAD_PARAM;
394 }
395 #endif
396
397 config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
398
399 config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
400 config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
401 config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
402 config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
403 config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
404 config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
405 config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
406 config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
407 config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
408 config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
409 config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
410 config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
411 config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f);
412 config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
413 config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
414
415 // Specifying a zero weight color component is not allowed; force to small value
416 float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
417 astc::max(config.cw_b_weight, config.cw_a_weight));
418 if (max_weight > 0.0f)
419 {
420 max_weight /= 1000.0f;
421 config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
422 config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
423 config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
424 config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
425 }
426 // If all color components error weights are zero then return an error
427 else
428 {
429 return ASTCENC_ERR_BAD_PARAM;
430 }
431
432 return ASTCENC_SUCCESS;
433 }
434
435 /* See header for documentation. */
astcenc_config_init( astcenc_profile profile, unsigned int block_x, unsigned int block_y, unsigned int block_z, float quality, unsigned int flags, astcenc_config* configp )436 astcenc_error astcenc_config_init(
437 astcenc_profile profile,
438 unsigned int block_x,
439 unsigned int block_y,
440 unsigned int block_z,
441 float quality,
442 unsigned int flags,
443 astcenc_config* configp
444 ) {
445 astcenc_error status;
446
447 status = validate_cpu_float();
448 if (status != ASTCENC_SUCCESS)
449 {
450 return status;
451 }
452
453 // Zero init all config fields; although most of will be over written
454 astcenc_config& config = *configp;
455 std::memset(&config, 0, sizeof(config));
456
457 // Process the block size
458 block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
459 status = validate_block_size(block_x, block_y, block_z);
460 if (status != ASTCENC_SUCCESS)
461 {
462 return status;
463 }
464
465 config.block_x = block_x;
466 config.block_y = block_y;
467 config.block_z = block_z;
468
469 float texels = static_cast<float>(block_x * block_y * block_z);
470 float ltexels = logf(texels) / logf(10.0f);
471
472 // Process the performance quality level or preset; note that this must be done before we
473 // process any additional settings, such as color profile and flags, which may replace some of
474 // these settings with more use case tuned values
475 if (quality < ASTCENC_PRE_FASTEST ||
476 quality > ASTCENC_PRE_EXHAUSTIVE)
477 {
478 return ASTCENC_ERR_BAD_QUALITY;
479 }
480
481 static const std::array<astcenc_preset_config, 6>* preset_configs;
482 int texels_int = block_x * block_y * block_z;
483 if (texels_int < 25)
484 {
485 preset_configs = &preset_configs_high;
486 }
487 else if (texels_int < 64)
488 {
489 preset_configs = &preset_configs_mid;
490 }
491 else
492 {
493 preset_configs = &preset_configs_low;
494 }
495
496 // Determine which preset to use, or which pair to interpolate
497 size_t start;
498 size_t end;
499 for (end = 0; end < preset_configs->size(); end++)
500 {
501 if ((*preset_configs)[end].quality >= quality)
502 {
503 break;
504 }
505 }
506
507 start = end == 0 ? 0 : end - 1;
508
509 // Start and end node are the same - so just transfer the values.
510 if (start == end)
511 {
512 config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
513 config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
514 config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
515 config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
516 config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
517 config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
518 config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
519 config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
520 config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
521 config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
522 config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
523 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
524
525 config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
526
527 config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
528 config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
529 config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
530 config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
531 }
532 // Start and end node are not the same - so interpolate between them
533 else
534 {
535 auto& node_a = (*preset_configs)[start];
536 auto& node_b = (*preset_configs)[end];
537
538 float wt_range = node_b.quality - node_a.quality;
539 assert(wt_range > 0);
540
541 // Compute interpolation factors
542 float wt_node_a = (node_b.quality - quality) / wt_range;
543 float wt_node_b = (quality - node_a.quality) / wt_range;
544
545 #define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
546 #define LERPI(param) astc::flt2int_rtn(\
547 (static_cast<float>(node_a.param) * wt_node_a) + \
548 (static_cast<float>(node_b.param) * wt_node_b))
549 #define LERPUI(param) static_cast<unsigned int>(LERPI(param))
550
551 config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
552 config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
553 config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
554 config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
555 config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
556 config.tune_refinement_limit = LERPI(tune_refinement_limit);
557 config.tune_candidate_limit = LERPUI(tune_candidate_limit);
558 config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
559 config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
560 config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
561 config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
562 LERP(tune_db_limit_b_base) - 19 * ltexels);
563
564 config.tune_mse_overshoot = LERP(tune_mse_overshoot);
565
566 config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
567 config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
568 config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
569 config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
570 #undef LERP
571 #undef LERPI
572 #undef LERPUI
573 }
574
575 // Set heuristics to the defaults for each color profile
576 config.cw_r_weight = 1.0f;
577 config.cw_g_weight = 1.0f;
578 config.cw_b_weight = 1.0f;
579 config.cw_a_weight = 1.0f;
580
581 config.a_scale_radius = 0;
582
583 config.rgbm_m_scale = 0.0f;
584
585 config.profile = profile;
586
587 // Values in this enum are from an external user, so not guaranteed to be
588 // bounded to the enum values
589 switch (static_cast<int>(profile))
590 {
591 case ASTCENC_PRF_LDR:
592 case ASTCENC_PRF_LDR_SRGB:
593 break;
594 case ASTCENC_PRF_HDR_RGB_LDR_A:
595 case ASTCENC_PRF_HDR:
596 config.tune_db_limit = 999.0f;
597 config.tune_search_mode0_enable = 0.0f;
598 break;
599 default:
600 return ASTCENC_ERR_BAD_PROFILE;
601 }
602
603 // Flags field must not contain any unknown flag bits
604 status = validate_flags(profile, flags);
605 if (status != ASTCENC_SUCCESS)
606 {
607 return status;
608 }
609
610 if (flags & ASTCENC_FLG_MAP_NORMAL)
611 {
612 // Normal map encoding uses L+A blocks, so allow one more partitioning
613 // than normal. We need need fewer bits for endpoints, so more likely
614 // to be able to use more partitions than an RGB/RGBA block
615 config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
616
617 config.cw_g_weight = 0.0f;
618 config.cw_b_weight = 0.0f;
619 config.tune_2partition_early_out_limit_factor *= 1.5f;
620 config.tune_3partition_early_out_limit_factor *= 1.5f;
621 config.tune_2plane_early_out_limit_correlation = 0.99f;
622
623 // Normals are prone to blocking artifacts on smooth curves
624 // so force compressor to try harder here ...
625 config.tune_db_limit *= 1.03f;
626 }
627 else if (flags & ASTCENC_FLG_MAP_RGBM)
628 {
629 config.rgbm_m_scale = 5.0f;
630 config.cw_a_weight = 2.0f * config.rgbm_m_scale;
631 }
632 else // (This is color data)
633 {
634 // This is a very basic perceptual metric for RGB color data, which weights error
635 // significance by the perceptual luminance contribution of each color channel. For
636 // luminance the usual weights to compute luminance from a linear RGB value are as
637 // follows:
638 //
639 // l = r * 0.3 + g * 0.59 + b * 0.11
640 //
641 // ... but we scale these up to keep a better balance between color and alpha. Note
642 // that if the content is using alpha we'd recommend using the -a option to weight
643 // the color contribution by the alpha transparency.
644 if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
645 {
646 config.cw_r_weight = 0.30f * 2.25f;
647 config.cw_g_weight = 0.59f * 2.25f;
648 config.cw_b_weight = 0.11f * 2.25f;
649 }
650 }
651 config.flags = flags;
652
653 return ASTCENC_SUCCESS;
654 }
655
656 /* See header for documentation. */
astcenc_context_alloc( const astcenc_config* configp, unsigned int thread_count, astcenc_context** context )657 astcenc_error astcenc_context_alloc(
658 const astcenc_config* configp,
659 unsigned int thread_count,
660 astcenc_context** context
661 ) {
662 astcenc_error status;
663 const astcenc_config& config = *configp;
664
665 status = validate_cpu_float();
666 if (status != ASTCENC_SUCCESS)
667 {
668 return status;
669 }
670
671 if (thread_count == 0)
672 {
673 return ASTCENC_ERR_BAD_PARAM;
674 }
675
676 #if defined(ASTCENC_DIAGNOSTICS)
677 // Force single threaded compressor use in diagnostic mode.
678 if (thread_count != 1)
679 {
680 return ASTCENC_ERR_BAD_PARAM;
681 }
682 #endif
683
684 #ifndef ASTC_CUSTOMIZED_ENABLE
685 if (config.privateProfile == CUSTOMIZED_PROFILE)
686 {
687 return ASTCENC_ERR_BAD_PARAM;
688 }
689 #endif
690
691 astcenc_context* ctxo = new astcenc_context;
692 astcenc_contexti* ctx = &ctxo->context;
693 ctx->thread_count = thread_count;
694 ctx->config = config;
695 ctx->working_buffers = nullptr;
696
697 // These are allocated per-compress, as they depend on image size
698 ctx->input_alpha_averages = nullptr;
699
700 // Copy the config first and validate the copy (we may modify it)
701 status = validate_config(ctx->config);
702 if (status != ASTCENC_SUCCESS)
703 {
704 delete ctxo;
705 return status;
706 }
707
708 ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
709 if (!ctx->bsd)
710 {
711 delete ctxo;
712 return ASTCENC_ERR_OUT_OF_MEM;
713 }
714
715 bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
716 #ifdef ASTC_CUSTOMIZED_ENABLE
717 if (!init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
718 can_omit_modes,
719 config.tune_partition_count_limit,
720 static_cast<float>(config.tune_block_mode_limit) / 100.0f,
721 *ctx->bsd))
722 {
723 aligned_free<block_size_descriptor>(ctx->bsd);
724 delete ctxo;
725 *context = nullptr;
726 return ASTCENC_ERR_DLOPEN_FAILED;
727 }
728 #else
729 init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
730 can_omit_modes,
731 config.tune_partition_count_limit,
732 static_cast<float>(config.tune_block_mode_limit) / 100.0f,
733 *ctx->bsd);
734 #endif
735
736 #if !defined(ASTCENC_DECOMPRESS_ONLY)
737 // Do setup only needed by compression
738 if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
739 {
740 // Turn a dB limit into a per-texel error for faster use later
741 if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
742 {
743 ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
744 }
745 else
746 {
747 ctx->config.tune_db_limit = 0.0f;
748 }
749
750 size_t worksize = sizeof(compression_working_buffers) * thread_count;
751 ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
752 static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
753 "compression_working_buffers size must be multiple of vector alignment");
754 if (!ctx->working_buffers)
755 {
756 aligned_free<block_size_descriptor>(ctx->bsd);
757 delete ctxo;
758 *context = nullptr;
759 return ASTCENC_ERR_OUT_OF_MEM;
760 }
761 }
762 #endif
763
764 #if defined(ASTCENC_DIAGNOSTICS)
765 ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
766 if (!ctx->trace_log->m_file)
767 {
768 return ASTCENC_ERR_DTRACE_FAILURE;
769 }
770
771 trace_add_data("block_x", config.block_x);
772 trace_add_data("block_y", config.block_y);
773 trace_add_data("block_z", config.block_z);
774 #endif
775
776 *context = ctxo;
777
778 #if !defined(ASTCENC_DECOMPRESS_ONLY)
779 prepare_angular_tables();
780 #endif
781
782 return ASTCENC_SUCCESS;
783 }
784
785 /* See header dor documentation. */
astcenc_context_free( astcenc_context* ctxo )786 void astcenc_context_free(
787 astcenc_context* ctxo
788 ) {
789 if (ctxo)
790 {
791 astcenc_contexti* ctx = &ctxo->context;
792 if (ctx->working_buffers)
793 {
794 aligned_free<compression_working_buffers>(ctx->working_buffers);
795 }
796 else
797 {
798 printf("ctx->working_buffers is nullptr !!\n");
799 }
800 if (ctx->bsd)
801 {
802 aligned_free<block_size_descriptor>(ctx->bsd);
803 }
804 else
805 {
806 printf("ctx->bsd is nullptr !!\n");
807 }
808 #if defined(ASTCENC_DIAGNOSTICS)
809 delete ctx->trace_log;
810 #endif
811 delete ctxo;
812 }
813 }
814
815 #if !defined(ASTCENC_DECOMPRESS_ONLY)
816
817 /**
818 * @brief Compress an image, after any preflight has completed.
819 *
820 * @param[out] ctxo The compressor context.
821 * @param thread_index The thread index.
 * @param      image          The input image.
823 * @param swizzle The input swizzle.
824 * @param[out] buffer The output array for the compressed data.
825 */
compress_image( astcenc_context& ctxo, unsigned int thread_index, const astcenc_image& image, const astcenc_swizzle& swizzle, uint8_t* buffer, bool calQualityEnable, int32_t *mse[RGBA_COM] )826 static void compress_image(
827 astcenc_context& ctxo,
828 unsigned int thread_index,
829 const astcenc_image& image,
830 const astcenc_swizzle& swizzle,
831 #if QUALITY_CONTROL
832 uint8_t* buffer,
833 bool calQualityEnable,
834 int32_t *mse[RGBA_COM]
835 #else
836 uint8_t* buffer
837 #endif
838 ) {
839 astcenc_contexti& ctx = ctxo.context;
840 const block_size_descriptor& bsd = *ctx.bsd;
841 astcenc_profile decode_mode = ctx.config.profile;
842
843 image_block blk;
844
845 int block_x = bsd.xdim;
846 int block_y = bsd.ydim;
847 int block_z = bsd.zdim;
848 blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
849
850 int dim_x = image.dim_x;
851 int dim_y = image.dim_y;
852 int dim_z = image.dim_z;
853
854 int xblocks = (dim_x + block_x - 1) / block_x;
855 int yblocks = (dim_y + block_y - 1) / block_y;
856 int zblocks = (dim_z + block_z - 1) / block_z;
857 int block_count = zblocks * yblocks * xblocks;
858
859 int row_blocks = xblocks;
860 int plane_blocks = xblocks * yblocks;
861
862 blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
863
864 // Populate the block channel weights
865 blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
866 ctx.config.cw_g_weight,
867 ctx.config.cw_b_weight,
868 ctx.config.cw_a_weight);
869
870 // Use preallocated scratch buffer
871 auto& temp_buffers = ctx.working_buffers[thread_index];
872
873 // Only the first thread actually runs the initializer
874 ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
875
876 // Determine if we can use an optimized load function
877 bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
878 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
879
880 bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
881 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
882
883 bool use_fast_load = !needs_swz && !needs_hdr &&
884 block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
885
886 auto load_func = load_image_block;
887 if (use_fast_load)
888 {
889 load_func = load_image_block_fast_ldr;
890 }
891
892 // All threads run this processing loop until there is no work remaining
893 while (true)
894 {
895 unsigned int count;
896 unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
897 if (!count)
898 {
899 break;
900 }
901
902 for (unsigned int i = base; i < base + count; i++)
903 {
904 // Decode i into x, y, z block indices
905 int z = i / plane_blocks;
906 unsigned int rem = i - (z * plane_blocks);
907 int y = rem / row_blocks;
908 int x = rem - (y * row_blocks);
909
910 // Test if we can apply some basic alpha-scale RDO
911 bool use_full_block = true;
912 if (ctx.config.a_scale_radius != 0 && block_z == 1)
913 {
914 int start_x = x * block_x;
915 int end_x = astc::min(dim_x, start_x + block_x);
916
917 int start_y = y * block_y;
918 int end_y = astc::min(dim_y, start_y + block_y);
919
920 // SATs accumulate error, so don't test exactly zero. Test for
921 // less than 1 alpha in the expanded block footprint that
922 // includes the alpha radius.
923 int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
924
925 int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
926
927 float footprint = static_cast<float>(x_footprint * y_footprint);
928 float threshold = 0.9f / (255.0f * footprint);
929
930 // Do we have any alpha values?
931 use_full_block = false;
932 for (int ay = start_y; ay < end_y; ay++)
933 {
934 for (int ax = start_x; ax < end_x; ax++)
935 {
936 float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
937 if (a_avg > threshold)
938 {
939 use_full_block = true;
940 ax = end_x;
941 ay = end_y;
942 }
943 }
944 }
945 }
946
947 // Fetch the full block for compression
948 if (use_full_block)
949 {
950 load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
951
952 // Scale RGB error contribution by the maximum alpha in the block
953 // This encourages preserving alpha accuracy in regions with high
954 // transparency, and can buy up to 0.5 dB PSNR.
955 if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
956 {
957 float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
958 blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
959 ctx.config.cw_g_weight * alpha_scale,
960 ctx.config.cw_b_weight * alpha_scale,
961 ctx.config.cw_a_weight);
962 }
963 }
964 // Apply alpha scale RDO - substitute constant color block
965 else
966 {
967 blk.origin_texel = vfloat4::zero();
968 blk.data_min = vfloat4::zero();
969 blk.data_mean = vfloat4::zero();
970 blk.data_max = vfloat4::zero();
971 blk.grayscale = true;
972 }
973
974 int offset = ((z * yblocks + y) * xblocks + x) * 16;
975 uint8_t *bp = buffer + offset;
976 #if QUALITY_CONTROL
977 int32_t *mseBlock[RGBA_COM] = {nullptr, nullptr, nullptr, nullptr};
978 if (calQualityEnable) {
979 offset = (z * yblocks + y) * xblocks + x;
980 mseBlock[R_COM] = mse[R_COM] + offset;
981 mseBlock[G_COM] = mse[G_COM] + offset;
982 mseBlock[B_COM] = mse[B_COM] + offset;
983 mseBlock[A_COM] = mse[A_COM] + offset;
984 }
985 compress_block(ctx, blk, bp, temp_buffers, calQualityEnable, mseBlock);
986 #else
987 compress_block(ctx, blk, bp, temp_buffers);
988 #endif
989 }
990
991 ctxo.manage_compress.complete_task_assignment(count);
992 }
993 }
994
995 /**
996 * @brief Compute regional averages in an image.
997 *
998 * This function can be called by multiple threads, but only after a single
999 * thread calls the setup function @c init_compute_averages().
1000 *
1001 * Results are written back into @c img->input_alpha_averages.
1002 *
1003 * @param[out] ctx The context.
1004 * @param ag The average and variance arguments created during setup.
1005 */
compute_averages( astcenc_context& ctx, const avg_args &ag )1006 static void compute_averages(
1007 astcenc_context& ctx,
1008 const avg_args &ag
1009 ) {
1010 pixel_region_args arg = ag.arg;
1011 arg.work_memory = new vfloat4[ag.work_memory_size];
1012
1013 int size_x = ag.img_size_x;
1014 int size_y = ag.img_size_y;
1015 int size_z = ag.img_size_z;
1016
1017 int step_xy = ag.blk_size_xy;
1018 int step_z = ag.blk_size_z;
1019
1020 int y_tasks = (size_y + step_xy - 1) / step_xy;
1021
1022 // All threads run this processing loop until there is no work remaining
1023 while (true)
1024 {
1025 unsigned int count;
1026 unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
1027 if (!count)
1028 {
1029 break;
1030 }
1031
1032 for (unsigned int i = base; i < base + count; i++)
1033 {
1034 int z = (i / (y_tasks)) * step_z;
1035 int y = (i - (z * y_tasks)) * step_xy;
1036
1037 arg.size_z = astc::min(step_z, size_z - z);
1038 arg.offset_z = z;
1039
1040 arg.size_y = astc::min(step_xy, size_y - y);
1041 arg.offset_y = y;
1042
1043 for (int x = 0; x < size_x; x += step_xy)
1044 {
1045 arg.size_x = astc::min(step_xy, size_x - x);
1046 arg.offset_x = x;
1047 compute_pixel_region_variance(ctx.context, arg);
1048 }
1049 }
1050
1051 ctx.manage_avg.complete_task_assignment(count);
1052 }
1053
1054 delete[] arg.work_memory;
1055 }
1056
1057 #endif
1058
1059 /* See header for documentation. */
astcenc_compress_image( astcenc_context* ctxo, astcenc_image* imagep, const astcenc_swizzle* swizzle, uint8_t* data_out, size_t data_len, bool calQualityEnable, int32_t *mse[RGBA_COM], unsigned int thread_index )1060 astcenc_error astcenc_compress_image(
1061 astcenc_context* ctxo,
1062 astcenc_image* imagep,
1063 const astcenc_swizzle* swizzle,
1064 uint8_t* data_out,
1065 size_t data_len,
1066 #if QUALITY_CONTROL
1067 bool calQualityEnable,
1068 int32_t *mse[RGBA_COM],
1069 #endif
1070 unsigned int thread_index
1071 ) {
1072 #if defined(ASTCENC_DECOMPRESS_ONLY)
1073 (void)ctxo;
1074 (void)imagep;
1075 (void)swizzle;
1076 (void)data_out;
1077 (void)data_len;
1078 (void)thread_index;
1079 return ASTCENC_ERR_BAD_CONTEXT;
1080 #else
1081 astcenc_contexti* ctx = &ctxo->context;
1082 astcenc_error status;
1083 astcenc_image& image = *imagep;
1084
1085 if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1086 {
1087 return ASTCENC_ERR_BAD_CONTEXT;
1088 }
1089
1090 status = validate_compression_swizzle(*swizzle);
1091 if (status != ASTCENC_SUCCESS)
1092 {
1093 return status;
1094 }
1095
1096 if (thread_index >= ctx->thread_count)
1097 {
1098 return ASTCENC_ERR_BAD_PARAM;
1099 }
1100
1101 unsigned int block_x = ctx->config.block_x;
1102 unsigned int block_y = ctx->config.block_y;
1103 unsigned int block_z = ctx->config.block_z;
1104
1105 unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
1106 unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
1107 unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;
1108
1109 // Check we have enough output space (16 bytes per block)
1110 size_t size_needed = xblocks * yblocks * zblocks * 16;
1111 if (data_len < size_needed)
1112 {
1113 return ASTCENC_ERR_OUT_OF_MEM;
1114 }
1115
1116 // If context thread count is one then implicitly reset
1117 if (ctx->thread_count == 1)
1118 {
1119 astcenc_compress_reset(ctxo);
1120 }
1121
1122 if (ctx->config.a_scale_radius != 0)
1123 {
1124 // First thread to enter will do setup, other threads will subsequently
1125 // enter the critical section but simply skip over the initialization
1126 auto init_avg = [ctx, &image, swizzle]() {
1127 // Perform memory allocations for the destination buffers
1128 size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
1129 ctx->input_alpha_averages = new float[texel_count];
1130
1131 return init_compute_averages(
1132 image, ctx->config.a_scale_radius, *swizzle,
1133 ctx->avg_preprocess_args);
1134 };
1135
1136 // Only the first thread actually runs the initializer
1137 ctxo->manage_avg.init(init_avg);
1138
1139 // All threads will enter this function and dynamically grab work
1140 compute_averages(*ctxo, ctx->avg_preprocess_args);
1141 }
1142
1143 // Wait for compute_averages to complete before compressing
1144 ctxo->manage_avg.wait();
1145 #if QUALITY_CONTROL
1146 compress_image(*ctxo, thread_index, image, *swizzle, data_out, calQualityEnable, mse);
1147 #else
1148 compress_image(*ctxo, thread_index, image, *swizzle, data_out);
1149 #endif
1150 // Wait for compress to complete before freeing memory
1151 ctxo->manage_compress.wait();
1152
1153 auto term_compress = [ctx]() {
1154 delete[] ctx->input_alpha_averages;
1155 ctx->input_alpha_averages = nullptr;
1156 };
1157
1158 // Only the first thread to arrive actually runs the term
1159 ctxo->manage_compress.term(term_compress);
1160
1161 return ASTCENC_SUCCESS;
1162 #endif
1163 }
1164
1165 /* See header for documentation. */
astcenc_compress_reset( astcenc_context* ctxo )1166 astcenc_error astcenc_compress_reset(
1167 astcenc_context* ctxo
1168 ) {
1169 #if defined(ASTCENC_DECOMPRESS_ONLY)
1170 (void)ctxo;
1171 return ASTCENC_ERR_BAD_CONTEXT;
1172 #else
1173 astcenc_contexti* ctx = &ctxo->context;
1174 if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1175 {
1176 return ASTCENC_ERR_BAD_CONTEXT;
1177 }
1178
1179 ctxo->manage_avg.reset();
1180 ctxo->manage_compress.reset();
1181 return ASTCENC_SUCCESS;
1182 #endif
1183 }
1184
1185 /* See header for documentation. */
astcenc_decompress_image( astcenc_context* ctxo, const uint8_t* data, size_t data_len, astcenc_image* image_outp, const astcenc_swizzle* swizzle, unsigned int thread_index )1186 astcenc_error astcenc_decompress_image(
1187 astcenc_context* ctxo,
1188 const uint8_t* data,
1189 size_t data_len,
1190 astcenc_image* image_outp,
1191 const astcenc_swizzle* swizzle,
1192 unsigned int thread_index
1193 ) {
1194 astcenc_error status;
1195 astcenc_image& image_out = *image_outp;
1196 astcenc_contexti* ctx = &ctxo->context;
1197
1198 // Today this doesn't matter (working set on stack) but might in future ...
1199 if (thread_index >= ctx->thread_count)
1200 {
1201 return ASTCENC_ERR_BAD_PARAM;
1202 }
1203
1204 status = validate_decompression_swizzle(*swizzle);
1205 if (status != ASTCENC_SUCCESS)
1206 {
1207 return status;
1208 }
1209
1210 unsigned int block_x = ctx->config.block_x;
1211 unsigned int block_y = ctx->config.block_y;
1212 unsigned int block_z = ctx->config.block_z;
1213
1214 unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
1215 unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
1216 unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
1217 unsigned int block_count = zblocks * yblocks * xblocks;
1218
1219 int row_blocks = xblocks;
1220 int plane_blocks = xblocks * yblocks;
1221
1222 // Check we have enough output space (16 bytes per block)
1223 size_t size_needed = xblocks * yblocks * zblocks * 16;
1224 if (data_len < size_needed)
1225 {
1226 return ASTCENC_ERR_OUT_OF_MEM;
1227 }
1228
1229 image_block blk;
1230 blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
1231
1232 // Decode mode inferred from the output data type
1233 blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
1234
1235 // If context thread count is one then implicitly reset
1236 if (ctx->thread_count == 1)
1237 {
1238 astcenc_decompress_reset(ctxo);
1239 }
1240
1241 // Only the first thread actually runs the initializer
1242 ctxo->manage_decompress.init(block_count, nullptr);
1243
1244 // All threads run this processing loop until there is no work remaining
1245 while (true)
1246 {
1247 unsigned int count;
1248 unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
1249 if (!count)
1250 {
1251 break;
1252 }
1253
1254 for (unsigned int i = base; i < base + count; i++)
1255 {
1256 // Decode i into x, y, z block indices
1257 int z = i / plane_blocks;
1258 unsigned int rem = i - (z * plane_blocks);
1259 int y = rem / row_blocks;
1260 int x = rem - (y * row_blocks);
1261
1262 unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
1263 const uint8_t* bp = data + offset;
1264
1265 symbolic_compressed_block scb;
1266
1267 physical_to_symbolic(*ctx->bsd, bp, scb);
1268
1269 decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
1270 x * block_x, y * block_y, z * block_z,
1271 scb, blk);
1272
1273 store_image_block(image_out, blk, *ctx->bsd,
1274 x * block_x, y * block_y, z * block_z, *swizzle);
1275 }
1276
1277 ctxo->manage_decompress.complete_task_assignment(count);
1278 }
1279
1280 return ASTCENC_SUCCESS;
1281 }
1282
1283 /* See header for documentation. */
astcenc_decompress_reset( astcenc_context* ctxo )1284 astcenc_error astcenc_decompress_reset(
1285 astcenc_context* ctxo
1286 ) {
1287 ctxo->manage_decompress.reset();
1288 return ASTCENC_SUCCESS;
1289 }
1290
1291 /* See header for documentation. */
astcenc_get_block_info( astcenc_context* ctxo, const uint8_t data[16], astcenc_block_info* info )1292 astcenc_error astcenc_get_block_info(
1293 astcenc_context* ctxo,
1294 const uint8_t data[16],
1295 astcenc_block_info* info
1296 ) {
1297 #if defined(ASTCENC_DECOMPRESS_ONLY)
1298 (void)ctxo;
1299 (void)data;
1300 (void)info;
1301 return ASTCENC_ERR_BAD_CONTEXT;
1302 #else
1303 astcenc_contexti* ctx = &ctxo->context;
1304
1305 // Decode the compressed data into a symbolic form
1306 symbolic_compressed_block scb;
1307 physical_to_symbolic(*ctx->bsd, data, scb);
1308
1309 // Fetch the appropriate partition and decimation tables
1310 block_size_descriptor& bsd = *ctx->bsd;
1311
1312 // Start from a clean slate
1313 memset(info, 0, sizeof(*info));
1314
1315 // Basic info we can always populate
1316 info->profile = ctx->config.profile;
1317
1318 info->block_x = ctx->config.block_x;
1319 info->block_y = ctx->config.block_y;
1320 info->block_z = ctx->config.block_z;
1321 info->texel_count = bsd.texel_count;
1322
1323 // Check for error blocks first
1324 info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
1325 if (info->is_error_block)
1326 {
1327 return ASTCENC_SUCCESS;
1328 }
1329
1330 // Check for constant color blocks second
1331 info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
1332 scb.block_type == SYM_BTYPE_CONST_U16;
1333 if (info->is_constant_block)
1334 {
1335 return ASTCENC_SUCCESS;
1336 }
1337
1338 // Otherwise handle a full block ; known to be valid after conditions above have been checked
1339 int partition_count = scb.partition_count;
1340 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
1341
1342 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
1343 const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
1344
1345 info->weight_x = di.weight_x;
1346 info->weight_y = di.weight_y;
1347 info->weight_z = di.weight_z;
1348
1349 info->is_dual_plane_block = bm.is_dual_plane != 0;
1350
1351 info->partition_count = scb.partition_count;
1352 info->partition_index = scb.partition_index;
1353 info->dual_plane_component = scb.plane2_component;
1354
1355 info->color_level_count = get_quant_level(scb.get_color_quant_mode());
1356 info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
1357
1358 // Unpack color endpoints for each active partition
1359 for (unsigned int i = 0; i < scb.partition_count; i++)
1360 {
1361 bool rgb_hdr;
1362 bool a_hdr;
1363 vint4 endpnt[2];
1364
1365 unpack_color_endpoints(ctx->config.profile,
1366 scb.color_formats[i],
1367 scb.color_values[i],
1368 rgb_hdr, a_hdr,
1369 endpnt[0], endpnt[1]);
1370
1371 // Store the color endpoint mode info
1372 info->color_endpoint_modes[i] = scb.color_formats[i];
1373 info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
1374
1375 // Store the unpacked and decoded color endpoint
1376 vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
1377 for (int j = 0; j < 2; j++)
1378 {
1379 vint4 color_lns = lns_to_sf16(endpnt[j]);
1380 vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
1381 vint4 datai = select(color_unorm, color_lns, hdr_mask);
1382 store(float16_to_float(datai), info->color_endpoints[i][j]);
1383 }
1384 }
1385
1386 // Unpack weights for each texel
1387 int weight_plane1[BLOCK_MAX_TEXELS];
1388 int weight_plane2[BLOCK_MAX_TEXELS];
1389
1390 unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
1391 for (unsigned int i = 0; i < bsd.texel_count; i++)
1392 {
1393 info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1394 if (info->is_dual_plane_block)
1395 {
1396 info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1397 }
1398 }
1399
1400 // Unpack partition assignments for each texel
1401 for (unsigned int i = 0; i < bsd.texel_count; i++)
1402 {
1403 info->partition_assignment[i] = pi.partition_of_texel[i];
1404 }
1405
1406 return ASTCENC_SUCCESS;
1407 #endif
1408 }
1409
1410 /* See header for documentation. */
astcenc_get_error_string( astcenc_error status )1411 const char* astcenc_get_error_string(
1412 astcenc_error status
1413 ) {
1414 // Values in this enum are from an external user, so not guaranteed to be
1415 // bounded to the enum values
1416 switch (static_cast<int>(status))
1417 {
1418 case ASTCENC_SUCCESS:
1419 return "ASTCENC_SUCCESS";
1420 case ASTCENC_ERR_OUT_OF_MEM:
1421 return "ASTCENC_ERR_OUT_OF_MEM";
1422 case ASTCENC_ERR_BAD_CPU_FLOAT:
1423 return "ASTCENC_ERR_BAD_CPU_FLOAT";
1424 case ASTCENC_ERR_BAD_PARAM:
1425 return "ASTCENC_ERR_BAD_PARAM";
1426 case ASTCENC_ERR_BAD_BLOCK_SIZE:
1427 return "ASTCENC_ERR_BAD_BLOCK_SIZE";
1428 case ASTCENC_ERR_BAD_PROFILE:
1429 return "ASTCENC_ERR_BAD_PROFILE";
1430 case ASTCENC_ERR_BAD_QUALITY:
1431 return "ASTCENC_ERR_BAD_QUALITY";
1432 case ASTCENC_ERR_BAD_FLAGS:
1433 return "ASTCENC_ERR_BAD_FLAGS";
1434 case ASTCENC_ERR_BAD_SWIZZLE:
1435 return "ASTCENC_ERR_BAD_SWIZZLE";
1436 case ASTCENC_ERR_BAD_CONTEXT:
1437 return "ASTCENC_ERR_BAD_CONTEXT";
1438 case ASTCENC_ERR_NOT_IMPLEMENTED:
1439 return "ASTCENC_ERR_NOT_IMPLEMENTED";
1440 case ASTCENC_ERR_BAD_DECODE_MODE:
1441 return "ASTCENC_ERR_BAD_DECODE_MODE";
1442 #if defined(ASTCENC_DIAGNOSTICS)
1443 case ASTCENC_ERR_DTRACE_FAILURE:
1444 return "ASTCENC_ERR_DTRACE_FAILURE";
1445 #endif
1446 default:
1447 return nullptr;
1448 }
1449 }
1450