1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions to compress a symbolic block.
22 */
23
24 #include "astcenc_internal.h"
25 #include "astcenc_diagnostic_trace.h"
26
27 #include <cassert>
#if defined(ASTC_CUSTOMIZED_ENABLE)
// Single file-scope instance, only built when ASTC_CUSTOMIZED_ENABLE is defined.
// NOTE(review): presumably manages the optional customized shared-library backend - confirm
// against the declaration of AstcCustomizedSoManager.
AstcCustomizedSoManager g_astcCustomizedSoManager;
#endif
31
32 /**
33 * @brief Merge two planes of endpoints into a single vector.
34 *
35 * @param ep_plane1 The endpoints for plane 1.
36 * @param ep_plane2 The endpoints for plane 2.
37 * @param component_plane2 The color component for plane 2.
38 * @param[out] result The merged output.
39 */
merge_endpoints( const endpoints& ep_plane1, const endpoints& ep_plane2, unsigned int component_plane2, endpoints& result )40 static void merge_endpoints(
41 const endpoints& ep_plane1,
42 const endpoints& ep_plane2,
43 unsigned int component_plane2,
44 endpoints& result
45 ) {
46 unsigned int partition_count = ep_plane1.partition_count;
47 assert(partition_count == 1);
48
49 vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
50
51 result.partition_count = partition_count;
52 result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
53 result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
54 }
55
/**
 * @brief Attempt to improve weights given a chosen configuration.
 *
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
 * down by one quantization step.
 *
 * This is a specialized function which only supports operating on undecimated weight grids,
 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
 * is needed less often.
 *
 * @param         decode_mode   The decode mode (LDR, HDR).
 * @param         bsd           The block size information.
 * @param         blk           The image block color data to compress.
 * @param[in,out] scb           The symbolic compressed block; its weights are updated in place.
 *
 * @return @c true if any weight was changed, @c false otherwise.
 */
72 #if ASTCENC_NEON != 0
realign_weights_undecimated( astcenc_profile decode_mode, const block_size_descriptor& bsd, const image_block& blk, symbolic_compressed_block& scb )73 static bool realign_weights_undecimated(
74 astcenc_profile decode_mode,
75 const block_size_descriptor& bsd,
76 const image_block& blk,
77 symbolic_compressed_block& scb
78 ) {
79 // Get the partition descriptor
80 unsigned int partition_count = scb.partition_count;
81 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
82
83 // Get the quantization table
84 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
85 unsigned int weight_quant_level = bm.quant_mode;
86 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
87
88 unsigned int max_plane = bm.is_dual_plane;
89 int plane2_component = scb.plane2_component;
90 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
91
92 // Decode the color endpoints
93 bool rgb_hdr;
94 bool alpha_hdr;
95 vint4 endpnt0[BLOCK_MAX_PARTITIONS];
96 vint4 endpnt1[BLOCK_MAX_PARTITIONS];
97 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
98 vfloat4 offset[BLOCK_MAX_PARTITIONS];
99
100 promise(partition_count > 0);
101
102 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
103 {
104 unpack_color_endpoints(decode_mode,
105 scb.color_formats[pa_idx],
106 scb.color_values[pa_idx],
107 rgb_hdr, alpha_hdr,
108 endpnt0[pa_idx],
109 endpnt1[pa_idx]);
110 }
111
112 uint8_t* dec_weights_uquant = scb.weights;
113 bool adjustments = false;
114
115 // For each plane and partition ...
116 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
117 {
118 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
119 {
120 // Compute the endpoint delta for all components in current plane
121 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
122 epd = select(epd, vint4::zero(), plane_mask);
123
124 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
125 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
126 }
127
128 // For each weight compute previous, current, and next errors
129 promise(bsd.texel_count > 0);
130
131 unsigned int texel = 0;
132 for (; texel + ASTCENC_SIMD_WIDTH <= bsd.texel_count; texel += ASTCENC_SIMD_WIDTH)
133 {
134 int uqw0 = dec_weights_uquant[texel];
135 int uqw1 = dec_weights_uquant[texel + 1];
136 int uqw2 = dec_weights_uquant[texel + 2];
137 int uqw3 = dec_weights_uquant[texel + 3];
138
139 vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3);
140 vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1],
141 qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]);
142
143 vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF);
144 vint4 uqw_down_vec = prev_and_next_vec & mask;
145 vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask;
146
147 vfloat4 weight_base_vec = int_to_float(uqw_vec);
148 vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec;
149 vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec;
150
151 unsigned int partition0 = pi.partition_of_texel[texel];
152 unsigned int partition1 = pi.partition_of_texel[texel + 1];
153 unsigned int partition2 = pi.partition_of_texel[texel + 2];
154 unsigned int partition3 = pi.partition_of_texel[texel + 3];
155
156 vfloat4 color_offset0 = offset[partition0];
157 vfloat4 color_offset1 = offset[partition1];
158 vfloat4 color_offset2 = offset[partition2];
159 vfloat4 color_offset3 = offset[partition3];
160
161 vfloat4 color_base0 = endpnt0f[partition0];
162 vfloat4 color_base1 = endpnt0f[partition1];
163 vfloat4 color_base2 = endpnt0f[partition2];
164 vfloat4 color_base3 = endpnt0f[partition3];
165
166 vfloat4 color0 = color_base0 + color_offset0 * weight_base_vec.lane<0>();
167 vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>();
168 vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>();
169 vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>();
170
171 vfloat4 orig_color0 = blk.texel(texel);
172 vfloat4 orig_color1 = blk.texel(texel + 1);
173 vfloat4 orig_color2 = blk.texel(texel + 2);
174 vfloat4 orig_color3 = blk.texel(texel + 3);
175
176 vfloat4 error_weight = blk.channel_weight;
177
178 vfloat4 color_diff0 = color0 - orig_color0;
179 vfloat4 color_diff1 = color1 - orig_color1;
180 vfloat4 color_diff2 = color2 - orig_color2;
181 vfloat4 color_diff3 = color3 - orig_color3;
182
183 vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>();
184 vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>();
185 vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>();
186 vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>();
187
188 vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>();
189 vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>();
190 vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>();
191 vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>();
192
193 float error_base0 = dot_s(color_diff0 * color_diff0, error_weight);
194 float error_base1 = dot_s(color_diff1 * color_diff1, error_weight);
195 float error_base2 = dot_s(color_diff2 * color_diff2, error_weight);
196 float error_base3 = dot_s(color_diff3 * color_diff3, error_weight);
197
198 float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight);
199 float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight);
200 float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight);
201 float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight);
202
203 float error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight);
204 float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight);
205 float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight);
206 float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight);
207
208 vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3);
209 vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3);
210 vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3);
211
212 vmask4 check_result_up = (error_up_vec < error_base_vec) &
213 (error_up_vec < error_down_vec) & (uqw_vec < vint4(64));
214
215 vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero());
216 check_result_down = check_result_down & (~check_result_up);
217
218 if (popcount(check_result_up | check_result_down) != 0)
219 {
220 uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up);
221 uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down);
222
223 dec_weights_uquant[texel] = uqw_vec.lane<0>();
224 dec_weights_uquant[texel + 1] = uqw_vec.lane<1>();
225 dec_weights_uquant[texel + 2] = uqw_vec.lane<2>(); // channel 2
226 dec_weights_uquant[texel + 3] = uqw_vec.lane<3>(); // channel 3
227 adjustments = true;
228 }
229 };
230
231 for (; texel < bsd.texel_count; texel++)
232 {
233 int uqw = dec_weights_uquant[texel];
234
235 uint32_t prev_and_next = qat.prev_next_values[uqw];
236 int uqw_down = prev_and_next & 0xFF;
237 int uqw_up = (prev_and_next >> 8) & 0xFF;
238
239 // Interpolate the colors to create the diffs
240 float weight_base = static_cast<float>(uqw);
241 float weight_down = static_cast<float>(uqw_down - uqw);
242 float weight_up = static_cast<float>(uqw_up - uqw);
243
244 unsigned int partition = pi.partition_of_texel[texel];
245 vfloat4 color_offset = offset[partition];
246 vfloat4 color_base = endpnt0f[partition];
247
248 vfloat4 color = color_base + color_offset * weight_base;
249 vfloat4 orig_color = blk.texel(texel);
250 vfloat4 error_weight = blk.channel_weight;
251
252 vfloat4 color_diff = color - orig_color;
253 vfloat4 color_diff_down = color_diff + color_offset * weight_down;
254 vfloat4 color_diff_up = color_diff + color_offset * weight_up;
255
256 float error_base = dot_s(color_diff * color_diff, error_weight);
257 float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
258 float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
259
260 // Check if the prev or next error is better, and if so use it
261 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
262 {
263 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
264 adjustments = true;
265 }
266 else if ((error_down < error_base) && (uqw > 0))
267 {
268 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
269 adjustments = true;
270 }
271 }
272
273 // Prepare iteration for plane 2
274 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
275 plane_mask = ~plane_mask;
276 }
277 return adjustments;
278 }
279 #else
realign_weights_undecimated( astcenc_profile decode_mode, const block_size_descriptor& bsd, const image_block& blk, symbolic_compressed_block& scb )280 static bool realign_weights_undecimated(
281 astcenc_profile decode_mode,
282 const block_size_descriptor& bsd,
283 const image_block& blk,
284 symbolic_compressed_block& scb
285 ) {
286 // Get the partition descriptor
287 unsigned int partition_count = scb.partition_count;
288 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
289
290 // Get the quantization table
291 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
292 unsigned int weight_quant_level = bm.quant_mode;
293 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
294
295 unsigned int max_plane = bm.is_dual_plane;
296 int plane2_component = scb.plane2_component;
297 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
298
299 // Decode the color endpoints
300 bool rgb_hdr;
301 bool alpha_hdr;
302 vint4 endpnt0[BLOCK_MAX_PARTITIONS];
303 vint4 endpnt1[BLOCK_MAX_PARTITIONS];
304 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
305 vfloat4 offset[BLOCK_MAX_PARTITIONS];
306
307 promise(partition_count > 0);
308
309 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
310 {
311 unpack_color_endpoints(decode_mode,
312 scb.color_formats[pa_idx],
313 scb.color_values[pa_idx],
314 rgb_hdr, alpha_hdr,
315 endpnt0[pa_idx],
316 endpnt1[pa_idx]);
317 }
318
319 uint8_t* dec_weights_uquant = scb.weights;
320 bool adjustments = false;
321
322 // For each plane and partition ...
323 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
324 {
325 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
326 {
327 // Compute the endpoint delta for all components in current plane
328 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
329 epd = select(epd, vint4::zero(), plane_mask);
330
331 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
332 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
333 }
334
335 // For each weight compute previous, current, and next errors
336 promise(bsd.texel_count > 0);
337 for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
338 {
339 int uqw = dec_weights_uquant[texel];
340
341 uint32_t prev_and_next = qat.prev_next_values[uqw];
342 int uqw_down = prev_and_next & 0xFF;
343 int uqw_up = (prev_and_next >> 8) & 0xFF;
344
345 // Interpolate the colors to create the diffs
346 float weight_base = static_cast<float>(uqw);
347 float weight_down = static_cast<float>(uqw_down - uqw);
348 float weight_up = static_cast<float>(uqw_up - uqw);
349
350 unsigned int partition = pi.partition_of_texel[texel];
351 vfloat4 color_offset = offset[partition];
352 vfloat4 color_base = endpnt0f[partition];
353
354 vfloat4 color = color_base + color_offset * weight_base;
355 vfloat4 orig_color = blk.texel(texel);
356 vfloat4 error_weight = blk.channel_weight;
357
358 vfloat4 color_diff = color - orig_color;
359 vfloat4 color_diff_down = color_diff + color_offset * weight_down;
360 vfloat4 color_diff_up = color_diff + color_offset * weight_up;
361
362 float error_base = dot_s(color_diff * color_diff, error_weight);
363 float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
364 float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
365
366 // Check if the prev or next error is better, and if so use it
367 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
368 {
369 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
370 adjustments = true;
371 }
372 else if ((error_down < error_base) && (uqw > 0))
373 {
374 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
375 adjustments = true;
376 }
377 }
378
379 // Prepare iteration for plane 2
380 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
381 plane_mask = ~plane_mask;
382 }
383
384 return adjustments;
385 }
386 #endif
387
388 /**
389 * @brief Attempt to improve weights given a chosen configuration.
390 *
391 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
392 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
393 * down by one quantization step.
394 *
395 * @param decode_mode The decode mode (LDR, HDR).
396 * @param bsd The block size information.
397 * @param blk The image block color data to compress.
398 * @param[out] scb The symbolic compressed block output.
399 */
realign_weights_decimated( astcenc_profile decode_mode, const block_size_descriptor& bsd, const image_block& blk, symbolic_compressed_block& scb )400 static bool realign_weights_decimated(
401 astcenc_profile decode_mode,
402 const block_size_descriptor& bsd,
403 const image_block& blk,
404 symbolic_compressed_block& scb
405 ) {
406 // Get the partition descriptor
407 unsigned int partition_count = scb.partition_count;
408 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
409
410 // Get the quantization table
411 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
412 unsigned int weight_quant_level = bm.quant_mode;
413 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
414
415 // Get the decimation table
416 const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
417 unsigned int weight_count = di.weight_count;
418 assert(weight_count != bsd.texel_count);
419
420 unsigned int max_plane = bm.is_dual_plane;
421 int plane2_component = scb.plane2_component;
422 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
423
424 // Decode the color endpoints
425 bool rgb_hdr;
426 bool alpha_hdr;
427 vint4 endpnt0[BLOCK_MAX_PARTITIONS];
428 vint4 endpnt1[BLOCK_MAX_PARTITIONS];
429 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
430 vfloat4 offset[BLOCK_MAX_PARTITIONS];
431
432 promise(partition_count > 0);
433 promise(weight_count > 0);
434
435 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
436 {
437 unpack_color_endpoints(decode_mode,
438 scb.color_formats[pa_idx],
439 scb.color_values[pa_idx],
440 rgb_hdr, alpha_hdr,
441 endpnt0[pa_idx],
442 endpnt1[pa_idx]);
443 }
444
445 uint8_t* dec_weights_uquant = scb.weights;
446 bool adjustments = false;
447
448 // For each plane and partition ...
449 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
450 {
451 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
452 {
453 // Compute the endpoint delta for all components in current plane
454 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
455 epd = select(epd, vint4::zero(), plane_mask);
456
457 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
458 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
459 }
460
461 // Create an unquantized weight grid for this decimation level
462 ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
463 for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
464 {
465 vint unquant_value(dec_weights_uquant + we_idx);
466 vfloat unquant_valuef = int_to_float(unquant_value);
467 storea(unquant_valuef, uq_weightsf + we_idx);
468 }
469
470 // For each weight compute previous, current, and next errors
471 for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
472 {
473 int uqw = dec_weights_uquant[we_idx];
474 uint32_t prev_and_next = qat.prev_next_values[uqw];
475
476 float uqw_base = uq_weightsf[we_idx];
477 float uqw_down = static_cast<float>(prev_and_next & 0xFF);
478 float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
479
480 float uqw_diff_down = uqw_down - uqw_base;
481 float uqw_diff_up = uqw_up - uqw_base;
482
483 vfloat4 error_basev = vfloat4::zero();
484 vfloat4 error_downv = vfloat4::zero();
485 vfloat4 error_upv = vfloat4::zero();
486
487 // Interpolate the colors to create the diffs
488 unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
489 promise(texels_to_evaluate > 0);
490 for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
491 {
492 unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
493
494 float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
495
496 float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
497 + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
498 + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
499 + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
500
501 // Ideally this is integer rounded, but IQ gain it isn't worth the overhead
502 // float weight = astc::flt_rd(weight_base + 0.5f);
503 // float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
504 // float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
505 float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
506 float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
507
508 unsigned int partition = pi.partition_of_texel[texel];
509 vfloat4 color_offset = offset[partition];
510 vfloat4 color_base = endpnt0f[partition];
511
512 vfloat4 color = color_base + color_offset * weight_base;
513 vfloat4 orig_color = blk.texel(texel);
514
515 vfloat4 color_diff = color - orig_color;
516 vfloat4 color_down_diff = color_diff + color_offset * weight_down;
517 vfloat4 color_up_diff = color_diff + color_offset * weight_up;
518
519 error_basev += color_diff * color_diff;
520 error_downv += color_down_diff * color_down_diff;
521 error_upv += color_up_diff * color_up_diff;
522 }
523
524 vfloat4 error_weight = blk.channel_weight;
525 float error_base = hadd_s(error_basev * error_weight);
526 float error_down = hadd_s(error_downv * error_weight);
527 float error_up = hadd_s(error_upv * error_weight);
528
529 // Check if the prev or next error is better, and if so use it
530 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
531 {
532 uq_weightsf[we_idx] = uqw_up;
533 dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
534 adjustments = true;
535 }
536 else if ((error_down < error_base) && (uqw > 0))
537 {
538 uq_weightsf[we_idx] = uqw_down;
539 dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
540 adjustments = true;
541 }
542 }
543
544 // Prepare iteration for plane 2
545 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
546 plane_mask = ~plane_mask;
547 }
548
549 return adjustments;
550 }
551
/**
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
 *
 * @param      privateProfile            The quality profile; it is forwarded to the endpoint
 *                                       selection and packing helpers.
 * @param      config                    The compressor configuration.
 * @param      bsd                       The block size information.
 * @param      blk                       The image block color data to compress.
 * @param      only_always               True if we only use "always" percentile block modes.
 * @param      tune_errorval_threshold   The error value threshold.
 * @param      partition_count           The partition count.
 * @param      partition_index           The partition index if @c partition_count is 2-4.
 * @param[out] scb                       The symbolic compressed block output.
 * @param[out] tmpbuf                    The quantized weights for plane 1.
 * @param      quant_limit               The highest weight quantization level to consider; it is
 *                                       clamped to @c QUANT_32, and block modes that need a higher
 *                                       level are skipped.
 */
compress_symbolic_block_for_partition_1plane( QualityProfile privateProfile, const astcenc_config& config, const block_size_descriptor& bsd, const image_block& blk, bool only_always, float tune_errorval_threshold, unsigned int partition_count, unsigned int partition_index, symbolic_compressed_block& scb, compression_working_buffers& tmpbuf, int quant_limit )565 static float compress_symbolic_block_for_partition_1plane(
566 QualityProfile privateProfile,
567 const astcenc_config& config,
568 const block_size_descriptor& bsd,
569 const image_block& blk,
570 bool only_always,
571 float tune_errorval_threshold,
572 unsigned int partition_count,
573 unsigned int partition_index,
574 symbolic_compressed_block& scb,
575 compression_working_buffers& tmpbuf,
576 int quant_limit
577 ) {
578 promise(partition_count > 0);
579 promise(config.tune_candidate_limit > 0);
580 promise(config.tune_refinement_limit > 0);
581
582 int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
583
584 auto compute_difference = &compute_symbolic_block_difference_1plane;
585 if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
586 {
587 compute_difference = &compute_symbolic_block_difference_1plane_1partition;
588 }
589
590 const auto& pi = bsd.get_partition_info(partition_count, partition_index);
591
592 // Compute ideal weights and endpoint colors, with no quantization or decimation
593 endpoints_and_weights& ei = tmpbuf.ei1;
594 compute_ideal_colors_and_weights_1plane(blk, pi, ei);
595
596 // Compute ideal weights and endpoint colors for every decimation
597 float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
598 uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
599
600 // For each decimation mode, compute an ideal set of weights with no quantization
601 unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
602 : bsd.decimation_mode_count_selected;
603 promise(max_decimation_modes > 0);
604 for (unsigned int i = 0; i < max_decimation_modes; i++)
605 {
606 const auto& dm = bsd.get_decimation_mode(i);
607 if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
608 {
609 continue;
610 }
611
612 const auto& di = bsd.get_decimation_info(i);
613
614 compute_ideal_weights_for_decimation(
615 ei,
616 di,
617 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
618 }
619
620 // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
621 // weight pair, compute the smallest weight that will result in a color value greater than 1
622 vfloat4 min_ep(10.0f);
623 for (unsigned int i = 0; i < partition_count; i++)
624 {
625 vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
626
627 vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
628 min_ep = select(min_ep, ep, use_ep);
629 }
630
631 float min_wt_cutoff = hmin_s(min_ep);
632
633 // For each mode, use the angular method to compute a shift
634 compute_angular_endpoints_1plane(
635 privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
636
637 float* weight_low_value = tmpbuf.weight_low_value1;
638 float* weight_high_value = tmpbuf.weight_high_value1;
639 int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
640 float* qwt_errors = tmpbuf.qwt_errors;
641
642 // For each mode (which specifies a decimation and a quantization):
643 // * Compute number of bits needed for the quantized weights
644 // * Generate an optimized set of quantized weights
645 // * Compute quantization errors for the mode
646
647
648 static const int8_t free_bits_for_partition_count[4] {
649 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
650 };
651
652 unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
653 : bsd.block_mode_count_1plane_selected;
654 promise(max_block_modes > 0);
655 for (unsigned int i = 0; i < max_block_modes; i++)
656 {
657 const block_mode& bm = bsd.block_modes[i];
658
659 if (bm.quant_mode > max_weight_quant)
660 {
661 qwt_errors[i] = 1e38f;
662 continue;
663 }
664
665 assert(!bm.is_dual_plane);
666 int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
667 if (bitcount <= 0)
668 {
669 qwt_errors[i] = 1e38f;
670 continue;
671 }
672
673 if (weight_high_value[i] > 1.02f * min_wt_cutoff)
674 {
675 weight_high_value[i] = 1.0f;
676 }
677
678 int decimation_mode = bm.decimation_mode;
679 const auto& di = bsd.get_decimation_info(decimation_mode);
680
681 qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
682
683 ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
684
685 // Generate the optimized set of weights for the weight mode
686 compute_quantized_weights_for_decimation(
687 di,
688 weight_low_value[i], weight_high_value[i],
689 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
690 dec_weights_uquantf,
691 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
692 bm.get_weight_quant_mode());
693
694 // Compute weight quantization errors for the block mode
695 qwt_errors[i] = compute_error_of_weight_set_1plane(
696 ei,
697 di,
698 dec_weights_uquantf);
699 }
700
701 // Decide the optimal combination of color endpoint encodings and weight encodings
702 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
703 int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
704
705 quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
706 quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
707
708 unsigned int candidate_count = compute_ideal_endpoint_formats(
709 privateProfile,
710 pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
711 config.tune_candidate_limit, 0, max_block_modes,
712 partition_format_specifiers, block_mode_index,
713 color_quant_level, color_quant_level_mod, tmpbuf);
714
715 // Iterate over the N believed-to-be-best modes to find out which one is actually best
716 float best_errorval_in_mode = ERROR_CALC_DEFAULT;
717 float best_errorval_in_scb = scb.errorval;
718
719 for (unsigned int i = 0; i < candidate_count; i++)
720 {
721 TRACE_NODE(node0, "candidate");
722
723 const int bm_packed_index = block_mode_index[i];
724 assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
725 const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
726
727 int decimation_mode = qw_bm.decimation_mode;
728 const auto& di = bsd.get_decimation_info(decimation_mode);
729 promise(di.weight_count > 0);
730
731 trace_add_data("weight_x", di.weight_x);
732 trace_add_data("weight_y", di.weight_y);
733 trace_add_data("weight_z", di.weight_z);
734 trace_add_data("weight_quant", qw_bm.quant_mode);
735
736 // Recompute the ideal color endpoints before storing them
737 vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
738 vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
739
740 symbolic_compressed_block workscb;
741 endpoints workep = ei.ep;
742
743 uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
744
745 for (unsigned int j = 0; j < di.weight_count; j++)
746 {
747 workscb.weights[j] = u8_weight_src[j];
748 }
749
750 for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
751 {
752 recompute_ideal_colors_1plane(
753 blk, pi, di, workscb.weights,
754 workep, rgbs_colors, rgbo_colors);
755
756 // Quantize the chosen color, tracking if worth trying the mod value
757 bool all_same = color_quant_level[i] != color_quant_level_mod[i];
758 for (unsigned int j = 0; j < partition_count; j++)
759 {
760 workscb.color_formats[j] = pack_color_endpoints(
761 privateProfile,
762 workep.endpt0[j],
763 workep.endpt1[j],
764 rgbs_colors[j],
765 rgbo_colors[j],
766 partition_format_specifiers[i][j],
767 workscb.color_values[j],
768 color_quant_level[i]);
769
770 all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
771 }
772
773 // If all the color endpoint modes are the same, we get a few more bits to store colors;
774 // let's see if we can take advantage of this: requantize all the colors and see if the
775 // endpoint modes remain the same.
776 workscb.color_formats_matched = 0;
777 if (partition_count >= 2 && all_same)
778 {
779 uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
780 uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
781 bool all_same_mod = true;
782 for (unsigned int j = 0; j < partition_count; j++)
783 {
784 color_formats_mod[j] = pack_color_endpoints(
785 privateProfile,
786 workep.endpt0[j],
787 workep.endpt1[j],
788 rgbs_colors[j],
789 rgbo_colors[j],
790 partition_format_specifiers[i][j],
791 colorvals[j],
792 color_quant_level_mod[i]);
793
794 // Early out as soon as it's no longer possible to use mod
795 if (color_formats_mod[j] != color_formats_mod[0])
796 {
797 all_same_mod = false;
798 break;
799 }
800 }
801
802 if (all_same_mod)
803 {
804 workscb.color_formats_matched = 1;
805 for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
806 {
807 for (unsigned int k = 0; k < 8; k++)
808 {
809 workscb.color_values[j][k] = colorvals[j][k];
810 }
811
812 workscb.color_formats[j] = color_formats_mod[j];
813 }
814 }
815 }
816
817 // Store header fields
818 workscb.partition_count = static_cast<uint8_t>(partition_count);
819 workscb.partition_index = static_cast<uint16_t>(partition_index);
820 workscb.plane2_component = -1;
821 workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
822 workscb.block_mode = qw_bm.mode_index;
823 workscb.block_type = SYM_BTYPE_NONCONST;
824 if (privateProfile == HIGH_SPEED_PROFILE)
825 {
826 workscb.errorval = 0;
827 scb = workscb;
828 break;
829 }
830 // Pre-realign test
831 if (l == 0)
832 {
833 float errorval = compute_difference(config, bsd, workscb, blk);
834 if (errorval == -ERROR_CALC_DEFAULT)
835 {
836 errorval = -errorval;
837 workscb.block_type = SYM_BTYPE_ERROR;
838 }
839
840 trace_add_data("error_prerealign", errorval);
841 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
842
843 // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
844 // iteration can help more so we give it a extra 8% leeway. Use this knowledge to
845 // drive a heuristic to skip blocks that are unlikely to catch up with the best
846 // block we have already.
847 unsigned int iters_remaining = config.tune_refinement_limit - l;
848 float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
849 if (errorval > (threshold * best_errorval_in_scb))
850 {
851 break;
852 }
853
854 if (errorval < best_errorval_in_scb)
855 {
856 best_errorval_in_scb = errorval;
857 workscb.errorval = errorval;
858 scb = workscb;
859
860 if (errorval < tune_errorval_threshold)
861 {
862 // Skip remaining candidates - this is "good enough"
863 i = candidate_count;
864 break;
865 }
866 }
867 }
868
869 bool adjustments;
870 if (di.weight_count != bsd.texel_count)
871 {
872 adjustments = realign_weights_decimated(
873 config.profile, bsd, blk, workscb);
874 }
875 else
876 {
877 adjustments = realign_weights_undecimated(
878 config.profile, bsd, blk, workscb);
879 }
880
881 // Post-realign test
882 float errorval = compute_difference(config, bsd, workscb, blk);
883 if (errorval == -ERROR_CALC_DEFAULT)
884 {
885 errorval = -errorval;
886 workscb.block_type = SYM_BTYPE_ERROR;
887 }
888
889 trace_add_data("error_postrealign", errorval);
890 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
891
892 // Average refinement improvement is 3.5% per iteration, so skip blocks that are
893 // unlikely to catch up with the best block we have already. Assume a 4.5% per step to
894 // give benefit of the doubt ...
895 unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
896 float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
897 if (errorval > (threshold * best_errorval_in_scb))
898 {
899 break;
900 }
901
902 if (errorval < best_errorval_in_scb)
903 {
904 best_errorval_in_scb = errorval;
905 workscb.errorval = errorval;
906 scb = workscb;
907
908 if (errorval < tune_errorval_threshold)
909 {
910 // Skip remaining candidates - this is "good enough"
911 i = candidate_count;
912 break;
913 }
914 }
915
916 if (!adjustments)
917 {
918 break;
919 }
920 }
921 }
922
923 return best_errorval_in_mode;
924 }
925
926 /**
927 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
928 *
929 * @param config The compressor configuration.
930 * @param bsd The block size information.
931 * @param blk The image block color data to compress.
932 * @param tune_errorval_threshold The error value threshold.
933 * @param plane2_component The component index for the second plane of weights.
934 * @param[out] scb The symbolic compressed block output.
935 * @param[out] tmpbuf The quantized weights for plane 1.
936 */
compress_symbolic_block_for_partition_2planes( QualityProfile privateProfile, const astcenc_config& config, const block_size_descriptor& bsd, const image_block& blk, float tune_errorval_threshold, unsigned int plane2_component, symbolic_compressed_block& scb, compression_working_buffers& tmpbuf, int quant_limit )937 static float compress_symbolic_block_for_partition_2planes(
938 QualityProfile privateProfile,
939 const astcenc_config& config,
940 const block_size_descriptor& bsd,
941 const image_block& blk,
942 float tune_errorval_threshold,
943 unsigned int plane2_component,
944 symbolic_compressed_block& scb,
945 compression_working_buffers& tmpbuf,
946 int quant_limit
947 ) {
948 promise(config.tune_candidate_limit > 0);
949 promise(config.tune_refinement_limit > 0);
950 promise(bsd.decimation_mode_count_selected > 0);
951
952 int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
953
954 // Compute ideal weights and endpoint colors, with no quantization or decimation
955 endpoints_and_weights& ei1 = tmpbuf.ei1;
956 endpoints_and_weights& ei2 = tmpbuf.ei2;
957
958 compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
959
960 // Compute ideal weights and endpoint colors for every decimation
961 float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
962 uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
963
964 // For each decimation mode, compute an ideal set of weights with no quantization
965 for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
966 {
967 const auto& dm = bsd.get_decimation_mode(i);
968 if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
969 {
970 continue;
971 }
972
973 const auto& di = bsd.get_decimation_info(i);
974
975 compute_ideal_weights_for_decimation(
976 ei1,
977 di,
978 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
979
980 compute_ideal_weights_for_decimation(
981 ei2,
982 di,
983 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
984 }
985
986 // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
987 // weight pair, compute the smallest weight that will result in a color value greater than 1
988 vfloat4 min_ep1(10.0f);
989 vfloat4 min_ep2(10.0f);
990
991 vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
992 vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
993 min_ep1 = select(min_ep1, ep1, use_ep1);
994
995 vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
996 vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
997 min_ep2 = select(min_ep2, ep2, use_ep2);
998
999 vfloat4 err_max(ERROR_CALC_DEFAULT);
1000 vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
1001
1002 // Set the plane2 component to max error in ep1
1003 min_ep1 = select(min_ep1, err_max, err_mask);
1004
1005 float min_wt_cutoff1 = hmin_s(min_ep1);
1006
1007 // Set the minwt2 to the plane2 component min in ep2
1008 float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
1009
1010 compute_angular_endpoints_2planes(
1011 privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
1012
1013 // For each mode (which specifies a decimation and a quantization):
1014 // * Compute number of bits needed for the quantized weights
1015 // * Generate an optimized set of quantized weights
1016 // * Compute quantization errors for the mode
1017
1018 float* weight_low_value1 = tmpbuf.weight_low_value1;
1019 float* weight_high_value1 = tmpbuf.weight_high_value1;
1020 float* weight_low_value2 = tmpbuf.weight_low_value2;
1021 float* weight_high_value2 = tmpbuf.weight_high_value2;
1022
1023 int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
1024 float* qwt_errors = tmpbuf.qwt_errors;
1025
1026 unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
1027 unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
1028
1029 for (unsigned int i = start_2plane; i < end_2plane; i++)
1030 {
1031 const block_mode& bm = bsd.block_modes[i];
1032 assert(bm.is_dual_plane);
1033
1034 if (bm.quant_mode > max_weight_quant)
1035 {
1036 qwt_errors[i] = 1e38f;
1037 continue;
1038 }
1039
1040 qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
1041
1042 if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
1043 {
1044 weight_high_value1[i] = 1.0f;
1045 }
1046
1047 if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
1048 {
1049 weight_high_value2[i] = 1.0f;
1050 }
1051
1052 unsigned int decimation_mode = bm.decimation_mode;
1053 const auto& di = bsd.get_decimation_info(decimation_mode);
1054
1055 ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
1056
1057 // Generate the optimized set of weights for the mode
1058 compute_quantized_weights_for_decimation(
1059 di,
1060 weight_low_value1[i],
1061 weight_high_value1[i],
1062 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
1063 dec_weights_uquantf,
1064 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
1065 bm.get_weight_quant_mode());
1066
1067 compute_quantized_weights_for_decimation(
1068 di,
1069 weight_low_value2[i],
1070 weight_high_value2[i],
1071 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
1072 dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
1073 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
1074 bm.get_weight_quant_mode());
1075
1076 // Compute weight quantization errors for the block mode
1077 qwt_errors[i] = compute_error_of_weight_set_2planes(
1078 ei1,
1079 ei2,
1080 di,
1081 dec_weights_uquantf,
1082 dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
1083 }
1084
1085 // Decide the optimal combination of color endpoint encodings and weight encodings
1086 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
1087 int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
1088
1089 quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
1090 quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
1091
1092 endpoints epm;
1093 merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
1094
1095 const auto& pi = bsd.get_partition_info(1, 0);
1096 unsigned int candidate_count = compute_ideal_endpoint_formats(
1097 config.privateProfile,
1098 pi, blk, epm, qwt_bitcounts, qwt_errors,
1099 config.tune_candidate_limit,
1100 bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
1101 partition_format_specifiers, block_mode_index,
1102 color_quant_level, color_quant_level_mod, tmpbuf);
1103
1104 // Iterate over the N believed-to-be-best modes to find out which one is actually best
1105 float best_errorval_in_mode = ERROR_CALC_DEFAULT;
1106 float best_errorval_in_scb = scb.errorval;
1107
1108 for (unsigned int i = 0; i < candidate_count; i++)
1109 {
1110 TRACE_NODE(node0, "candidate");
1111
1112 const int bm_packed_index = block_mode_index[i];
1113 assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
1114 bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
1115 const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
1116
1117 int decimation_mode = qw_bm.decimation_mode;
1118 const auto& di = bsd.get_decimation_info(decimation_mode);
1119 promise(di.weight_count > 0);
1120
1121 trace_add_data("weight_x", di.weight_x);
1122 trace_add_data("weight_y", di.weight_y);
1123 trace_add_data("weight_z", di.weight_z);
1124 trace_add_data("weight_quant", qw_bm.quant_mode);
1125
1126 vfloat4 rgbs_color;
1127 vfloat4 rgbo_color;
1128
1129 symbolic_compressed_block workscb;
1130 endpoints workep = epm;
1131
1132 uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
1133 uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
1134
1135 for (int j = 0; j < di.weight_count; j++)
1136 {
1137 workscb.weights[j] = u8_weight1_src[j];
1138 workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
1139 }
1140
1141 for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
1142 {
1143 recompute_ideal_colors_2planes(
1144 blk, bsd, di,
1145 workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
1146 workep, rgbs_color, rgbo_color, plane2_component);
1147
1148 // Quantize the chosen color
1149 workscb.color_formats[0] = pack_color_endpoints(
1150 privateProfile,
1151 workep.endpt0[0],
1152 workep.endpt1[0],
1153 rgbs_color, rgbo_color,
1154 partition_format_specifiers[i][0],
1155 workscb.color_values[0],
1156 color_quant_level[i]);
1157
1158 // Store header fields
1159 workscb.partition_count = 1;
1160 workscb.partition_index = 0;
1161 workscb.quant_mode = color_quant_level[i];
1162 workscb.color_formats_matched = 0;
1163 workscb.block_mode = qw_bm.mode_index;
1164 workscb.plane2_component = static_cast<int8_t>(plane2_component);
1165 workscb.block_type = SYM_BTYPE_NONCONST;
1166
1167 // Pre-realign test
1168 if (l == 0)
1169 {
1170 float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1171 if (errorval == -ERROR_CALC_DEFAULT)
1172 {
1173 errorval = -errorval;
1174 workscb.block_type = SYM_BTYPE_ERROR;
1175 }
1176
1177 trace_add_data("error_prerealign", errorval);
1178 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1179
1180 // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
1181 // iteration can help more so we give it a extra 8% leeway. Use this knowledge to
1182 // drive a heuristic to skip blocks that are unlikely to catch up with the best
1183 // block we have already.
1184 unsigned int iters_remaining = config.tune_refinement_limit - l;
1185 float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
1186 if (errorval > (threshold * best_errorval_in_scb))
1187 {
1188 break;
1189 }
1190
1191 if (errorval < best_errorval_in_scb)
1192 {
1193 best_errorval_in_scb = errorval;
1194 workscb.errorval = errorval;
1195 scb = workscb;
1196
1197 if (errorval < tune_errorval_threshold)
1198 {
1199 // Skip remaining candidates - this is "good enough"
1200 i = candidate_count;
1201 break;
1202 }
1203 }
1204 }
1205
1206 // Perform a final pass over the weights to try to improve them.
1207 bool adjustments;
1208 if (di.weight_count != bsd.texel_count)
1209 {
1210 adjustments = realign_weights_decimated(
1211 config.profile, bsd, blk, workscb);
1212 }
1213 else
1214 {
1215 adjustments = realign_weights_undecimated(
1216 config.profile, bsd, blk, workscb);
1217 }
1218
1219 // Post-realign test
1220 float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1221 if (errorval == -ERROR_CALC_DEFAULT)
1222 {
1223 errorval = -errorval;
1224 workscb.block_type = SYM_BTYPE_ERROR;
1225 }
1226
1227 trace_add_data("error_postrealign", errorval);
1228 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1229
1230 // Average refinement improvement is 3.5% per iteration, so skip blocks that are
1231 // unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1232 // give benefit of the doubt ...
1233 unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1234 float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1235 if (errorval > (threshold * best_errorval_in_scb))
1236 {
1237 break;
1238 }
1239
1240 if (errorval < best_errorval_in_scb)
1241 {
1242 best_errorval_in_scb = errorval;
1243 workscb.errorval = errorval;
1244 scb = workscb;
1245
1246 if (errorval < tune_errorval_threshold)
1247 {
1248 // Skip remaining candidates - this is "good enough"
1249 i = candidate_count;
1250 break;
1251 }
1252 }
1253
1254 if (!adjustments)
1255 {
1256 break;
1257 }
1258 }
1259 }
1260
1261 return best_errorval_in_mode;
1262 }
1263
1264 /**
1265 * @brief Determine the lowest cross-channel correlation factor.
1266 *
1267 * @param texels_per_block The number of texels in a block.
1268 * @param blk The image block color data to compress.
1269 *
1270 * @return Return the lowest correlation factor.
1271 */
prepare_block_statistics( int texels_per_block, const image_block& blk )1272 static float prepare_block_statistics(
1273 int texels_per_block,
1274 const image_block& blk
1275 ) {
1276 // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1277 // of the matrix. The matrix is symmetric, so this is all we need for this use case.
1278 float rs = 0.0f;
1279 float gs = 0.0f;
1280 float bs = 0.0f;
1281 float as = 0.0f;
1282 float rr_var = 0.0f;
1283 float gg_var = 0.0f;
1284 float bb_var = 0.0f;
1285 float aa_var = 0.0f;
1286 float rg_cov = 0.0f;
1287 float rb_cov = 0.0f;
1288 float ra_cov = 0.0f;
1289 float gb_cov = 0.0f;
1290 float ga_cov = 0.0f;
1291 float ba_cov = 0.0f;
1292
1293 float weight_sum = 0.0f;
1294
1295 promise(texels_per_block > 0);
1296 for (int i = 0; i < texels_per_block; i++)
1297 {
1298 float weight = hadd_s(blk.channel_weight) / 4.0f;
1299 assert(weight >= 0.0f);
1300 weight_sum += weight;
1301
1302 float r = blk.data_r[i];
1303 float g = blk.data_g[i];
1304 float b = blk.data_b[i];
1305 float a = blk.data_a[i];
1306
1307 float rw = r * weight;
1308 rs += rw;
1309 rr_var += r * rw;
1310 rg_cov += g * rw;
1311 rb_cov += b * rw;
1312 ra_cov += a * rw;
1313
1314 float gw = g * weight;
1315 gs += gw;
1316 gg_var += g * gw;
1317 gb_cov += b * gw;
1318 ga_cov += a * gw;
1319
1320 float bw = b * weight;
1321 bs += bw;
1322 bb_var += b * bw;
1323 ba_cov += a * bw;
1324
1325 float aw = a * weight;
1326 as += aw;
1327 aa_var += a * aw;
1328 }
1329
1330 float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1331
1332 rr_var -= rs * (rs * rpt);
1333 rg_cov -= gs * (rs * rpt);
1334 rb_cov -= bs * (rs * rpt);
1335 ra_cov -= as * (rs * rpt);
1336
1337 gg_var -= gs * (gs * rpt);
1338 gb_cov -= bs * (gs * rpt);
1339 ga_cov -= as * (gs * rpt);
1340
1341 bb_var -= bs * (bs * rpt);
1342 ba_cov -= as * (bs * rpt);
1343
1344 aa_var -= as * (as * rpt);
1345
1346 // These will give a NaN if a channel is constant - these are fixed up in the next step
1347 rg_cov *= astc::rsqrt(rr_var * gg_var);
1348 rb_cov *= astc::rsqrt(rr_var * bb_var);
1349 ra_cov *= astc::rsqrt(rr_var * aa_var);
1350 gb_cov *= astc::rsqrt(gg_var * bb_var);
1351 ga_cov *= astc::rsqrt(gg_var * aa_var);
1352 ba_cov *= astc::rsqrt(bb_var * aa_var);
1353
1354 if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1355 if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1356 if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1357 if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1358 if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1359 if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1360
1361 float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
1362 lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
1363 lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
1364 lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
1365 lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));
1366
1367 // Diagnostic trace points
1368 trace_add_data("min_r", blk.data_min.lane<0>());
1369 trace_add_data("max_r", blk.data_max.lane<0>());
1370 trace_add_data("min_g", blk.data_min.lane<1>());
1371 trace_add_data("max_g", blk.data_max.lane<1>());
1372 trace_add_data("min_b", blk.data_min.lane<2>());
1373 trace_add_data("max_b", blk.data_max.lane<2>());
1374 trace_add_data("min_a", blk.data_min.lane<3>());
1375 trace_add_data("max_a", blk.data_max.lane<3>());
1376 trace_add_data("cov_rg", fabsf(rg_cov));
1377 trace_add_data("cov_rb", fabsf(rb_cov));
1378 trace_add_data("cov_ra", fabsf(ra_cov));
1379 trace_add_data("cov_gb", fabsf(gb_cov));
1380 trace_add_data("cov_ga", fabsf(ga_cov));
1381 trace_add_data("cov_ba", fabsf(ba_cov));
1382
1383 return lowest_correlation;
1384 }
1385
1386 /* See header for documentation. */
compress_block( const astcenc_contexti& ctx, const image_block& blk, uint8_t pcb[16], compression_working_buffers& tmpbuf, bool calQualityEnable, int32_t *mseBlock[RGBA_COM] )1387 void compress_block(
1388 const astcenc_contexti& ctx,
1389 const image_block& blk,
1390 uint8_t pcb[16],
1391 #if QUALITY_CONTROL
1392 compression_working_buffers& tmpbuf,
1393 bool calQualityEnable,
1394 int32_t *mseBlock[RGBA_COM]
1395 #else
1396 compression_working_buffers& tmpbuf
1397 #endif
1398 )
1399 {
1400 astcenc_profile decode_mode = ctx.config.profile;
1401 symbolic_compressed_block scb;
1402 const block_size_descriptor& bsd = *ctx.bsd;
1403 float lowest_correl;
1404
1405 TRACE_NODE(node0, "block");
1406 trace_add_data("pos_x", blk.xpos);
1407 trace_add_data("pos_y", blk.ypos);
1408 trace_add_data("pos_z", blk.zpos);
1409
1410 // Set stricter block targets for luminance data as we have more bits to play with
1411 bool block_is_l = blk.is_luminance();
1412 float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1413
1414 // Set slightly stricter block targets for lumalpha data as we have more bits to play with
1415 bool block_is_la = blk.is_luminancealpha();
1416 float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1417
1418 bool block_skip_two_plane = false;
1419 int max_partitions;
1420 if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1421 {
1422 max_partitions = 1;
1423 }
1424 #ifdef ASTC_CUSTOMIZED_ENABLE
1425 else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1426 {
1427 if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1428 g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr)
1429 {
1430 printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n");
1431 return;
1432 }
1433 max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_();
1434 }
1435 #endif
1436 else
1437 {
1438 max_partitions = ctx.config.tune_partition_count_limit;
1439 }
1440
1441 unsigned int requested_partition_indices[3] {
1442 ctx.config.tune_2partition_index_limit,
1443 ctx.config.tune_3partition_index_limit,
1444 ctx.config.tune_4partition_index_limit
1445 };
1446
1447 unsigned int requested_partition_trials[3] {
1448 ctx.config.tune_2partitioning_candidate_limit,
1449 ctx.config.tune_3partitioning_candidate_limit,
1450 ctx.config.tune_4partitioning_candidate_limit
1451 };
1452
1453 #if defined(ASTCENC_DIAGNOSTICS)
1454 // Do this early in diagnostic builds so we can dump uniform metrics
1455 // for every block. Do it later in release builds to avoid redundant work!
1456 float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1457 float error_threshold = ctx.config.tune_db_limit
1458 * error_weight_sum
1459 * block_is_l_scale
1460 * block_is_la_scale;
1461
1462 lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1463 trace_add_data("lowest_correl", lowest_correl);
1464 trace_add_data("tune_error_threshold", error_threshold);
1465 #endif
1466
1467 // Detected a constant-color block
1468 if (all(blk.data_min == blk.data_max))
1469 {
1470 TRACE_NODE(node1, "pass");
1471 trace_add_data("partition_count", 0);
1472 trace_add_data("plane_count", 1);
1473
1474 scb.partition_count = 0;
1475
1476 // Encode as FP16 if using HDR
1477 if ((decode_mode == ASTCENC_PRF_HDR) ||
1478 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1479 {
1480 scb.block_type = SYM_BTYPE_CONST_F16;
1481 vint4 color_f16 = float_to_float16(blk.origin_texel);
1482 store(color_f16, scb.constant_color);
1483 }
1484 // Encode as UNORM16 if NOT using HDR
1485 else
1486 {
1487 scb.block_type = SYM_BTYPE_CONST_U16;
1488 vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1489 vint4 color_u16 = float_to_int_rtn(color_f32);
1490 store(color_u16, scb.constant_color);
1491 }
1492
1493 trace_add_data("exit", "quality hit");
1494 if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1495 {
1496 scb.block_type = SYM_BTYPE_NONCONST;
1497 scb.partition_count = 1;
1498 scb.color_formats_matched = 0;
1499 scb.plane2_component = -1;
1500 if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1501 {
1502 scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE;
1503 }
1504 #ifdef ASTC_CUSTOMIZED_ENABLE
1505 else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1506 {
1507 if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1508 g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr)
1509 {
1510 printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n");
1511 return;
1512 }
1513 scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_();
1514 }
1515 #endif
1516 scb.partition_index = 0;
1517 scb.quant_mode = QUANT_256;
1518 scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1519 for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1520 scb.weights[w] = 0;
1521 }
1522 for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
1523 scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
1524 scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
1525 }
1526 }
1527 scb.privateProfile = ctx.config.privateProfile;
1528 symbolic_to_physical(bsd, scb, pcb);
1529 #if QUALITY_CONTROL
1530 if (calQualityEnable) {
1531 *mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0;
1532 }
1533 #endif
1534 return;
1535 }
1536
1537 #if !defined(ASTCENC_DIAGNOSTICS)
1538 float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1539 float error_threshold = ctx.config.tune_db_limit
1540 * error_weight_sum
1541 * block_is_l_scale
1542 * block_is_la_scale;
1543 #endif
1544
1545 // Set SCB and mode errors to a very high error value
1546 scb.errorval = ERROR_CALC_DEFAULT;
1547 scb.block_type = SYM_BTYPE_ERROR;
1548
1549 float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1550 ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1551 };
1552
1553 float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1554 0.0f,
1555 ctx.config.tune_2partition_early_out_limit_factor,
1556 ctx.config.tune_3partition_early_out_limit_factor,
1557 0.0f
1558 };
1559
1560 // Trial using 1 plane of weights and 1 partition.
1561
1562 // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1563 // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1564 // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1565 // compression and slightly reduces image quality.
1566
1567 float errorval_mult[2] {
1568 1.0f / ctx.config.tune_mse_overshoot,
1569 1.0f
1570 };
1571
1572 static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1573
1574 // Only enable MODE0 fast path if enabled
1575 // Never enable for 3D blocks as no "always" block modes are available
1576 int start_trial = 1;
1577 if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1578 {
1579 start_trial = 0;
1580 }
1581
1582 int quant_limit = QUANT_32;
1583 for (int i = start_trial; i < 2; i++)
1584 {
1585 TRACE_NODE(node1, "pass");
1586 trace_add_data("partition_count", 1);
1587 trace_add_data("plane_count", 1);
1588 trace_add_data("search_mode", i);
1589
1590 float errorval = compress_symbolic_block_for_partition_1plane(
1591 ctx.config.privateProfile,
1592 ctx.config, bsd, blk, i == 0,
1593 error_threshold * errorval_mult[i] * errorval_overshoot,
1594 1, 0, scb, tmpbuf, QUANT_32);
1595
1596 // Record the quant level so we can use the filter later searches
1597 const auto& bm = bsd.get_block_mode(scb.block_mode);
1598 quant_limit = bm.get_weight_quant_mode();
1599
1600 best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1601 if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i])))
1602 {
1603 trace_add_data("exit", "quality hit");
1604 goto END_OF_TESTS;
1605 }
1606 }
1607
1608 #if !defined(ASTCENC_DIAGNOSTICS)
1609 lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1610 #endif
1611
1612 block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1613
1614 // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1615 // alpha is the most likely to be non-correlated if it is present in the data.
1616 for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1617 {
1618 if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1619 {
1620 break;
1621 }
1622 TRACE_NODE(node1, "pass");
1623 trace_add_data("partition_count", 1);
1624 trace_add_data("plane_count", 2);
1625 trace_add_data("plane_component", i);
1626
1627 if (block_skip_two_plane)
1628 {
1629 trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1630 continue;
1631 }
1632
1633 if (blk.grayscale && i != 3)
1634 {
1635 trace_add_data("skip", "grayscale block");
1636 continue;
1637 }
1638
1639 if (blk.is_constant_channel(i))
1640 {
1641 trace_add_data("skip", "constant component");
1642 continue;
1643 }
1644
1645 float errorval = compress_symbolic_block_for_partition_2planes(
1646 ctx.config.privateProfile,
1647 ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1648 i, scb, tmpbuf, quant_limit);
1649
1650 // If attempting two planes is much worse than the best one plane result
1651 // then further two plane searches are unlikely to help so move on ...
1652 if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1653 {
1654 break;
1655 }
1656
1657 if (errorval < error_threshold)
1658 {
1659 trace_add_data("exit", "quality hit");
1660 goto END_OF_TESTS;
1661 }
1662 }
1663
1664 // Find best blocks for 2, 3 and 4 partitions
1665 for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1666 {
1667 unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1668
1669 unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1670
1671 unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1672 requested_trials = astc::min(requested_trials, requested_indices);
1673
1674 unsigned int actual_trials = find_best_partition_candidates(
1675 bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1676
1677 float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1678
1679 for (unsigned int i = 0; i < actual_trials; i++)
1680 {
1681 TRACE_NODE(node1, "pass");
1682 trace_add_data("partition_count", partition_count);
1683 trace_add_data("partition_index", partition_indices[i]);
1684 trace_add_data("plane_count", 1);
1685 trace_add_data("search_mode", i);
1686
1687 float errorval = compress_symbolic_block_for_partition_1plane(
1688 ctx.config.privateProfile,
1689 ctx.config, bsd, blk, false,
1690 error_threshold * errorval_overshoot,
1691 partition_count, partition_indices[i],
1692 scb, tmpbuf, quant_limit);
1693
1694 best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1695
1696 // If using N partitions doesn't improve much over using N-1 partitions then skip trying
1697 // N+1. Error can dramatically improve if the data is correlated or non-correlated and
1698 // aligns with a partitioning that suits that encoding, so for this inner loop check add
1699 // a large error scale because the "other" trial could be a lot better.
1700 float best_error = best_errorvals_for_pcount[partition_count - 1];
1701 float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1702 if (best_error > (best_error_in_prev * best_error_scale))
1703 {
1704 trace_add_data("skip", "tune_partition_early_out_limit_factor");
1705 goto END_OF_TESTS;
1706 }
1707
1708 if (errorval < error_threshold)
1709 {
1710 trace_add_data("exit", "quality hit");
1711 goto END_OF_TESTS;
1712 }
1713 }
1714
1715 // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1716 float best_error = best_errorvals_for_pcount[partition_count - 1];
1717 float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1718 if (best_error > (best_error_in_prev * best_error_scale))
1719 {
1720 trace_add_data("skip", "tune_partition_early_out_limit_factor");
1721 goto END_OF_TESTS;
1722 }
1723 }
1724
1725 trace_add_data("exit", "quality not hit");
1726
1727 END_OF_TESTS:
1728 // If we still have an error block then convert to something we can encode
1729 // TODO: Do something more sensible here, such as average color block
1730 if (scb.block_type == SYM_BTYPE_ERROR)
1731 {
1732 #if defined(ASTCENC_DIAGNOSTICS)
1733 static bool printed_once = false;
1734 if (!printed_once)
1735 {
1736 printed_once = true;
1737 printf("WARN: At least one block failed to find a valid encoding.\n"
1738 " Try increasing compression quality settings.\n\n");
1739 }
1740 #endif
1741
1742 scb.block_type = SYM_BTYPE_CONST_U16;
1743 vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1744 vint4 color_u16 = float_to_int_rtn(color_f32);
1745 store(color_u16, scb.constant_color);
1746 }
1747
1748 // Compress to a physical block
1749 scb.privateProfile = ctx.config.privateProfile;
1750 symbolic_to_physical(bsd, scb, pcb);
1751 #if QUALITY_CONTROL
1752 if (calQualityEnable) {
1753 image_block decBlk = blk;
1754 decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk);
1755 vint4 colorSumDiff = vint4::zero();
1756 for (size_t ii = 0; ii < bsd.texel_count; ii++) {
1757 vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f);
1758 vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f);
1759 vint4 colorDiff = colorRef - colorTest;
1760 colorSumDiff += colorDiff * colorDiff;
1761 }
1762 *mseBlock[R_COM] = colorSumDiff.lane<0>();
1763 *mseBlock[G_COM] = colorSumDiff.lane<1>();
1764 *mseBlock[B_COM] = colorSumDiff.lane<2>();
1765 *mseBlock[A_COM] = colorSumDiff.lane<3>();
1766 }
1767 #endif
1768 }
1769
1770 #endif
1771