1// SPDX-License-Identifier: Apache-2.0 2// ---------------------------------------------------------------------------- 3// Copyright 2011-2024 Arm Limited 4// 5// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6// use this file except in compliance with the License. You may obtain a copy 7// of the License at: 8// 9// http://www.apache.org/licenses/LICENSE-2.0 10// 11// Unless required by applicable law or agreed to in writing, software 12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14// License for the specific language governing permissions and limitations 15// under the License. 16// ---------------------------------------------------------------------------- 17 18#if !defined(ASTCENC_DECOMPRESS_ONLY) 19 20/** 21 * @brief Functions to compress a symbolic block. 22 */ 23 24#include "astcenc_internal.h" 25#include "astcenc_diagnostic_trace.h" 26 27#include <cassert> 28#ifdef ASTC_CUSTOMIZED_ENABLE 29AstcCustomizedSoManager g_astcCustomizedSoManager; 30#endif 31 32/** 33 * @brief Merge two planes of endpoints into a single vector. 34 * 35 * @param ep_plane1 The endpoints for plane 1. 36 * @param ep_plane2 The endpoints for plane 2. 37 * @param component_plane2 The color component for plane 2. 38 * @param[out] result The merged output. 39 */ 40static void merge_endpoints( 41 const endpoints& ep_plane1, 42 const endpoints& ep_plane2, 43 unsigned int component_plane2, 44 endpoints& result 45) { 46 unsigned int partition_count = ep_plane1.partition_count; 47 assert(partition_count == 1); 48 49 vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2); 50 51 result.partition_count = partition_count; 52 result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask); 53 result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask); 54} 55 56/** 57 * @brief Attempt to improve weights given a chosen configuration. 58 * 59 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per 60 * partition and per plane) and attempt to improve image quality by moving each weight up by one or 61 * down by one quantization step. 62 * 63 * This is a specialized function which only supports operating on undecimated weight grids, 64 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation 65 * is needed less often. 66 * 67 * @param decode_mode The decode mode (LDR, HDR). 68 * @param bsd The block size information. 69 * @param blk The image block color data to compress. 70 * @param[out] scb The symbolic compressed block output. 
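 *
 * @return True if any weight was adjusted, false otherwise.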
71 */ 72#if ASTCENC_NEON != 0 73static bool realign_weights_undecimated( 74 astcenc_profile decode_mode, 75 const block_size_descriptor& bsd, 76 const image_block& blk, 77 symbolic_compressed_block& scb 78) { 79 // Get the partition descriptor 80 unsigned int partition_count = scb.partition_count; 81 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); 82 83 // Get the quantization table 84 const block_mode& bm = bsd.get_block_mode(scb.block_mode); 85 unsigned int weight_quant_level = bm.quant_mode; 86 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; 87 88 unsigned int max_plane = bm.is_dual_plane; 89 int plane2_component = scb.plane2_component; 90 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); 91 92 // Decode the color endpoints 93 bool rgb_hdr; 94 bool alpha_hdr; 95 vint4 endpnt0[BLOCK_MAX_PARTITIONS]; 96 vint4 endpnt1[BLOCK_MAX_PARTITIONS]; 97 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; 98 vfloat4 offset[BLOCK_MAX_PARTITIONS]; 99 100 promise(partition_count > 0); 101 102 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 103 { 104 unpack_color_endpoints(decode_mode, 105 scb.color_formats[pa_idx], 106 scb.color_values[pa_idx], 107 rgb_hdr, alpha_hdr, 108 endpnt0[pa_idx], 109 endpnt1[pa_idx]); 110 } 111 112 uint8_t* dec_weights_uquant = scb.weights; 113 bool adjustments = false; 114 115 // For each plane and partition ... 116 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) 117 { 118 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 119 { 120 // Compute the endpoint delta for all components in current plane 121 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; 122 epd = select(epd, vint4::zero(), plane_mask); 123 124 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); 125 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); 126 } 127 128 // For each weight compute previous, current, and next errors 129 promise(bsd.texel_count > 0); 130 131 unsigned int texel = 0; 132 for (; texel + ASTCENC_SIMD_WIDTH <= bsd.texel_count; texel += ASTCENC_SIMD_WIDTH) 133 { 134 int uqw0 = dec_weights_uquant[texel]; 135 int uqw1 = dec_weights_uquant[texel + 1]; 136 int uqw2 = dec_weights_uquant[texel + 2]; 137 int uqw3 = dec_weights_uquant[texel + 3]; 138 139 vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3); 140 vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1], 141 qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]); 142 143 vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF); 144 vint4 uqw_down_vec = prev_and_next_vec & mask; 145 vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask; 146 147 vfloat4 weight_base_vec = int_to_float(uqw_vec); 148 vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec; 149 vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec; 150 151 unsigned int partition0 = pi.partition_of_texel[texel]; 152 unsigned int partition1 = pi.partition_of_texel[texel + 1]; 153 unsigned int partition2 = pi.partition_of_texel[texel + 2]; 154 unsigned int partition3 = pi.partition_of_texel[texel + 3]; 155 156 vfloat4 color_offset0 = offset[partition0]; 157 vfloat4 color_offset1 = offset[partition1]; 158 vfloat4 color_offset2 = offset[partition2]; 159 vfloat4 color_offset3 = offset[partition3]; 160 161 vfloat4 color_base0 = endpnt0f[partition0]; 162 vfloat4 color_base1 = endpnt0f[partition1]; 163 vfloat4 color_base2 = endpnt0f[partition2]; 164 vfloat4 color_base3 = endpnt0f[partition3]; 165 166 vfloat4 color0 = color_base0 
+ color_offset0 * weight_base_vec.lane<0>(); 167 vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>(); 168 vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>(); 169 vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>(); 170 171 vfloat4 orig_color0 = blk.texel(texel); 172 vfloat4 orig_color1 = blk.texel(texel + 1); 173 vfloat4 orig_color2 = blk.texel(texel + 2); 174 vfloat4 orig_color3 = blk.texel(texel + 3); 175 176 vfloat4 error_weight = blk.channel_weight; 177 178 vfloat4 color_diff0 = color0 - orig_color0; 179 vfloat4 color_diff1 = color1 - orig_color1; 180 vfloat4 color_diff2 = color2 - orig_color2; 181 vfloat4 color_diff3 = color3 - orig_color3; 182 183 vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>(); 184 vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>(); 185 vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>(); 186 vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>(); 187 188 vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>(); 189 vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>(); 190 vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>(); 191 vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>(); 192 193 float error_base0 = dot_s(color_diff0 * color_diff0, error_weight); 194 float error_base1 = dot_s(color_diff1 * color_diff1, error_weight); 195 float error_base2 = dot_s(color_diff2 * color_diff2, error_weight); 196 float error_base3 = dot_s(color_diff3 * color_diff3, error_weight); 197 198 float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight); 199 float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight); 200 float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight); 201 float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight); 202 203 float error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight); 204 float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight); 205 float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight); 206 float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight); 207 208 vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3); 209 vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3); 210 vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3); 211 212 vmask4 check_result_up = (error_up_vec < error_base_vec) & 213 (error_up_vec < error_down_vec) & (uqw_vec < vint4(64)); 214 215 vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero()); 216 check_result_down = check_result_down & (~check_result_up); 217 218 if (popcount(check_result_up | check_result_down) != 0) 219 { 220 uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up); 221 uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down); 222 223 dec_weights_uquant[texel] = uqw_vec.lane<0>(); 224 dec_weights_uquant[texel + 1] = uqw_vec.lane<1>(); 225 dec_weights_uquant[texel + 2] = uqw_vec.lane<2>(); // channel 2 226 dec_weights_uquant[texel + 3] = uqw_vec.lane<3>(); // channel 3 227 adjustments = true; 228 } 229 }; 230 231 for (; texel < bsd.texel_count; texel++) 232 { 233 int uqw = dec_weights_uquant[texel]; 234 235 uint32_t prev_and_next = qat.prev_next_values[uqw]; 236 int uqw_down = 
prev_and_next & 0xFF; 237 int uqw_up = (prev_and_next >> 8) & 0xFF; 238 239 // Interpolate the colors to create the diffs 240 float weight_base = static_cast<float>(uqw); 241 float weight_down = static_cast<float>(uqw_down - uqw); 242 float weight_up = static_cast<float>(uqw_up - uqw); 243 244 unsigned int partition = pi.partition_of_texel[texel]; 245 vfloat4 color_offset = offset[partition]; 246 vfloat4 color_base = endpnt0f[partition]; 247 248 vfloat4 color = color_base + color_offset * weight_base; 249 vfloat4 orig_color = blk.texel(texel); 250 vfloat4 error_weight = blk.channel_weight; 251 252 vfloat4 color_diff = color - orig_color; 253 vfloat4 color_diff_down = color_diff + color_offset * weight_down; 254 vfloat4 color_diff_up = color_diff + color_offset * weight_up; 255 256 float error_base = dot_s(color_diff * color_diff, error_weight); 257 float error_down = dot_s(color_diff_down * color_diff_down, error_weight); 258 float error_up = dot_s(color_diff_up * color_diff_up, error_weight); 259 260 // Check if the prev or next error is better, and if so use it 261 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) 262 { 263 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up); 264 adjustments = true; 265 } 266 else if ((error_down < error_base) && (uqw > 0)) 267 { 268 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down); 269 adjustments = true; 270 } 271 } 272 273 // Prepare iteration for plane 2 274 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; 275 plane_mask = ~plane_mask; 276 } 277 return adjustments; 278} 279#else 280static bool realign_weights_undecimated( 281 astcenc_profile decode_mode, 282 const block_size_descriptor& bsd, 283 const image_block& blk, 284 symbolic_compressed_block& scb 285) { 286 // Get the partition descriptor 287 unsigned int partition_count = scb.partition_count; 288 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); 289 290 // Get the quantization table 291 const block_mode& bm = bsd.get_block_mode(scb.block_mode); 292 unsigned int weight_quant_level = bm.quant_mode; 293 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; 294 295 unsigned int max_plane = bm.is_dual_plane; 296 int plane2_component = scb.plane2_component; 297 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); 298 299 // Decode the color endpoints 300 bool rgb_hdr; 301 bool alpha_hdr; 302 vint4 endpnt0[BLOCK_MAX_PARTITIONS]; 303 vint4 endpnt1[BLOCK_MAX_PARTITIONS]; 304 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; 305 vfloat4 offset[BLOCK_MAX_PARTITIONS]; 306 307 promise(partition_count > 0); 308 309 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 310 { 311 unpack_color_endpoints(decode_mode, 312 scb.color_formats[pa_idx], 313 scb.color_values[pa_idx], 314 rgb_hdr, alpha_hdr, 315 endpnt0[pa_idx], 316 endpnt1[pa_idx]); 317 } 318 319 uint8_t* dec_weights_uquant = scb.weights; 320 bool adjustments = false; 321 322 // For each plane and partition ... 
323 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) 324 { 325 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 326 { 327 // Compute the endpoint delta for all components in current plane 328 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; 329 epd = select(epd, vint4::zero(), plane_mask); 330 331 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); 332 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); 333 } 334 335 // For each weight compute previous, current, and next errors 336 promise(bsd.texel_count > 0); 337 for (unsigned int texel = 0; texel < bsd.texel_count; texel++) 338 { 339 int uqw = dec_weights_uquant[texel]; 340 341 uint32_t prev_and_next = qat.prev_next_values[uqw]; 342 int uqw_down = prev_and_next & 0xFF; 343 int uqw_up = (prev_and_next >> 8) & 0xFF; 344 345 // Interpolate the colors to create the diffs 346 float weight_base = static_cast<float>(uqw); 347 float weight_down = static_cast<float>(uqw_down - uqw); 348 float weight_up = static_cast<float>(uqw_up - uqw); 349 350 unsigned int partition = pi.partition_of_texel[texel]; 351 vfloat4 color_offset = offset[partition]; 352 vfloat4 color_base = endpnt0f[partition]; 353 354 vfloat4 color = color_base + color_offset * weight_base; 355 vfloat4 orig_color = blk.texel(texel); 356 vfloat4 error_weight = blk.channel_weight; 357 358 vfloat4 color_diff = color - orig_color; 359 vfloat4 color_diff_down = color_diff + color_offset * weight_down; 360 vfloat4 color_diff_up = color_diff + color_offset * weight_up; 361 362 float error_base = dot_s(color_diff * color_diff, error_weight); 363 float error_down = dot_s(color_diff_down * color_diff_down, error_weight); 364 float error_up = dot_s(color_diff_up * color_diff_up, error_weight); 365 366 // Check if the prev or next error is better, and if so use it 367 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) 368 { 369 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up); 370 adjustments = true; 371 } 372 else if ((error_down < error_base) && (uqw > 0)) 373 { 374 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down); 375 adjustments = true; 376 } 377 } 378 379 // Prepare iteration for plane 2 380 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; 381 plane_mask = ~plane_mask; 382 } 383 384 return adjustments; 385} 386#endif 387 388/** 389 * @brief Attempt to improve weights given a chosen configuration. 390 * 391 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per 392 * partition and per plane) and attempt to improve image quality by moving each weight up by one or 393 * down by one quantization step. 394 * 395 * @param decode_mode The decode mode (LDR, HDR). 396 * @param bsd The block size information. 397 * @param blk The image block color data to compress. 398 * @param[out] scb The symbolic compressed block output. 
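 *
 * @return True if any weight was adjusted, false otherwise.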
399 */ 400static bool realign_weights_decimated( 401 astcenc_profile decode_mode, 402 const block_size_descriptor& bsd, 403 const image_block& blk, 404 symbolic_compressed_block& scb 405) { 406 // Get the partition descriptor 407 unsigned int partition_count = scb.partition_count; 408 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); 409 410 // Get the quantization table 411 const block_mode& bm = bsd.get_block_mode(scb.block_mode); 412 unsigned int weight_quant_level = bm.quant_mode; 413 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; 414 415 // Get the decimation table 416 const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); 417 unsigned int weight_count = di.weight_count; 418 assert(weight_count != bsd.texel_count); 419 420 unsigned int max_plane = bm.is_dual_plane; 421 int plane2_component = scb.plane2_component; 422 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); 423 424 // Decode the color endpoints 425 bool rgb_hdr; 426 bool alpha_hdr; 427 vint4 endpnt0[BLOCK_MAX_PARTITIONS]; 428 vint4 endpnt1[BLOCK_MAX_PARTITIONS]; 429 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; 430 vfloat4 offset[BLOCK_MAX_PARTITIONS]; 431 432 promise(partition_count > 0); 433 promise(weight_count > 0); 434 435 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 436 { 437 unpack_color_endpoints(decode_mode, 438 scb.color_formats[pa_idx], 439 scb.color_values[pa_idx], 440 rgb_hdr, alpha_hdr, 441 endpnt0[pa_idx], 442 endpnt1[pa_idx]); 443 } 444 445 uint8_t* dec_weights_uquant = scb.weights; 446 bool adjustments = false; 447 448 // For each plane and partition ... 449 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) 450 { 451 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 452 { 453 // Compute the endpoint delta for all components in current plane 454 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; 455 epd = select(epd, vint4::zero(), plane_mask); 456 457 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); 458 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); 459 } 460 461 // Create an unquantized weight grid for this decimation level 462 ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS]; 463 for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH) 464 { 465 vint unquant_value(dec_weights_uquant + we_idx); 466 vfloat unquant_valuef = int_to_float(unquant_value); 467 storea(unquant_valuef, uq_weightsf + we_idx); 468 } 469 470 // For each weight compute previous, current, and next errors 471 for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++) 472 { 473 int uqw = dec_weights_uquant[we_idx]; 474 uint32_t prev_and_next = qat.prev_next_values[uqw]; 475 476 float uqw_base = uq_weightsf[we_idx]; 477 float uqw_down = static_cast<float>(prev_and_next & 0xFF); 478 float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF); 479 480 float uqw_diff_down = uqw_down - uqw_base; 481 float uqw_diff_up = uqw_up - uqw_base; 482 483 vfloat4 error_basev = vfloat4::zero(); 484 vfloat4 error_downv = vfloat4::zero(); 485 vfloat4 error_upv = vfloat4::zero(); 486 487 // Interpolate the colors to create the diffs 488 unsigned int texels_to_evaluate = di.weight_texel_count[we_idx]; 489 promise(texels_to_evaluate > 0); 490 for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++) 491 { 492 unsigned int texel = di.weight_texels_tr[te_idx][we_idx]; 493 494 float tw_base = di.texel_contrib_for_weight[te_idx][we_idx]; 495 496 float weight_base = 
                    (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
                    + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
                    + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
                    + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);

                // Ideally this would be integer rounded, but the IQ gain isn't worth the overhead
                // float weight = astc::flt_rd(weight_base + 0.5f);
                // float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
                // float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
                float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
                float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;

                unsigned int partition = pi.partition_of_texel[texel];
                vfloat4 color_offset = offset[partition];
                vfloat4 color_base = endpnt0f[partition];

                vfloat4 color = color_base + color_offset * weight_base;
                vfloat4 orig_color = blk.texel(texel);

                vfloat4 color_diff = color - orig_color;
                vfloat4 color_down_diff = color_diff + color_offset * weight_down;
                vfloat4 color_up_diff = color_diff + color_offset * weight_up;

                error_basev += color_diff * color_diff;
                error_downv += color_down_diff * color_down_diff;
                error_upv += color_up_diff * color_up_diff;
            }

            vfloat4 error_weight = blk.channel_weight;
            float error_base = hadd_s(error_basev * error_weight);
            float error_down = hadd_s(error_downv * error_weight);
            float error_up = hadd_s(error_upv * error_weight);

            // Check if the prev or next error is better, and if so use it
            if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
            {
                uq_weightsf[we_idx] = uqw_up;
                dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
                adjustments = true;
            }
            else if ((error_down < error_base) && (uqw > 0))
            {
                uq_weightsf[we_idx] = uqw_down;
                dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
                adjustments = true;
            }
        }

        // Prepare iteration for plane 2
        dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
        plane_mask = ~plane_mask;
    }

    return adjustments;
}

/**
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
 *
 * @param config The compressor configuration.
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param only_always True if we only use "always" percentile block modes.
 * @param tune_errorval_threshold The error value threshold.
 * @param partition_count The partition count.
 * @param partition_index The partition index if @c partition_count is 2-4.
 * @param[out] scb The symbolic compressed block output.
 * @param[out] tmpbuf The quantized weights for plane 1.
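 * @param quant_limit The maximum weight quantization level usable by the trials.
 *
 * @return The lowest error value found across the trial candidates.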
564 */ 565static float compress_symbolic_block_for_partition_1plane( 566 QualityProfile privateProfile, 567 const astcenc_config& config, 568 const block_size_descriptor& bsd, 569 const image_block& blk, 570 bool only_always, 571 float tune_errorval_threshold, 572 unsigned int partition_count, 573 unsigned int partition_index, 574 symbolic_compressed_block& scb, 575 compression_working_buffers& tmpbuf, 576 int quant_limit 577) { 578 promise(partition_count > 0); 579 promise(config.tune_candidate_limit > 0); 580 promise(config.tune_refinement_limit > 0); 581 582 int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); 583 584 auto compute_difference = &compute_symbolic_block_difference_1plane; 585 if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM)) 586 { 587 compute_difference = &compute_symbolic_block_difference_1plane_1partition; 588 } 589 590 const auto& pi = bsd.get_partition_info(partition_count, partition_index); 591 592 // Compute ideal weights and endpoint colors, with no quantization or decimation 593 endpoints_and_weights& ei = tmpbuf.ei1; 594 compute_ideal_colors_and_weights_1plane(blk, pi, ei); 595 596 // Compute ideal weights and endpoint colors for every decimation 597 float* dec_weights_ideal = tmpbuf.dec_weights_ideal; 598 uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant; 599 600 // For each decimation mode, compute an ideal set of weights with no quantization 601 unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always 602 : bsd.decimation_mode_count_selected; 603 promise(max_decimation_modes > 0); 604 for (unsigned int i = 0; i < max_decimation_modes; i++) 605 { 606 const auto& dm = bsd.get_decimation_mode(i); 607 if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant))) 608 { 609 continue; 610 } 611 612 const auto& di = bsd.get_decimation_info(i); 613 614 compute_ideal_weights_for_decimation( 615 ei, 616 di, 617 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); 618 } 619 620 // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal 621 // weight pair, compute the smallest weight that will result in a color value greater than 1 622 vfloat4 min_ep(10.0f); 623 for (unsigned int i = 0; i < partition_count; i++) 624 { 625 vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]); 626 627 vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep); 628 min_ep = select(min_ep, ep, use_ep); 629 } 630 631 float min_wt_cutoff = hmin_s(min_ep); 632 633 // For each mode, use the angular method to compute a shift 634 compute_angular_endpoints_1plane( 635 privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); 636 637 float* weight_low_value = tmpbuf.weight_low_value1; 638 float* weight_high_value = tmpbuf.weight_high_value1; 639 int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts; 640 float* qwt_errors = tmpbuf.qwt_errors; 641 642 // For each mode (which specifies a decimation and a quantization): 643 // * Compute number of bits needed for the quantized weights 644 // * Generate an optimized set of quantized weights 645 // * Compute quantization errors for the mode 646 647 648 static const int8_t free_bits_for_partition_count[4] { 649 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS 650 }; 651 652 unsigned int max_block_modes = only_always ? 
bsd.block_mode_count_1plane_always 653 : bsd.block_mode_count_1plane_selected; 654 promise(max_block_modes > 0); 655 for (unsigned int i = 0; i < max_block_modes; i++) 656 { 657 const block_mode& bm = bsd.block_modes[i]; 658 659 if (bm.quant_mode > max_weight_quant) 660 { 661 qwt_errors[i] = 1e38f; 662 continue; 663 } 664 665 assert(!bm.is_dual_plane); 666 int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits; 667 if (bitcount <= 0) 668 { 669 qwt_errors[i] = 1e38f; 670 continue; 671 } 672 673 if (weight_high_value[i] > 1.02f * min_wt_cutoff) 674 { 675 weight_high_value[i] = 1.0f; 676 } 677 678 int decimation_mode = bm.decimation_mode; 679 const auto& di = bsd.get_decimation_info(decimation_mode); 680 681 qwt_bitcounts[i] = static_cast<int8_t>(bitcount); 682 683 ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; 684 685 // Generate the optimized set of weights for the weight mode 686 compute_quantized_weights_for_decimation( 687 di, 688 weight_low_value[i], weight_high_value[i], 689 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode, 690 dec_weights_uquantf, 691 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i, 692 bm.get_weight_quant_mode()); 693 694 // Compute weight quantization errors for the block mode 695 qwt_errors[i] = compute_error_of_weight_set_1plane( 696 ei, 697 di, 698 dec_weights_uquantf); 699 } 700 701 // Decide the optimal combination of color endpoint encodings and weight encodings 702 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS]; 703 int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES]; 704 705 quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES]; 706 quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; 707 708 unsigned int candidate_count = compute_ideal_endpoint_formats( 709 privateProfile, 710 pi, blk, ei.ep, qwt_bitcounts, qwt_errors, 711 config.tune_candidate_limit, 0, max_block_modes, 712 partition_format_specifiers, block_mode_index, 713 color_quant_level, color_quant_level_mod, tmpbuf); 714 715 // Iterate over the N believed-to-be-best modes to find out which one is actually best 716 float best_errorval_in_mode = ERROR_CALC_DEFAULT; 717 float best_errorval_in_scb = scb.errorval; 718 719 for (unsigned int i = 0; i < candidate_count; i++) 720 { 721 TRACE_NODE(node0, "candidate"); 722 723 const int bm_packed_index = block_mode_index[i]; 724 assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected)); 725 const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; 726 727 int decimation_mode = qw_bm.decimation_mode; 728 const auto& di = bsd.get_decimation_info(decimation_mode); 729 promise(di.weight_count > 0); 730 731 trace_add_data("weight_x", di.weight_x); 732 trace_add_data("weight_y", di.weight_y); 733 trace_add_data("weight_z", di.weight_z); 734 trace_add_data("weight_quant", qw_bm.quant_mode); 735 736 // Recompute the ideal color endpoints before storing them 737 vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS]; 738 vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS]; 739 740 symbolic_compressed_block workscb; 741 endpoints workep = ei.ep; 742 743 uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index; 744 745 for (unsigned int j = 0; j < di.weight_count; j++) 746 { 747 workscb.weights[j] = u8_weight_src[j]; 748 } 749 750 for (unsigned int l = 0; l < config.tune_refinement_limit; l++) 751 { 752 recompute_ideal_colors_1plane( 753 blk, pi, di, workscb.weights, 754 workep, rgbs_colors, rgbo_colors); 755 756 // Quantize the 
            // chosen color, tracking if worth trying the mod value
            bool all_same = color_quant_level[i] != color_quant_level_mod[i];
            for (unsigned int j = 0; j < partition_count; j++)
            {
                workscb.color_formats[j] = pack_color_endpoints(
                    privateProfile,
                    workep.endpt0[j],
                    workep.endpt1[j],
                    rgbs_colors[j],
                    rgbo_colors[j],
                    partition_format_specifiers[i][j],
                    workscb.color_values[j],
                    color_quant_level[i]);

                all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
            }

            // If all the color endpoint modes are the same, we get a few more bits to store colors;
            // let's see if we can take advantage of this: requantize all the colors and see if the
            // endpoint modes remain the same.
            workscb.color_formats_matched = 0;
            if (partition_count >= 2 && all_same)
            {
                uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
                uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
                bool all_same_mod = true;
                for (unsigned int j = 0; j < partition_count; j++)
                {
                    color_formats_mod[j] = pack_color_endpoints(
                        privateProfile,
                        workep.endpt0[j],
                        workep.endpt1[j],
                        rgbs_colors[j],
                        rgbo_colors[j],
                        partition_format_specifiers[i][j],
                        colorvals[j],
                        color_quant_level_mod[i]);

                    // Early out as soon as it's no longer possible to use mod
                    if (color_formats_mod[j] != color_formats_mod[0])
                    {
                        all_same_mod = false;
                        break;
                    }
                }

                if (all_same_mod)
                {
                    workscb.color_formats_matched = 1;
                    for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
                    {
                        for (unsigned int k = 0; k < 8; k++)
                        {
                            workscb.color_values[j][k] = colorvals[j][k];
                        }

                        workscb.color_formats[j] = color_formats_mod[j];
                    }
                }
            }

            // Store header fields
            workscb.partition_count = static_cast<uint8_t>(partition_count);
            workscb.partition_index = static_cast<uint16_t>(partition_index);
            workscb.plane2_component = -1;
            workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
            workscb.block_mode = qw_bm.mode_index;
            workscb.block_type = SYM_BTYPE_NONCONST;

            if (privateProfile == HIGH_SPEED_PROFILE)
            {
                workscb.errorval = 0;
                scb = workscb;
                break;
            }

            // Pre-realign test
            if (l == 0)
            {
                float errorval = compute_difference(config, bsd, workscb, blk);
                if (errorval == -ERROR_CALC_DEFAULT)
                {
                    errorval = -errorval;
                    workscb.block_type = SYM_BTYPE_ERROR;
                }

                trace_add_data("error_prerealign", errorval);
                best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

                // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
                // iteration can help more so we give it an extra 8% leeway. Use this knowledge to
                // drive a heuristic to skip blocks that are unlikely to catch up with the best
                // block we have already.
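                // i.e. tolerate an error up to (1.08 + 0.045 * iters_remaining) times the current best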
                unsigned int iters_remaining = config.tune_refinement_limit - l;
                float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
                if (errorval > (threshold * best_errorval_in_scb))
                {
                    break;
                }

                if (errorval < best_errorval_in_scb)
                {
                    best_errorval_in_scb = errorval;
                    workscb.errorval = errorval;
                    scb = workscb;

                    if (errorval < tune_errorval_threshold)
                    {
                        // Skip remaining candidates - this is "good enough"
                        i = candidate_count;
                        break;
                    }
                }
            }

            bool adjustments;
            if (di.weight_count != bsd.texel_count)
            {
                adjustments = realign_weights_decimated(
                    config.profile, bsd, blk, workscb);
            }
            else
            {
                adjustments = realign_weights_undecimated(
                    config.profile, bsd, blk, workscb);
            }

            // Post-realign test
            float errorval = compute_difference(config, bsd, workscb, blk);
            if (errorval == -ERROR_CALC_DEFAULT)
            {
                errorval = -errorval;
                workscb.block_type = SYM_BTYPE_ERROR;
            }

            trace_add_data("error_postrealign", errorval);
            best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

            // Average refinement improvement is 3.5% per iteration, so skip blocks that are
            // unlikely to catch up with the best block we have already. Assume 4.5% per step to
            // give the benefit of the doubt ...
            unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
            float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
            if (errorval > (threshold * best_errorval_in_scb))
            {
                break;
            }

            if (errorval < best_errorval_in_scb)
            {
                best_errorval_in_scb = errorval;
                workscb.errorval = errorval;
                scb = workscb;

                if (errorval < tune_errorval_threshold)
                {
                    // Skip remaining candidates - this is "good enough"
                    i = candidate_count;
                    break;
                }
            }

            if (!adjustments)
            {
                break;
            }
        }
    }

    return best_errorval_in_mode;
}

/**
 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
 *
 * @param config The compressor configuration.
 * @param bsd The block size information.
 * @param blk The image block color data to compress.
 * @param tune_errorval_threshold The error value threshold.
 * @param plane2_component The component index for the second plane of weights.
 * @param[out] scb The symbolic compressed block output.
 * @param[out] tmpbuf The quantized weights for plane 1.
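 * @param quant_limit The maximum weight quantization level usable by the trials.
 *
 * @return The lowest error value found across the trial candidates.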
936 */ 937static float compress_symbolic_block_for_partition_2planes( 938 QualityProfile privateProfile, 939 const astcenc_config& config, 940 const block_size_descriptor& bsd, 941 const image_block& blk, 942 float tune_errorval_threshold, 943 unsigned int plane2_component, 944 symbolic_compressed_block& scb, 945 compression_working_buffers& tmpbuf, 946 int quant_limit 947) { 948 promise(config.tune_candidate_limit > 0); 949 promise(config.tune_refinement_limit > 0); 950 promise(bsd.decimation_mode_count_selected > 0); 951 952 int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); 953 954 // Compute ideal weights and endpoint colors, with no quantization or decimation 955 endpoints_and_weights& ei1 = tmpbuf.ei1; 956 endpoints_and_weights& ei2 = tmpbuf.ei2; 957 958 compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2); 959 960 // Compute ideal weights and endpoint colors for every decimation 961 float* dec_weights_ideal = tmpbuf.dec_weights_ideal; 962 uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant; 963 964 // For each decimation mode, compute an ideal set of weights with no quantization 965 for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++) 966 { 967 const auto& dm = bsd.get_decimation_mode(i); 968 if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant))) 969 { 970 continue; 971 } 972 973 const auto& di = bsd.get_decimation_info(i); 974 975 compute_ideal_weights_for_decimation( 976 ei1, 977 di, 978 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); 979 980 compute_ideal_weights_for_decimation( 981 ei2, 982 di, 983 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET); 984 } 985 986 // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal 987 // weight pair, compute the smallest weight that will result in a color value greater than 1 988 vfloat4 min_ep1(10.0f); 989 vfloat4 min_ep2(10.0f); 990 991 vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]); 992 vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1); 993 min_ep1 = select(min_ep1, ep1, use_ep1); 994 995 vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]); 996 vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2); 997 min_ep2 = select(min_ep2, ep2, use_ep2); 998 999 vfloat4 err_max(ERROR_CALC_DEFAULT); 1000 vmask4 err_mask = vint4::lane_id() == vint4(plane2_component); 1001 1002 // Set the plane2 component to max error in ep1 1003 min_ep1 = select(min_ep1, err_max, err_mask); 1004 1005 float min_wt_cutoff1 = hmin_s(min_ep1); 1006 1007 // Set the minwt2 to the plane2 component min in ep2 1008 float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask)); 1009 1010 compute_angular_endpoints_2planes( 1011 privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); 1012 1013 // For each mode (which specifies a decimation and a quantization): 1014 // * Compute number of bits needed for the quantized weights 1015 // * Generate an optimized set of quantized weights 1016 // * Compute quantization errors for the mode 1017 1018 float* weight_low_value1 = tmpbuf.weight_low_value1; 1019 float* weight_high_value1 = tmpbuf.weight_high_value1; 1020 float* weight_low_value2 = tmpbuf.weight_low_value2; 1021 float* weight_high_value2 = tmpbuf.weight_high_value2; 1022 1023 int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts; 1024 float* qwt_errors = tmpbuf.qwt_errors; 1025 1026 unsigned int start_2plane = bsd.block_mode_count_1plane_selected; 1027 
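    // Dual plane block modes are stored after the single plane modes in the block mode table,
    // so the packed index range below covers only the 2-plane candidates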
unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected; 1028 1029 for (unsigned int i = start_2plane; i < end_2plane; i++) 1030 { 1031 const block_mode& bm = bsd.block_modes[i]; 1032 assert(bm.is_dual_plane); 1033 1034 if (bm.quant_mode > max_weight_quant) 1035 { 1036 qwt_errors[i] = 1e38f; 1037 continue; 1038 } 1039 1040 qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits); 1041 1042 if (weight_high_value1[i] > 1.02f * min_wt_cutoff1) 1043 { 1044 weight_high_value1[i] = 1.0f; 1045 } 1046 1047 if (weight_high_value2[i] > 1.02f * min_wt_cutoff2) 1048 { 1049 weight_high_value2[i] = 1.0f; 1050 } 1051 1052 unsigned int decimation_mode = bm.decimation_mode; 1053 const auto& di = bsd.get_decimation_info(decimation_mode); 1054 1055 ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; 1056 1057 // Generate the optimized set of weights for the mode 1058 compute_quantized_weights_for_decimation( 1059 di, 1060 weight_low_value1[i], 1061 weight_high_value1[i], 1062 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode, 1063 dec_weights_uquantf, 1064 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i, 1065 bm.get_weight_quant_mode()); 1066 1067 compute_quantized_weights_for_decimation( 1068 di, 1069 weight_low_value2[i], 1070 weight_high_value2[i], 1071 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET, 1072 dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET, 1073 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET, 1074 bm.get_weight_quant_mode()); 1075 1076 // Compute weight quantization errors for the block mode 1077 qwt_errors[i] = compute_error_of_weight_set_2planes( 1078 ei1, 1079 ei2, 1080 di, 1081 dec_weights_uquantf, 1082 dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET); 1083 } 1084 1085 // Decide the optimal combination of color endpoint encodings and weight encodings 1086 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS]; 1087 int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES]; 1088 1089 quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES]; 1090 quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; 1091 1092 endpoints epm; 1093 merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm); 1094 1095 const auto& pi = bsd.get_partition_info(1, 0); 1096 unsigned int candidate_count = compute_ideal_endpoint_formats( 1097 config.privateProfile, 1098 pi, blk, epm, qwt_bitcounts, qwt_errors, 1099 config.tune_candidate_limit, 1100 bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected, 1101 partition_format_specifiers, block_mode_index, 1102 color_quant_level, color_quant_level_mod, tmpbuf); 1103 1104 // Iterate over the N believed-to-be-best modes to find out which one is actually best 1105 float best_errorval_in_mode = ERROR_CALC_DEFAULT; 1106 float best_errorval_in_scb = scb.errorval; 1107 1108 for (unsigned int i = 0; i < candidate_count; i++) 1109 { 1110 TRACE_NODE(node0, "candidate"); 1111 1112 const int bm_packed_index = block_mode_index[i]; 1113 assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) && 1114 bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected)); 1115 const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; 1116 1117 int decimation_mode = qw_bm.decimation_mode; 1118 const auto& di = bsd.get_decimation_info(decimation_mode); 1119 promise(di.weight_count > 0); 1120 1121 trace_add_data("weight_x", di.weight_x); 1122 trace_add_data("weight_y", di.weight_y); 1123 trace_add_data("weight_z", di.weight_z); 
        trace_add_data("weight_quant", qw_bm.quant_mode);

        vfloat4 rgbs_color;
        vfloat4 rgbo_color;

        symbolic_compressed_block workscb;
        endpoints workep = epm;

        uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
        uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;

        for (unsigned int j = 0; j < di.weight_count; j++)
        {
            workscb.weights[j] = u8_weight1_src[j];
            workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
        }

        for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
        {
            recompute_ideal_colors_2planes(
                blk, bsd, di,
                workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
                workep, rgbs_color, rgbo_color, plane2_component);

            // Quantize the chosen color
            workscb.color_formats[0] = pack_color_endpoints(
                privateProfile,
                workep.endpt0[0],
                workep.endpt1[0],
                rgbs_color, rgbo_color,
                partition_format_specifiers[i][0],
                workscb.color_values[0],
                color_quant_level[i]);

            // Store header fields
            workscb.partition_count = 1;
            workscb.partition_index = 0;
            workscb.quant_mode = color_quant_level[i];
            workscb.color_formats_matched = 0;
            workscb.block_mode = qw_bm.mode_index;
            workscb.plane2_component = static_cast<int8_t>(plane2_component);
            workscb.block_type = SYM_BTYPE_NONCONST;

            // Pre-realign test
            if (l == 0)
            {
                float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
                if (errorval == -ERROR_CALC_DEFAULT)
                {
                    errorval = -errorval;
                    workscb.block_type = SYM_BTYPE_ERROR;
                }

                trace_add_data("error_prerealign", errorval);
                best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

                // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
                // iteration can help more so we give it an extra 8% leeway. Use this knowledge to
                // drive a heuristic to skip blocks that are unlikely to catch up with the best
                // block we have already.
                unsigned int iters_remaining = config.tune_refinement_limit - l;
                float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
                if (errorval > (threshold * best_errorval_in_scb))
                {
                    break;
                }

                if (errorval < best_errorval_in_scb)
                {
                    best_errorval_in_scb = errorval;
                    workscb.errorval = errorval;
                    scb = workscb;

                    if (errorval < tune_errorval_threshold)
                    {
                        // Skip remaining candidates - this is "good enough"
                        i = candidate_count;
                        break;
                    }
                }
            }

            // Perform a final pass over the weights to try to improve them.
            bool adjustments;
            if (di.weight_count != bsd.texel_count)
            {
                adjustments = realign_weights_decimated(
                    config.profile, bsd, blk, workscb);
            }
            else
            {
                adjustments = realign_weights_undecimated(
                    config.profile, bsd, blk, workscb);
            }

            // Post-realign test
            float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
            if (errorval == -ERROR_CALC_DEFAULT)
            {
                errorval = -errorval;
                workscb.block_type = SYM_BTYPE_ERROR;
            }

            trace_add_data("error_postrealign", errorval);
            best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

            // Average refinement improvement is 3.5% per iteration, so skip blocks that are
            // unlikely to catch up with the best block we have already. Assume 4.5% per step to
            // give the benefit of the doubt ...
            unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
            float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
            if (errorval > (threshold * best_errorval_in_scb))
            {
                break;
            }

            if (errorval < best_errorval_in_scb)
            {
                best_errorval_in_scb = errorval;
                workscb.errorval = errorval;
                scb = workscb;

                if (errorval < tune_errorval_threshold)
                {
                    // Skip remaining candidates - this is "good enough"
                    i = candidate_count;
                    break;
                }
            }

            if (!adjustments)
            {
                break;
            }
        }
    }

    return best_errorval_in_mode;
}

/**
 * @brief Determine the lowest cross-channel correlation factor.
 *
 * @param texels_per_block The number of texels in a block.
 * @param blk The image block color data to compress.
 *
 * @return The lowest correlation factor.
 */
static float prepare_block_statistics(
    int texels_per_block,
    const image_block& blk
) {
    // Compute the covariance matrix, as a collection of 10 scalars that form the upper triangle of
    // the matrix. The matrix is symmetric, so this is all we need for this use case.
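    // Accumulate weighted channel sums and raw (co)moments; these are converted into variances,
    // covariances, and finally correlation coefficients below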
1278 float rs = 0.0f; 1279 float gs = 0.0f; 1280 float bs = 0.0f; 1281 float as = 0.0f; 1282 float rr_var = 0.0f; 1283 float gg_var = 0.0f; 1284 float bb_var = 0.0f; 1285 float aa_var = 0.0f; 1286 float rg_cov = 0.0f; 1287 float rb_cov = 0.0f; 1288 float ra_cov = 0.0f; 1289 float gb_cov = 0.0f; 1290 float ga_cov = 0.0f; 1291 float ba_cov = 0.0f; 1292 1293 float weight_sum = 0.0f; 1294 1295 promise(texels_per_block > 0); 1296 for (int i = 0; i < texels_per_block; i++) 1297 { 1298 float weight = hadd_s(blk.channel_weight) / 4.0f; 1299 assert(weight >= 0.0f); 1300 weight_sum += weight; 1301 1302 float r = blk.data_r[i]; 1303 float g = blk.data_g[i]; 1304 float b = blk.data_b[i]; 1305 float a = blk.data_a[i]; 1306 1307 float rw = r * weight; 1308 rs += rw; 1309 rr_var += r * rw; 1310 rg_cov += g * rw; 1311 rb_cov += b * rw; 1312 ra_cov += a * rw; 1313 1314 float gw = g * weight; 1315 gs += gw; 1316 gg_var += g * gw; 1317 gb_cov += b * gw; 1318 ga_cov += a * gw; 1319 1320 float bw = b * weight; 1321 bs += bw; 1322 bb_var += b * bw; 1323 ba_cov += a * bw; 1324 1325 float aw = a * weight; 1326 as += aw; 1327 aa_var += a * aw; 1328 } 1329 1330 float rpt = 1.0f / astc::max(weight_sum, 1e-7f); 1331 1332 rr_var -= rs * (rs * rpt); 1333 rg_cov -= gs * (rs * rpt); 1334 rb_cov -= bs * (rs * rpt); 1335 ra_cov -= as * (rs * rpt); 1336 1337 gg_var -= gs * (gs * rpt); 1338 gb_cov -= bs * (gs * rpt); 1339 ga_cov -= as * (gs * rpt); 1340 1341 bb_var -= bs * (bs * rpt); 1342 ba_cov -= as * (bs * rpt); 1343 1344 aa_var -= as * (as * rpt); 1345 1346 // These will give a NaN if a channel is constant - these are fixed up in the next step 1347 rg_cov *= astc::rsqrt(rr_var * gg_var); 1348 rb_cov *= astc::rsqrt(rr_var * bb_var); 1349 ra_cov *= astc::rsqrt(rr_var * aa_var); 1350 gb_cov *= astc::rsqrt(gg_var * bb_var); 1351 ga_cov *= astc::rsqrt(gg_var * aa_var); 1352 ba_cov *= astc::rsqrt(bb_var * aa_var); 1353 1354 if (astc::isnan(rg_cov)) rg_cov = 1.0f; 1355 if (astc::isnan(rb_cov)) rb_cov = 1.0f; 1356 if (astc::isnan(ra_cov)) ra_cov = 1.0f; 1357 if (astc::isnan(gb_cov)) gb_cov = 1.0f; 1358 if (astc::isnan(ga_cov)) ga_cov = 1.0f; 1359 if (astc::isnan(ba_cov)) ba_cov = 1.0f; 1360 1361 float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov)); 1362 lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov)); 1363 lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov)); 1364 lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov)); 1365 lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov)); 1366 1367 // Diagnostic trace points 1368 trace_add_data("min_r", blk.data_min.lane<0>()); 1369 trace_add_data("max_r", blk.data_max.lane<0>()); 1370 trace_add_data("min_g", blk.data_min.lane<1>()); 1371 trace_add_data("max_g", blk.data_max.lane<1>()); 1372 trace_add_data("min_b", blk.data_min.lane<2>()); 1373 trace_add_data("max_b", blk.data_max.lane<2>()); 1374 trace_add_data("min_a", blk.data_min.lane<3>()); 1375 trace_add_data("max_a", blk.data_max.lane<3>()); 1376 trace_add_data("cov_rg", fabsf(rg_cov)); 1377 trace_add_data("cov_rb", fabsf(rb_cov)); 1378 trace_add_data("cov_ra", fabsf(ra_cov)); 1379 trace_add_data("cov_gb", fabsf(gb_cov)); 1380 trace_add_data("cov_ga", fabsf(ga_cov)); 1381 trace_add_data("cov_ba", fabsf(ba_cov)); 1382 1383 return lowest_correlation; 1384} 1385 1386/* See header for documentation. 
*/ 1387void compress_block( 1388 const astcenc_contexti& ctx, 1389 const image_block& blk, 1390 uint8_t pcb[16], 1391#if QUALITY_CONTROL 1392 compression_working_buffers& tmpbuf, 1393 bool calQualityEnable, 1394 int32_t *mseBlock[RGBA_COM] 1395#else 1396 compression_working_buffers& tmpbuf 1397#endif 1398 ) 1399{ 1400 astcenc_profile decode_mode = ctx.config.profile; 1401 symbolic_compressed_block scb; 1402 const block_size_descriptor& bsd = *ctx.bsd; 1403 float lowest_correl; 1404 1405 TRACE_NODE(node0, "block"); 1406 trace_add_data("pos_x", blk.xpos); 1407 trace_add_data("pos_y", blk.ypos); 1408 trace_add_data("pos_z", blk.zpos); 1409 1410 // Set stricter block targets for luminance data as we have more bits to play with 1411 bool block_is_l = blk.is_luminance(); 1412 float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f; 1413 1414 // Set slightly stricter block targets for lumalpha data as we have more bits to play with 1415 bool block_is_la = blk.is_luminancealpha(); 1416 float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f; 1417 1418 bool block_skip_two_plane = false; 1419 int max_partitions; 1420 if (ctx.config.privateProfile == HIGH_SPEED_PROFILE) 1421 { 1422 max_partitions = 1; 1423 } 1424#ifdef ASTC_CUSTOMIZED_ENABLE 1425 else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE) 1426 { 1427 if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() || 1428 g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr) 1429 { 1430 printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n"); 1431 return; 1432 } 1433 max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_(); 1434 } 1435#endif 1436 else 1437 { 1438 max_partitions = ctx.config.tune_partition_count_limit; 1439 } 1440 1441 unsigned int requested_partition_indices[3] { 1442 ctx.config.tune_2partition_index_limit, 1443 ctx.config.tune_3partition_index_limit, 1444 ctx.config.tune_4partition_index_limit 1445 }; 1446 1447 unsigned int requested_partition_trials[3] { 1448 ctx.config.tune_2partitioning_candidate_limit, 1449 ctx.config.tune_3partitioning_candidate_limit, 1450 ctx.config.tune_4partitioning_candidate_limit 1451 }; 1452 1453#if defined(ASTCENC_DIAGNOSTICS) 1454 // Do this early in diagnostic builds so we can dump uniform metrics 1455 // for every block. Do it later in release builds to avoid redundant work! 
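    // The error threshold scales the configured dB limit by the summed channel weighting for the
    // block, tightened further for luminance and luminance+alpha blocks (see scales above)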
1456 float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count; 1457 float error_threshold = ctx.config.tune_db_limit 1458 * error_weight_sum 1459 * block_is_l_scale 1460 * block_is_la_scale; 1461 1462 lowest_correl = prepare_block_statistics(bsd.texel_count, blk); 1463 trace_add_data("lowest_correl", lowest_correl); 1464 trace_add_data("tune_error_threshold", error_threshold); 1465#endif 1466 1467 // Detected a constant-color block 1468 if (all(blk.data_min == blk.data_max)) 1469 { 1470 TRACE_NODE(node1, "pass"); 1471 trace_add_data("partition_count", 0); 1472 trace_add_data("plane_count", 1); 1473 1474 scb.partition_count = 0; 1475 1476 // Encode as FP16 if using HDR 1477 if ((decode_mode == ASTCENC_PRF_HDR) || 1478 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A)) 1479 { 1480 scb.block_type = SYM_BTYPE_CONST_F16; 1481 vint4 color_f16 = float_to_float16(blk.origin_texel); 1482 store(color_f16, scb.constant_color); 1483 } 1484 // Encode as UNORM16 if NOT using HDR 1485 else 1486 { 1487 scb.block_type = SYM_BTYPE_CONST_U16; 1488 vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f; 1489 vint4 color_u16 = float_to_int_rtn(color_f32); 1490 store(color_u16, scb.constant_color); 1491 } 1492 1493 trace_add_data("exit", "quality hit"); 1494 if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE) 1495 { 1496 scb.block_type = SYM_BTYPE_NONCONST; 1497 scb.partition_count = 1; 1498 scb.color_formats_matched = 0; 1499 scb.plane2_component = -1; 1500 if (ctx.config.privateProfile == HIGH_SPEED_PROFILE) 1501 { 1502 scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE; 1503 } 1504#ifdef ASTC_CUSTOMIZED_ENABLE 1505 else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE) 1506 { 1507 if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() || 1508 g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr) 1509 { 1510 printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n"); 1511 return; 1512 } 1513 scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_(); 1514 } 1515#endif 1516 scb.partition_index = 0; 1517 scb.quant_mode = QUANT_256; 1518 scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE 1519 for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE 1520 scb.weights[w] = 0; 1521 } 1522 for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit 1523 scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte 1524 scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte 1525 } 1526 } 1527 scb.privateProfile = ctx.config.privateProfile; 1528 symbolic_to_physical(bsd, scb, pcb); 1529#if QUALITY_CONTROL 1530 if (calQualityEnable) { 1531 *mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0; 1532 } 1533#endif 1534 return; 1535 } 1536 1537#if !defined(ASTCENC_DIAGNOSTICS) 1538 float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count; 1539 float error_threshold = ctx.config.tune_db_limit 1540 * error_weight_sum 1541 * block_is_l_scale 1542 * block_is_la_scale; 1543#endif 1544 1545 // Set SCB and mode errors to a very high error value 1546 scb.errorval = ERROR_CALC_DEFAULT; 1547 scb.block_type = SYM_BTYPE_ERROR; 1548 1549 float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] { 1550 ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT 1551 }; 1552 1553 float 
    exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
        0.0f,
        ctx.config.tune_2partition_early_out_limit_factor,
        ctx.config.tune_3partition_early_out_limit_factor,
        0.0f
    };

    // Trial using 1 plane of weights and 1 partition.

    // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
    // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
    // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
    // compression and slightly reduces image quality.

    float errorval_mult[2] {
        1.0f / ctx.config.tune_mse_overshoot,
        1.0f
    };

    float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;

    // Only enable the MODE0 fast path if the search tuning requests it
    // Never enable for 3D blocks as no "always" block modes are available
    int start_trial = 1;
    if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
    {
        start_trial = 0;
    }

    int quant_limit = QUANT_32;
    for (int i = start_trial; i < 2; i++)
    {
        TRACE_NODE(node1, "pass");
        trace_add_data("partition_count", 1);
        trace_add_data("plane_count", 1);
        trace_add_data("search_mode", i);

        float errorval = compress_symbolic_block_for_partition_1plane(
            ctx.config.privateProfile,
            ctx.config, bsd, blk, i == 0,
            error_threshold * errorval_mult[i] * errorval_overshoot,
            1, 0, scb, tmpbuf, QUANT_32);

        // Record the quant level so we can use it to filter later searches
        const auto& bm = bsd.get_block_mode(scb.block_mode);
        quant_limit = bm.get_weight_quant_mode();

        best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
        if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i])))
        {
            trace_add_data("exit", "quality hit");
            goto END_OF_TESTS;
        }
    }

#if !defined(ASTCENC_DIAGNOSTICS)
    lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
#endif

    block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;

    // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
    // alpha is the most likely to be non-correlated if it is present in the data.
    for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
    {
        if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
        {
            break;
        }

        TRACE_NODE(node1, "pass");
        trace_add_data("partition_count", 1);
        trace_add_data("plane_count", 2);
        trace_add_data("plane_component", i);

        if (block_skip_two_plane)
        {
            trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
            continue;
        }

        if (blk.grayscale && i != 3)
        {
            trace_add_data("skip", "grayscale block");
            continue;
        }

        if (blk.is_constant_channel(i))
        {
            trace_add_data("skip", "constant component");
            continue;
        }

        float errorval = compress_symbolic_block_for_partition_2planes(
            ctx.config.privateProfile,
            ctx.config, bsd, blk, error_threshold * errorval_overshoot,
            i, scb, tmpbuf, quant_limit);

        // If attempting two planes is much worse than the best one plane result
        // then further two plane searches are unlikely to help so move on ...
1652 if (errorval > (best_errorvals_for_pcount[0] * 1.85f)) 1653 { 1654 break; 1655 } 1656 1657 if (errorval < error_threshold) 1658 { 1659 trace_add_data("exit", "quality hit"); 1660 goto END_OF_TESTS; 1661 } 1662 } 1663 1664 // Find best blocks for 2, 3 and 4 partitions 1665 for (int partition_count = 2; partition_count <= max_partitions; partition_count++) 1666 { 1667 unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES]; 1668 1669 unsigned int requested_indices = requested_partition_indices[partition_count - 2]; 1670 1671 unsigned int requested_trials = requested_partition_trials[partition_count - 2]; 1672 requested_trials = astc::min(requested_trials, requested_indices); 1673 1674 unsigned int actual_trials = find_best_partition_candidates( 1675 bsd, blk, partition_count, requested_indices, partition_indices, requested_trials); 1676 1677 float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2]; 1678 1679 for (unsigned int i = 0; i < actual_trials; i++) 1680 { 1681 TRACE_NODE(node1, "pass"); 1682 trace_add_data("partition_count", partition_count); 1683 trace_add_data("partition_index", partition_indices[i]); 1684 trace_add_data("plane_count", 1); 1685 trace_add_data("search_mode", i); 1686 1687 float errorval = compress_symbolic_block_for_partition_1plane( 1688 ctx.config.privateProfile, 1689 ctx.config, bsd, blk, false, 1690 error_threshold * errorval_overshoot, 1691 partition_count, partition_indices[i], 1692 scb, tmpbuf, quant_limit); 1693 1694 best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval); 1695 1696 // If using N partitions doesn't improve much over using N-1 partitions then skip trying 1697 // N+1. Error can dramatically improve if the data is correlated or non-correlated and 1698 // aligns with a partitioning that suits that encoding, so for this inner loop check add 1699 // a large error scale because the "other" trial could be a lot better. 
1700 float best_error = best_errorvals_for_pcount[partition_count - 1]; 1701 float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f; 1702 if (best_error > (best_error_in_prev * best_error_scale)) 1703 { 1704 trace_add_data("skip", "tune_partition_early_out_limit_factor"); 1705 goto END_OF_TESTS; 1706 } 1707 1708 if (errorval < error_threshold) 1709 { 1710 trace_add_data("exit", "quality hit"); 1711 goto END_OF_TESTS; 1712 } 1713 } 1714 1715 // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1 1716 float best_error = best_errorvals_for_pcount[partition_count - 1]; 1717 float best_error_scale = exit_thresholds_for_pcount[partition_count - 1]; 1718 if (best_error > (best_error_in_prev * best_error_scale)) 1719 { 1720 trace_add_data("skip", "tune_partition_early_out_limit_factor"); 1721 goto END_OF_TESTS; 1722 } 1723 } 1724 1725 trace_add_data("exit", "quality not hit"); 1726 1727END_OF_TESTS: 1728 // If we still have an error block then convert to something we can encode 1729 // TODO: Do something more sensible here, such as average color block 1730 if (scb.block_type == SYM_BTYPE_ERROR) 1731 { 1732#if defined(ASTCENC_DIAGNOSTICS) 1733 static bool printed_once = false; 1734 if (!printed_once) 1735 { 1736 printed_once = true; 1737 printf("WARN: At least one block failed to find a valid encoding.\n" 1738 " Try increasing compression quality settings.\n\n"); 1739 } 1740#endif 1741 1742 scb.block_type = SYM_BTYPE_CONST_U16; 1743 vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f; 1744 vint4 color_u16 = float_to_int_rtn(color_f32); 1745 store(color_u16, scb.constant_color); 1746 } 1747 1748 // Compress to a physical block 1749 scb.privateProfile = ctx.config.privateProfile; 1750 symbolic_to_physical(bsd, scb, pcb); 1751#if QUALITY_CONTROL 1752 if (calQualityEnable) { 1753 image_block decBlk = blk; 1754 decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk); 1755 vint4 colorSumDiff = vint4::zero(); 1756 for (size_t ii = 0; ii < bsd.texel_count; ii++) { 1757 vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f); 1758 vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f); 1759 vint4 colorDiff = colorRef - colorTest; 1760 colorSumDiff += colorDiff * colorDiff; 1761 } 1762 *mseBlock[R_COM] = colorSumDiff.lane<0>(); 1763 *mseBlock[G_COM] = colorSumDiff.lane<1>(); 1764 *mseBlock[B_COM] = colorSumDiff.lane<2>(); 1765 *mseBlock[A_COM] = colorSumDiff.lane<3>(); 1766 } 1767#endif 1768} 1769 1770#endif 1771