1// SPDX-License-Identifier: Apache-2.0 2// ---------------------------------------------------------------------------- 3// Copyright 2011-2022 Arm Limited 4// 5// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6// use this file except in compliance with the License. You may obtain a copy 7// of the License at: 8// 9// http://www.apache.org/licenses/LICENSE-2.0 10// 11// Unless required by applicable law or agreed to in writing, software 12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14// License for the specific language governing permissions and limitations 15// under the License. 16// ---------------------------------------------------------------------------- 17 18#if !defined(ASTCENC_DECOMPRESS_ONLY) 19 20/** 21 * @brief Functions for finding best endpoint format. 22 * 23 * We assume there are two independent sources of error in any given partition: 24 * 25 * - Encoding choice errors 26 * - Quantization errors 27 * 28 * Encoding choice errors are caused by encoder decisions. For example: 29 * 30 * - Using luminance instead of separate RGB components. 31 * - Using a constant 1.0 alpha instead of storing an alpha component. 32 * - Using RGB+scale instead of storing two full RGB endpoints. 33 * 34 * Quantization errors occur due to the limited precision we use for storage. These errors generally 35 * scale with quantization level, but are not actually independent of color encoding. In particular: 36 * 37 * - If we can use offset encoding then quantization error is halved. 38 * - If we can use blue-contraction then quantization error for RG is halved. 39 * - If we use HDR endpoints the quantization error is higher. 40 * 41 * Apart from these effects, we assume the error is proportional to the quantization step size. 
*/


#include "astcenc_internal.h"
#include "astcenc_vecmathlib.h"

#include <assert.h>

/**
 * @brief Compute the errors of the endpoint line options for one partition.
 *
 * Uncorrelated data assumes storing completely independent RGBA channels for each endpoint. Same
 * chroma data assumes storing RGBA endpoints which pass through the origin (LDR only). RGBL data
 * assumes storing RGB + lumashift (HDR only). Luminance error assumes storing RGB channels as a
 * single value.
 *
 *
 * @param      pi                The partition info data.
 * @param      partition_index   The partition index to compute the error for.
 * @param      blk               The image block.
 * @param      uncor_pline       The endpoint line assuming uncorrelated endpoints.
 * @param[out] uncor_err         The computed error for the uncorrelated endpoint line.
 * @param      samec_pline       The endpoint line assuming the same chroma for both endpoints.
 * @param[out] samec_err         The computed error for the same chroma endpoint line.
 * @param      rgbl_pline        The endpoint line assuming RGB + lumashift data.
 * @param[out] rgbl_err          The computed error for the RGB + lumashift endpoint line.
 * @param      l_pline           The endpoint line assuming luminance data.
 * @param[out] l_err             The computed error for the luminance endpoint line.
 * @param[out] a_drop_err        The computed error for dropping the alpha component.
71 */ 72static void compute_error_squared_rgb_single_partition( 73 const partition_info& pi, 74 int partition_index, 75 const image_block& blk, 76 const processed_line3& uncor_pline, 77 float& uncor_err, 78 const processed_line3& samec_pline, 79 float& samec_err, 80 const processed_line3& rgbl_pline, 81 float& rgbl_err, 82 const processed_line3& l_pline, 83 float& l_err, 84 float& a_drop_err 85) { 86 vfloat4 ews = blk.channel_weight; 87 88 unsigned int texel_count = pi.partition_texel_count[partition_index]; 89 const uint8_t* texel_indexes = pi.texels_of_partition[partition_index]; 90 promise(texel_count > 0); 91 92 vfloatacc a_drop_errv = vfloatacc::zero(); 93 vfloat default_a(blk.get_default_alpha()); 94 95 vfloatacc uncor_errv = vfloatacc::zero(); 96 vfloat uncor_bs0(uncor_pline.bs.lane<0>()); 97 vfloat uncor_bs1(uncor_pline.bs.lane<1>()); 98 vfloat uncor_bs2(uncor_pline.bs.lane<2>()); 99 100 vfloat uncor_amod0(uncor_pline.amod.lane<0>()); 101 vfloat uncor_amod1(uncor_pline.amod.lane<1>()); 102 vfloat uncor_amod2(uncor_pline.amod.lane<2>()); 103 104 vfloatacc samec_errv = vfloatacc::zero(); 105 vfloat samec_bs0(samec_pline.bs.lane<0>()); 106 vfloat samec_bs1(samec_pline.bs.lane<1>()); 107 vfloat samec_bs2(samec_pline.bs.lane<2>()); 108 109 vfloatacc rgbl_errv = vfloatacc::zero(); 110 vfloat rgbl_bs0(rgbl_pline.bs.lane<0>()); 111 vfloat rgbl_bs1(rgbl_pline.bs.lane<1>()); 112 vfloat rgbl_bs2(rgbl_pline.bs.lane<2>()); 113 114 vfloat rgbl_amod0(rgbl_pline.amod.lane<0>()); 115 vfloat rgbl_amod1(rgbl_pline.amod.lane<1>()); 116 vfloat rgbl_amod2(rgbl_pline.amod.lane<2>()); 117 118 vfloatacc l_errv = vfloatacc::zero(); 119 vfloat l_bs0(l_pline.bs.lane<0>()); 120 vfloat l_bs1(l_pline.bs.lane<1>()); 121 vfloat l_bs2(l_pline.bs.lane<2>()); 122 123 vfloat one_third(1/3.0f, 1/3.0f, 1/3.0f, 1/3.0f); 124 vfloat uncor_errv0 = vfloat::zero(); 125 vfloat uncor_errv1 = vfloat::zero(); 126 vfloat uncor_errv2 = vfloat::zero(); 127 vfloat samec_errv0 = vfloat::zero(); 128 vfloat 
samec_errv1 = vfloat::zero(); 129 vfloat samec_errv2 = vfloat::zero(); 130 vfloat rgbl_errv0 = vfloat::zero(); 131 vfloat rgbl_errv1 = vfloat::zero(); 132 vfloat rgbl_errv2 = vfloat::zero(); 133 vfloat l_errv0 = vfloat::zero(); 134 vfloat l_errv1 = vfloat::zero(); 135 vfloat l_errv2 = vfloat::zero(); 136 137 unsigned int i = 0; 138 for (; i + ASTCENC_SIMD_WIDTH <= texel_count; i += ASTCENC_SIMD_WIDTH) 139 { 140#ifdef ASTCENC_USE_COMMON_GATHERF 141 const uint8_t* tix = texel_indexes + i; 142#else 143 vint tix(texel_indexes + i); 144#endif 145 146 // Compute the error that arises from just ditching alpha 147 vfloat data_a = gatherf(blk.data_a, tix); 148 vfloat alpha_diff = data_a - default_a; 149 alpha_diff = alpha_diff * alpha_diff; 150 151 haccumulate(a_drop_errv, alpha_diff); 152 153 vfloat data_r = gatherf(blk.data_r, tix); 154 vfloat data_g = gatherf(blk.data_g, tix); 155 vfloat data_b = gatherf(blk.data_b, tix); 156 157 vfloat data_rgb_avg = (data_r + data_g + data_b) * one_third; 158 vfloat data_rgb_0 = data_rgb_avg - data_r; 159 vfloat data_rgb_1 = data_rgb_avg - data_g; 160 vfloat data_rgb_2 = data_rgb_avg - data_b; 161 162 // Compute uncorrelated error 163 vfloat param = data_r * uncor_bs0 164 + data_g * uncor_bs1 165 + data_b * uncor_bs2; 166 167 vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r; 168 vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g; 169 vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b; 170 171 haccumulate(uncor_errv0, dist0 * dist0); 172 haccumulate(uncor_errv1, dist1 * dist1); 173 haccumulate(uncor_errv2, dist2 * dist2); 174 175 // Compute same chroma error - no "amod", its always zero 176 param = data_r * samec_bs0 177 + data_g * samec_bs1 178 + data_b * samec_bs2; 179 180 dist0 = (param * samec_bs0) - data_r; 181 dist1 = (param * samec_bs1) - data_g; 182 dist2 = (param * samec_bs2) - data_b; 183 184 haccumulate(uncor_errv0, dist0 * dist0); 185 haccumulate(uncor_errv1, dist1 * dist1); 186 
haccumulate(uncor_errv2, dist2 * dist2); 187 188 // Compute rgbl error 189 dist0 = rgbl_amod0 + data_rgb_0; 190 dist1 = rgbl_amod1 + data_rgb_1; 191 dist2 = rgbl_amod2 + data_rgb_2; 192 193 haccumulate(rgbl_errv0, dist0 * dist0); 194 haccumulate(rgbl_errv1, dist1 * dist1); 195 haccumulate(rgbl_errv2, dist2 * dist2); 196 197 // Compute luma error - no "amod", its always zero 198 dist0 = data_rgb_0; 199 dist1 = data_rgb_1; 200 dist2 = data_rgb_2; 201 202 haccumulate(l_errv0, dist0 * dist0); 203 haccumulate(l_errv1, dist1 * dist1); 204 haccumulate(l_errv2, dist2 * dist2); 205 } 206 207 uncor_errv = uncor_errv0 * ews.lane<0>() + uncor_errv1 * ews.lane<1>() + uncor_errv2 * ews.lane<2>(); // channel 0,1,2 208 samec_errv = samec_errv0 * ews.lane<0>() + samec_errv1 * ews.lane<1>() + samec_errv2 * ews.lane<2>(); // channel 0,1,2 209 rgbl_errv = rgbl_errv0 * ews.lane<0>() + rgbl_errv1 * ews.lane<1>() + rgbl_errv2 * ews.lane<2>(); // channel 0,1,2 210 l_errv = l_errv0 * ews.lane<0>() + l_errv1 * ews.lane<1>() + l_errv2 * ews.lane<2>(); // channel 0,1,2 211 212 if (i < texel_count) 213 { 214 vint lane_ids = vint::lane_id() + i; 215 vint tix(texel_indexes + i); 216 217 vmask mask = lane_ids < vint(texel_count); 218 lane_ids += vint(ASTCENC_SIMD_WIDTH); 219 220 // Compute the error that arises from just ditching alpha 221 vfloat data_a = gatherf(blk.data_a, tix); 222 vfloat alpha_diff = data_a - default_a; 223 alpha_diff = alpha_diff * alpha_diff; 224 225 haccumulate(a_drop_errv, alpha_diff, mask); 226 227 vfloat data_r = gatherf(blk.data_r, tix); 228 vfloat data_g = gatherf(blk.data_g, tix); 229 vfloat data_b = gatherf(blk.data_b, tix); 230 231 vfloat data_rgb_avg = (data_r + data_g + data_b) * one_third; 232 vfloat data_rgb_0 = data_rgb_avg - data_r; 233 vfloat data_rgb_1 = data_rgb_avg - data_g; 234 vfloat data_rgb_2 = data_rgb_avg - data_b; 235 236 // Compute uncorrelated error 237 vfloat param = data_r * uncor_bs0 238 + data_g * uncor_bs1 239 + data_b * uncor_bs2; 240 241 
vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r; 242 vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g; 243 vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b; 244 245 vfloat error = dist0 * dist0 * ews.lane<0>() 246 + dist1 * dist1 * ews.lane<1>() 247 + dist2 * dist2 * ews.lane<2>(); 248 249 haccumulate(uncor_errv, error, mask); 250 251 // Compute same chroma error - no "amod", its always zero 252 param = data_r * samec_bs0 253 + data_g * samec_bs1 254 + data_b * samec_bs2; 255 256 dist0 = (param * samec_bs0) - data_r; 257 dist1 = (param * samec_bs1) - data_g; 258 dist2 = (param * samec_bs2) - data_b; 259 260 error = dist0 * dist0 * ews.lane<0>() 261 + dist1 * dist1 * ews.lane<1>() 262 + dist2 * dist2 * ews.lane<2>(); 263 264 haccumulate(samec_errv, error, mask); 265 266 // Compute rgbl error 267 dist0 = rgbl_amod0 + data_rgb_0; 268 dist1 = rgbl_amod1 + data_rgb_1; 269 dist2 = rgbl_amod2 + data_rgb_2; 270 271 error = dist0 * dist0 * ews.lane<0>() 272 + dist1 * dist1 * ews.lane<1>() 273 + dist2 * dist2 * ews.lane<2>(); 274 275 haccumulate(rgbl_errv, error, mask); 276 277 // Compute luma error - no "amod", its always zero 278 dist0 = data_rgb_0; 279 dist1 = data_rgb_1; 280 dist2 = data_rgb_2; 281 282 error = dist0 * dist0 * ews.lane<0>() 283 + dist1 * dist1 * ews.lane<1>() 284 + dist2 * dist2 * ews.lane<2>(); 285 286 haccumulate(l_errv, error, mask); 287 } 288 289 a_drop_err = hadd_s(a_drop_errv) * ews.lane<3>(); 290 uncor_err = hadd_s(uncor_errv); 291 samec_err = hadd_s(samec_errv); 292 rgbl_err = hadd_s(rgbl_errv); 293 l_err = hadd_s(l_errv); 294} 295 296/** 297 * @brief For a given set of input colors and partitioning determine endpoint encode errors. 298 * 299 * This function determines the color error that results from RGB-scale encoding (LDR only), 300 * RGB-lumashift encoding (HDR only), luminance-encoding, and alpha drop. 
Also determines whether 301 * the endpoints are eligible for offset encoding or blue-contraction 302 * 303 * @param blk The image block. 304 * @param pi The partition info data. 305 * @param ep The idealized endpoints. 306 * @param[out] eci The resulting encoding choice error metrics. 307 */ 308static void compute_encoding_choice_errors( 309 QualityProfile privateProfile, 310 const image_block& blk, 311 const partition_info& pi, 312 const endpoints& ep, 313 encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]) 314{ 315 int partition_count = pi.partition_count; 316 promise(partition_count > 0); 317 318 partition_metrics *pms = reinterpret_cast<partition_metrics *>(&blk.pms[0]); 319 320 if (!blk.is_constant_channel(3) || (partition_count != 1 && privateProfile == HIGH_QUALITY_PROFILE)) 321 { 322 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); 323 } 324 325 for (int i = 0; i < partition_count; i++) 326 { 327 partition_metrics& pm = pms[i]; 328 329 line3 uncor_rgb_lines; 330 line3 samec_rgb_lines; // for LDR-RGB-scale 331 line3 rgb_luma_lines; // for HDR-RGB-scale 332 333 processed_line3 uncor_rgb_plines; 334 processed_line3 samec_rgb_plines; 335 processed_line3 rgb_luma_plines; 336 processed_line3 luminance_plines; 337 338 float uncorr_rgb_error; 339 float samechroma_rgb_error; 340 float rgb_luma_error; 341 float luminance_rgb_error; 342 float alpha_drop_error; 343 344 uncor_rgb_lines.a = pm.avg; 345 uncor_rgb_lines.b = normalize_safe(pm.dir, unit3()); 346 347 samec_rgb_lines.a = vfloat4::zero(); 348 samec_rgb_lines.b = normalize_safe(pm.avg, unit3()); 349 350 rgb_luma_lines.a = pm.avg; 351 rgb_luma_lines.b = unit3(); 352 353 uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b); 354 uncor_rgb_plines.bs = uncor_rgb_lines.b; 355 356 // Same chroma always goes though zero, so this is simpler than the others 357 samec_rgb_plines.amod = vfloat4::zero(); 358 samec_rgb_plines.bs = samec_rgb_lines.b; 359 360 
rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b); 361 rgb_luma_plines.bs = rgb_luma_lines.b; 362 363 // Luminance always goes though zero, so this is simpler than the others 364 luminance_plines.amod = vfloat4::zero(); 365 luminance_plines.bs = unit3(); 366 367 compute_error_squared_rgb_single_partition( 368 pi, i, blk, 369 uncor_rgb_plines, uncorr_rgb_error, 370 samec_rgb_plines, samechroma_rgb_error, 371 rgb_luma_plines, rgb_luma_error, 372 luminance_plines, luminance_rgb_error, 373 alpha_drop_error); 374 375 // Determine if we can offset encode RGB lanes 376 vfloat4 endpt0 = ep.endpt0[i]; 377 vfloat4 endpt1 = ep.endpt1[i]; 378 vfloat4 endpt_diff = abs(endpt1 - endpt0); 379 vmask4 endpt_can_offset = endpt_diff < vfloat4(0.12f * 65535.0f); 380 bool can_offset_encode = (mask(endpt_can_offset) & 0x7) == 0x7; 381 382 // Store out the settings 383 eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical 384 eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * 1.5f; // wild guess 385 eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical 386 eci[i].alpha_drop_error = alpha_drop_error * 3.0f; 387 eci[i].can_offset_encode = can_offset_encode; 388 eci[i].can_blue_contract = !blk.is_luminance(); 389 } 390} 391 392/** 393 * @brief For a given partition compute the error for every endpoint integer count and quant level. 394 * 395 * @param encode_hdr_rgb @c true if using HDR for RGB, @c false for LDR. 396 * @param encode_hdr_alpha @c true if using HDR for alpha, @c false for LDR. 397 * @param partition_index The partition index. 398 * @param pi The partition info. 399 * @param eci The encoding choice error metrics. 400 * @param ep The idealized endpoints. 401 * @param error_weight The resulting encoding choice error metrics. 402 * @param[out] best_error The best error for each integer count and quant level. 
403 * @param[out] format_of_choice The preferred endpoint format for each integer count and quant level. 404 */ 405static void compute_color_error_for_every_integer_count_and_quant_level( 406 bool encode_hdr_rgb, 407 bool encode_hdr_alpha, 408 int partition_index, 409 const partition_info& pi, 410 const encoding_choice_errors& eci, 411 const endpoints& ep, 412 vfloat4 error_weight, 413 float best_error[21][4], 414 uint8_t format_of_choice[21][4] 415) { 416 int partition_size = pi.partition_texel_count[partition_index]; 417 418 static const float baseline_quant_error[21 - QUANT_6] { 419 (65536.0f * 65536.0f / 18.0f) / (5 * 5), 420 (65536.0f * 65536.0f / 18.0f) / (7 * 7), 421 (65536.0f * 65536.0f / 18.0f) / (9 * 9), 422 (65536.0f * 65536.0f / 18.0f) / (11 * 11), 423 (65536.0f * 65536.0f / 18.0f) / (15 * 15), 424 (65536.0f * 65536.0f / 18.0f) / (19 * 19), 425 (65536.0f * 65536.0f / 18.0f) / (23 * 23), 426 (65536.0f * 65536.0f / 18.0f) / (31 * 31), 427 (65536.0f * 65536.0f / 18.0f) / (39 * 39), 428 (65536.0f * 65536.0f / 18.0f) / (47 * 47), 429 (65536.0f * 65536.0f / 18.0f) / (63 * 63), 430 (65536.0f * 65536.0f / 18.0f) / (79 * 79), 431 (65536.0f * 65536.0f / 18.0f) / (95 * 95), 432 (65536.0f * 65536.0f / 18.0f) / (127 * 127), 433 (65536.0f * 65536.0f / 18.0f) / (159 * 159), 434 (65536.0f * 65536.0f / 18.0f) / (191 * 191), 435 (65536.0f * 65536.0f / 18.0f) / (255 * 255) 436 }; 437 438 vfloat4 ep0 = ep.endpt0[partition_index]; 439 vfloat4 ep1 = ep.endpt1[partition_index]; 440 441 float ep1_min = hmin_rgb_s(ep1); 442 ep1_min = astc::max(ep1_min, 0.0f); 443 444 float error_weight_rgbsum = hadd_rgb_s(error_weight); 445 446 float range_upper_limit_rgb = encode_hdr_rgb ? 61440.0f : 65535.0f; 447 float range_upper_limit_alpha = encode_hdr_alpha ? 
61440.0f : 65535.0f; 448 449 // It is possible to get endpoint colors significantly outside [0,upper-limit] even if the 450 // input data are safely contained in [0,upper-limit]; we need to add an error term for this 451 vfloat4 offset(range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_alpha); 452 vfloat4 ep0_range_error_high = max(ep0 - offset, 0.0f); 453 vfloat4 ep1_range_error_high = max(ep1 - offset, 0.0f); 454 455 vfloat4 ep0_range_error_low = min(ep0, 0.0f); 456 vfloat4 ep1_range_error_low = min(ep1, 0.0f); 457 458 vfloat4 sum_range_error = 459 (ep0_range_error_low * ep0_range_error_low) + 460 (ep1_range_error_low * ep1_range_error_low) + 461 (ep0_range_error_high * ep0_range_error_high) + 462 (ep1_range_error_high * ep1_range_error_high); 463 464 float rgb_range_error = dot3_s(sum_range_error, error_weight) 465 * 0.5f * static_cast<float>(partition_size); 466 float alpha_range_error = sum_range_error.lane<3>() * error_weight.lane<3>() 467 * 0.5f * static_cast<float>(partition_size); 468 469 if (encode_hdr_rgb) 470 { 471 472 // Collect some statistics 473 float af, cf; 474 if (ep1.lane<0>() > ep1.lane<1>() && ep1.lane<0>() > ep1.lane<2>()) 475 { 476 af = ep1.lane<0>(); 477 cf = ep1.lane<0>() - ep0.lane<0>(); 478 } 479 else if (ep1.lane<1>() > ep1.lane<2>()) 480 { 481 af = ep1.lane<1>(); 482 cf = ep1.lane<1>() - ep0.lane<1>(); 483 } 484 else 485 { 486 af = ep1.lane<2>(); 487 cf = ep1.lane<2>() - ep0.lane<2>(); 488 } 489 490 // Estimate of color-component spread in high endpoint color 491 float bf = af - ep1_min; 492 vfloat4 prd = (ep1 - vfloat4(cf)).swz<0, 1, 2>(); 493 vfloat4 pdif = prd - ep0.swz<0, 1, 2>(); 494 // Estimate of color-component spread in low endpoint color 495 float df = hmax_s(abs(pdif)); 496 497 int b = static_cast<int>(bf); 498 int c = static_cast<int>(cf); 499 int d = static_cast<int>(df); 500 501 // Determine which one of the 6 submodes is likely to be used in case of an RGBO-mode 502 int rgbo_mode = 5; 
// 7 bits per component 503 // mode 4: 8 7 6 504 if (b < 32768 && c < 16384) 505 { 506 rgbo_mode = 4; 507 } 508 509 // mode 3: 9 6 7 510 if (b < 8192 && c < 16384) 511 { 512 rgbo_mode = 3; 513 } 514 515 // mode 2: 10 5 8 516 if (b < 2048 && c < 16384) 517 { 518 rgbo_mode = 2; 519 } 520 521 // mode 1: 11 6 5 522 if (b < 2048 && c < 1024) 523 { 524 rgbo_mode = 1; 525 } 526 527 // mode 0: 11 5 7 528 if (b < 1024 && c < 4096) 529 { 530 rgbo_mode = 0; 531 } 532 533 // Determine which one of the 9 submodes is likely to be used in case of an RGB-mode. 534 int rgb_mode = 8; // 8 bits per component, except 7 bits for blue 535 536 // mode 0: 9 7 6 7 537 if (b < 16384 && c < 8192 && d < 8192) 538 { 539 rgb_mode = 0; 540 } 541 542 // mode 1: 9 8 6 6 543 if (b < 32768 && c < 8192 && d < 4096) 544 { 545 rgb_mode = 1; 546 } 547 548 // mode 2: 10 6 7 7 549 if (b < 4096 && c < 8192 && d < 4096) 550 { 551 rgb_mode = 2; 552 } 553 554 // mode 3: 10 7 7 6 555 if (b < 8192 && c < 8192 && d < 2048) 556 { 557 rgb_mode = 3; 558 } 559 560 // mode 4: 11 8 6 5 561 if (b < 8192 && c < 2048 && d < 512) 562 { 563 rgb_mode = 4; 564 } 565 566 // mode 5: 11 6 8 6 567 if (b < 2048 && c < 8192 && d < 1024) 568 { 569 rgb_mode = 5; 570 } 571 572 // mode 6: 12 7 7 5 573 if (b < 2048 && c < 2048 && d < 256) 574 { 575 rgb_mode = 6; 576 } 577 578 // mode 7: 12 6 7 6 579 if (b < 1024 && c < 2048 && d < 512) 580 { 581 rgb_mode = 7; 582 } 583 584 static const float rgbo_error_scales[6] { 4.0f, 4.0f, 16.0f, 64.0f, 256.0f, 1024.0f }; 585 static const float rgb_error_scales[9] { 64.0f, 64.0f, 16.0f, 16.0f, 4.0f, 4.0f, 1.0f, 1.0f, 384.0f }; 586 587 float mode7mult = rgbo_error_scales[rgbo_mode] * 0.0015f; // Empirically determined .... 588 float mode11mult = rgb_error_scales[rgb_mode] * 0.010f; // Empirically determined .... 589 590 591 float lum_high = hadd_rgb_s(ep1) * (1.0f / 3.0f); 592 float lum_low = hadd_rgb_s(ep0) * (1.0f / 3.0f); 593 float lumdif = lum_high - lum_low; 594 float mode23mult = lumdif < 960 ? 
4.0f : lumdif < 3968 ? 16.0f : 128.0f; 595 596 mode23mult *= 0.0005f; // Empirically determined .... 597 598 // Pick among the available HDR endpoint modes 599 for (int i = QUANT_2; i < QUANT_16; i++) 600 { 601 best_error[i][3] = ERROR_CALC_DEFAULT; 602 best_error[i][2] = ERROR_CALC_DEFAULT; 603 best_error[i][1] = ERROR_CALC_DEFAULT; 604 best_error[i][0] = ERROR_CALC_DEFAULT; 605 606 format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA); 607 format_of_choice[i][2] = FMT_HDR_RGB; 608 format_of_choice[i][1] = FMT_HDR_RGB_SCALE; 609 format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE; 610 } 611 612 for (int i = QUANT_16; i <= QUANT_256; i++) 613 { 614 // The base_quant_error should depend on the scale-factor that would be used during 615 // actual encode of the color value 616 617 float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size); 618 float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f; 619 float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f; 620 float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error; 621 622 // For 8 integers, we have two encodings: one with HDR A and another one with LDR A 623 624 float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error; 625 best_error[i][3] = full_hdr_rgba_error; 626 format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? 
FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA); 627 628 // For 6 integers, we have one HDR-RGB encoding 629 float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error; 630 best_error[i][2] = full_hdr_rgb_error; 631 format_of_choice[i][2] = FMT_HDR_RGB; 632 633 // For 4 integers, we have one HDR-RGB-Scale encoding 634 float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci.alpha_drop_error + eci.rgb_luma_error; 635 636 best_error[i][1] = hdr_rgb_scale_error; 637 format_of_choice[i][1] = FMT_HDR_RGB_SCALE; 638 639 // For 2 integers, we assume luminance-with-large-range 640 float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci.alpha_drop_error + eci.luminance_error; 641 best_error[i][0] = hdr_luminance_error; 642 format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE; 643 } 644 } 645 else 646 { 647 for (int i = QUANT_2; i < QUANT_6; i++) 648 { 649 best_error[i][3] = ERROR_CALC_DEFAULT; 650 best_error[i][2] = ERROR_CALC_DEFAULT; 651 best_error[i][1] = ERROR_CALC_DEFAULT; 652 best_error[i][0] = ERROR_CALC_DEFAULT; 653 654 format_of_choice[i][3] = FMT_RGBA; 655 format_of_choice[i][2] = FMT_RGB; 656 format_of_choice[i][1] = FMT_RGB_SCALE; 657 format_of_choice[i][0] = FMT_LUMINANCE; 658 } 659 660 float base_quant_error_rgb = error_weight_rgbsum * static_cast<float>(partition_size); 661 float base_quant_error_a = error_weight.lane<3>() * static_cast<float>(partition_size); 662 float base_quant_error_rgba = base_quant_error_rgb + base_quant_error_a; 663 664 float error_scale_bc_rgba = eci.can_blue_contract ? 0.625f : 1.0f; 665 float error_scale_oe_rgba = eci.can_offset_encode ? 0.5f : 1.0f; 666 667 float error_scale_bc_rgb = eci.can_blue_contract ? 0.5f : 1.0f; 668 float error_scale_oe_rgb = eci.can_offset_encode ? 
0.25f : 1.0f; 669 670 // Pick among the available LDR endpoint modes 671 for (int i = QUANT_6; i <= QUANT_256; i++) 672 { 673 // Offset encoding not possible at higher quant levels 674 if (i >= QUANT_192) 675 { 676 error_scale_oe_rgba = 1.0f; 677 error_scale_oe_rgb = 1.0f; 678 } 679 680 float base_quant_error = baseline_quant_error[i - QUANT_6]; 681 float quant_error_rgb = base_quant_error_rgb * base_quant_error; 682 float quant_error_rgba = base_quant_error_rgba * base_quant_error; 683 684 // 8 integers can encode as RGBA+RGBA 685 float full_ldr_rgba_error = quant_error_rgba 686 * error_scale_bc_rgba 687 * error_scale_oe_rgba 688 + rgb_range_error 689 + alpha_range_error; 690 691 best_error[i][3] = full_ldr_rgba_error; 692 format_of_choice[i][3] = FMT_RGBA; 693 694 // 6 integers can encode as RGB+RGB or RGBS+AA 695 float full_ldr_rgb_error = quant_error_rgb 696 * error_scale_bc_rgb 697 * error_scale_oe_rgb 698 + rgb_range_error 699 + eci.alpha_drop_error; 700 701 float rgbs_alpha_error = quant_error_rgba 702 + eci.rgb_scale_error 703 + rgb_range_error 704 + alpha_range_error; 705 706 if (rgbs_alpha_error < full_ldr_rgb_error) 707 { 708 best_error[i][2] = rgbs_alpha_error; 709 format_of_choice[i][2] = FMT_RGB_SCALE_ALPHA; 710 } 711 else 712 { 713 best_error[i][2] = full_ldr_rgb_error; 714 format_of_choice[i][2] = FMT_RGB; 715 } 716 717 // 4 integers can encode as RGBS or LA+LA 718 float ldr_rgbs_error = quant_error_rgb 719 + rgb_range_error 720 + eci.alpha_drop_error 721 + eci.rgb_scale_error; 722 723 float lum_alpha_error = quant_error_rgba 724 + rgb_range_error 725 + alpha_range_error 726 + eci.luminance_error; 727 728 if (ldr_rgbs_error < lum_alpha_error) 729 { 730 best_error[i][1] = ldr_rgbs_error; 731 format_of_choice[i][1] = FMT_RGB_SCALE; 732 } 733 else 734 { 735 best_error[i][1] = lum_alpha_error; 736 format_of_choice[i][1] = FMT_LUMINANCE_ALPHA; 737 } 738 739 // 2 integers can encode as L+L 740 float luminance_error = quant_error_rgb 741 + rgb_range_error 
742 + eci.alpha_drop_error 743 + eci.luminance_error; 744 745 best_error[i][0] = luminance_error; 746 format_of_choice[i][0] = FMT_LUMINANCE; 747 } 748 } 749} 750 751/** 752 * @brief For one partition compute the best format and quantization for a given bit count. 753 * 754 * @param best_combined_error The best error for each quant level and integer count. 755 * @param best_combined_format The best format for each quant level and integer count. 756 * @param bits_available The number of bits available for encoding. 757 * @param[out] best_quant_level The output best color quant level. 758 * @param[out] best_format The output best color format. 759 * 760 * @return The output error for the best pairing. 761 */ 762static float one_partition_find_best_combination_for_bitcount( 763 QualityProfile privateProfile, 764 const float best_combined_error[21][4], 765 const uint8_t best_combined_format[21][4], 766 int bits_available, 767 uint8_t& best_quant_level, 768 uint8_t& best_format 769) { 770 int best_integer_count = 0; 771 float best_integer_count_error = ERROR_CALC_DEFAULT; 772 773 for (int integer_count = 1; integer_count <= 4; integer_count++) 774 { 775 if (privateProfile != HIGH_QUALITY_PROFILE) 776 { 777 integer_count = 4; // constant 4 bit count for HIGH_SPEED_PROFILE mode 778 } 779 // Compute the quantization level for a given number of integers and a given number of bits 780 int quant_level = quant_mode_table[integer_count][bits_available]; 781 782 // Don't have enough bits to represent a given endpoint format at all! 
783 if (quant_level < QUANT_6) 784 { 785 continue; 786 } 787 788 float integer_count_error = best_combined_error[quant_level][integer_count - 1]; 789 if (integer_count_error < best_integer_count_error) 790 { 791 best_integer_count_error = integer_count_error; 792 best_integer_count = integer_count - 1; 793 } 794 } 795 796 int ql = quant_mode_table[best_integer_count + 1][bits_available]; 797 798 best_quant_level = static_cast<uint8_t>(ql); 799 if (privateProfile != HIGH_QUALITY_PROFILE) // keep openSource code style 800 { 801 best_format = FMT_RGBA; 802 } 803 else 804 { 805 best_format = FMT_LUMINANCE; 806 807 if (ql >= QUANT_6) 808 { 809 best_format = best_combined_format[ql][best_integer_count]; 810 } 811 } 812 813 return best_integer_count_error; 814} 815 816/** 817 * @brief For 2 partitions compute the best format combinations for every pair of quant mode and integer count. 818 * 819 * @param best_error The best error for a single endpoint quant level and integer count. 820 * @param best_format The best format for a single endpoint quant level and integer count. 821 * @param[out] best_combined_error The best combined error pairings for the 2 partitions. 822 * @param[out] best_combined_format The best combined format pairings for the 2 partitions. 
823 */ 824static void two_partitions_find_best_combination_for_every_quantization_and_integer_count( 825 const float best_error[2][21][4], // indexed by (partition, quant-level, integer-pair-count-minus-1) 826 const uint8_t best_format[2][21][4], 827 float best_combined_error[21][7], // indexed by (quant-level, integer-pair-count-minus-2) 828 uint8_t best_combined_format[21][7][2] 829) { 830 for (int i = QUANT_2; i <= QUANT_256; i++) 831 { 832 for (int j = 0; j < 7; j++) 833 { 834 best_combined_error[i][j] = ERROR_CALC_DEFAULT; 835 } 836 } 837 838 for (int quant = QUANT_6; quant <= QUANT_256; quant++) 839 { 840 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair 841 { 842 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair 843 { 844 int low2 = astc::min(i, j); 845 int high2 = astc::max(i, j); 846 if ((high2 - low2) > 1) 847 { 848 continue; 849 } 850 851 int intcnt = i + j; 852 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j], 1e10f); 853 if (errorterm <= best_combined_error[quant][intcnt]) 854 { 855 best_combined_error[quant][intcnt] = errorterm; 856 best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; 857 best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; 858 } 859 } 860 } 861 } 862} 863 864/** 865 * @brief For 2 partitions compute the best format and quantization for a given bit count. 866 * 867 * @param best_combined_error The best error for each quant level and integer count. 868 * @param best_combined_format The best format for each quant level and integer count. 869 * @param bits_available The number of bits available for encoding. 870 * @param[out] best_quant_level The output best color quant level. 871 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available. 872 * @param[out] best_formats The output best color formats. 873 * 874 * @return The output error for the best pairing. 
875 */ 876static float two_partitions_find_best_combination_for_bitcount( 877 unsigned int privateProfile, 878 float best_combined_error[21][7], 879 uint8_t best_combined_format[21][7][2], 880 int bits_available, 881 uint8_t& best_quant_level, 882 uint8_t& best_quant_level_mod, 883 uint8_t* best_formats 884) { 885 int best_integer_count = 0; 886 float best_integer_count_error = ERROR_CALC_DEFAULT; 887 int integer_count = 2; 888 if (privateProfile != HIGH_QUALITY_PROFILE) 889 { 890 integer_count = 8; // constant 8 bit count 891 } 892 893 for (; integer_count <= 8; integer_count++) 894 { 895 // Compute the quantization level for a given number of integers and a given number of bits 896 int quant_level = quant_mode_table[integer_count][bits_available]; 897 898 // Don't have enough bits to represent a given endpoint format at all! 899 if (quant_level < QUANT_6) 900 { 901 break; 902 } 903 904 float integer_count_error = best_combined_error[quant_level][integer_count - 2]; 905 if (integer_count_error < best_integer_count_error) 906 { 907 best_integer_count_error = integer_count_error; 908 best_integer_count = integer_count; 909 } 910 } 911 912 int ql = quant_mode_table[best_integer_count][bits_available]; 913 int ql_mod = quant_mode_table[best_integer_count][bits_available + 2]; 914 915 best_quant_level = static_cast<uint8_t>(ql); 916 best_quant_level_mod = static_cast<uint8_t>(ql_mod); 917 918 if (ql >= QUANT_6) 919 { 920 for (int i = 0; i < 2; i++) 921 { 922 best_formats[i] = best_combined_format[ql][best_integer_count - 2][i]; 923 } 924 } 925 else 926 { 927 for (int i = 0; i < 2; i++) 928 { 929 best_formats[i] = FMT_LUMINANCE; 930 } 931 } 932 933 return best_integer_count_error; 934} 935 936/** 937 * @brief For 3 partitions compute the best format combinations for every pair of quant mode and integer count. 938 * 939 * @param best_error The best error for a single endpoint quant level and integer count. 
940 * @param best_format The best format for a single endpoint quant level and integer count. 941 * @param[out] best_combined_error The best combined error pairings for the 3 partitions. 942 * @param[out] best_combined_format The best combined format pairings for the 3 partitions. 943 */ 944static void three_partitions_find_best_combination_for_every_quantization_and_integer_count( 945 const float best_error[3][21][4], // indexed by (partition, quant-level, integer-count) 946 const uint8_t best_format[3][21][4], 947 float best_combined_error[21][10], 948 uint8_t best_combined_format[21][10][3] 949) { 950 for (int i = QUANT_2; i <= QUANT_256; i++) 951 { 952 for (int j = 0; j < 10; j++) 953 { 954 best_combined_error[i][j] = ERROR_CALC_DEFAULT; 955 } 956 } 957 958 for (int quant = QUANT_6; quant <= QUANT_256; quant++) 959 { 960 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair 961 { 962 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair 963 { 964 int low2 = astc::min(i, j); 965 int high2 = astc::max(i, j); 966 if ((high2 - low2) > 1) 967 { 968 continue; 969 } 970 971 for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair 972 { 973 int low3 = astc::min(k, low2); 974 int high3 = astc::max(k, high2); 975 if ((high3 - low3) > 1) 976 { 977 continue; 978 } 979 980 int intcnt = i + j + k; 981 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k], 1e10f); 982 if (errorterm <= best_combined_error[quant][intcnt]) 983 { 984 best_combined_error[quant][intcnt] = errorterm; 985 best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; 986 best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; 987 best_combined_format[quant][intcnt][2] = best_format[2][quant][k]; 988 } 989 } 990 } 991 } 992 } 993} 994 995/** 996 * @brief For 3 partitions compute the best format and quantization for a given bit count. 
997 * 998 * @param best_combined_error The best error for each quant level and integer count. 999 * @param best_combined_format The best format for each quant level and integer count. 1000 * @param bits_available The number of bits available for encoding. 1001 * @param[out] best_quant_level The output best color quant level. 1002 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available. 1003 * @param[out] best_formats The output best color formats. 1004 * 1005 * @return The output error for the best pairing. 1006 */ 1007static float three_partitions_find_best_combination_for_bitcount( 1008 const float best_combined_error[21][10], 1009 const uint8_t best_combined_format[21][10][3], 1010 int bits_available, 1011 uint8_t& best_quant_level, 1012 uint8_t& best_quant_level_mod, 1013 uint8_t* best_formats 1014) { 1015 int best_integer_count = 0; 1016 float best_integer_count_error = ERROR_CALC_DEFAULT; 1017 1018 for (int integer_count = 3; integer_count <= 9; integer_count++) 1019 { 1020 // Compute the quantization level for a given number of integers and a given number of bits 1021 int quant_level = quant_mode_table[integer_count][bits_available]; 1022 1023 // Don't have enough bits to represent a given endpoint format at all! 
1024 if (quant_level < QUANT_6) 1025 { 1026 break; 1027 } 1028 1029 float integer_count_error = best_combined_error[quant_level][integer_count - 3]; 1030 if (integer_count_error < best_integer_count_error) 1031 { 1032 best_integer_count_error = integer_count_error; 1033 best_integer_count = integer_count; 1034 } 1035 } 1036 1037 int ql = quant_mode_table[best_integer_count][bits_available]; 1038 int ql_mod = quant_mode_table[best_integer_count][bits_available + 5]; 1039 1040 best_quant_level = static_cast<uint8_t>(ql); 1041 best_quant_level_mod = static_cast<uint8_t>(ql_mod); 1042 1043 if (ql >= QUANT_6) 1044 { 1045 for (int i = 0; i < 3; i++) 1046 { 1047 best_formats[i] = best_combined_format[ql][best_integer_count - 3][i]; 1048 } 1049 } 1050 else 1051 { 1052 for (int i = 0; i < 3; i++) 1053 { 1054 best_formats[i] = FMT_LUMINANCE; 1055 } 1056 } 1057 1058 return best_integer_count_error; 1059} 1060 1061/** 1062 * @brief For 4 partitions compute the best format combinations for every pair of quant mode and integer count. 1063 * 1064 * @param best_error The best error for a single endpoint quant level and integer count. 1065 * @param best_format The best format for a single endpoint quant level and integer count. 1066 * @param[out] best_combined_error The best combined error pairings for the 4 partitions. 1067 * @param[out] best_combined_format The best combined format pairings for the 4 partitions. 
1068 */ 1069static void four_partitions_find_best_combination_for_every_quantization_and_integer_count( 1070 const float best_error[4][21][4], // indexed by (partition, quant-level, integer-count) 1071 const uint8_t best_format[4][21][4], 1072 float best_combined_error[21][13], 1073 uint8_t best_combined_format[21][13][4] 1074) { 1075 for (int i = QUANT_2; i <= QUANT_256; i++) 1076 { 1077 for (int j = 0; j < 13; j++) 1078 { 1079 best_combined_error[i][j] = ERROR_CALC_DEFAULT; 1080 } 1081 } 1082 1083 for (int quant = QUANT_6; quant <= QUANT_256; quant++) 1084 { 1085 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair 1086 { 1087 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair 1088 { 1089 int low2 = astc::min(i, j); 1090 int high2 = astc::max(i, j); 1091 if ((high2 - low2) > 1) 1092 { 1093 continue; 1094 } 1095 1096 for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair 1097 { 1098 int low3 = astc::min(k, low2); 1099 int high3 = astc::max(k, high2); 1100 if ((high3 - low3) > 1) 1101 { 1102 continue; 1103 } 1104 1105 for (int l = 0; l < 4; l++) // integer-count for fourth endpoint-pair 1106 { 1107 int low4 = astc::min(l, low3); 1108 int high4 = astc::max(l, high3); 1109 if ((high4 - low4) > 1) 1110 { 1111 continue; 1112 } 1113 1114 int intcnt = i + j + k + l; 1115 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k] + best_error[3][quant][l], 1e10f); 1116 if (errorterm <= best_combined_error[quant][intcnt]) 1117 { 1118 best_combined_error[quant][intcnt] = errorterm; 1119 best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; 1120 best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; 1121 best_combined_format[quant][intcnt][2] = best_format[2][quant][k]; 1122 best_combined_format[quant][intcnt][3] = best_format[3][quant][l]; 1123 } 1124 } 1125 } 1126 } 1127 } 1128 } 1129} 1130 1131/** 1132 * @brief For 4 partitions compute the best format and 
quantization for a given bit count. 1133 * 1134 * @param best_combined_error The best error for each quant level and integer count. 1135 * @param best_combined_format The best format for each quant level and integer count. 1136 * @param bits_available The number of bits available for encoding. 1137 * @param[out] best_quant_level The output best color quant level. 1138 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available. 1139 * @param[out] best_formats The output best color formats. 1140 * 1141 * @return best_error The output error for the best pairing. 1142 */ 1143static float four_partitions_find_best_combination_for_bitcount( 1144 const float best_combined_error[21][13], 1145 const uint8_t best_combined_format[21][13][4], 1146 int bits_available, 1147 uint8_t& best_quant_level, 1148 uint8_t& best_quant_level_mod, 1149 uint8_t* best_formats 1150) { 1151 int best_integer_count = 0; 1152 float best_integer_count_error = ERROR_CALC_DEFAULT; 1153 1154 for (int integer_count = 4; integer_count <= 9; integer_count++) 1155 { 1156 // Compute the quantization level for a given number of integers and a given number of bits 1157 int quant_level = quant_mode_table[integer_count][bits_available]; 1158 1159 // Don't have enough bits to represent a given endpoint format at all! 
1160 if (quant_level < QUANT_6) 1161 { 1162 break; 1163 } 1164 1165 float integer_count_error = best_combined_error[quant_level][integer_count - 4]; 1166 if (integer_count_error < best_integer_count_error) 1167 { 1168 best_integer_count_error = integer_count_error; 1169 best_integer_count = integer_count; 1170 } 1171 } 1172 1173 int ql = quant_mode_table[best_integer_count][bits_available]; 1174 int ql_mod = quant_mode_table[best_integer_count][bits_available + 8]; 1175 1176 best_quant_level = static_cast<uint8_t>(ql); 1177 best_quant_level_mod = static_cast<uint8_t>(ql_mod); 1178 1179 if (ql >= QUANT_6) 1180 { 1181 for (int i = 0; i < 4; i++) 1182 { 1183 best_formats[i] = best_combined_format[ql][best_integer_count - 4][i]; 1184 } 1185 } 1186 else 1187 { 1188 for (int i = 0; i < 4; i++) 1189 { 1190 best_formats[i] = FMT_LUMINANCE; 1191 } 1192 } 1193 1194 return best_integer_count_error; 1195} 1196 1197/* See header for documentation. */ 1198unsigned int compute_ideal_endpoint_formats( 1199 QualityProfile privateProfile, 1200 const partition_info& pi, 1201 const image_block& blk, 1202 const endpoints& ep, 1203 // bitcounts and errors computed for the various quantization methods 1204 const int8_t* qwt_bitcounts, 1205 const float* qwt_errors, 1206 unsigned int tune_candidate_limit, 1207 unsigned int start_block_mode, 1208 unsigned int end_block_mode, 1209 // output data 1210 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS], 1211 int block_mode[TUNE_MAX_TRIAL_CANDIDATES], 1212 quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES], 1213 quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES], 1214 compression_working_buffers& tmpbuf 1215) { 1216 int partition_count = pi.partition_count; 1217 1218 promise(partition_count > 0); 1219 1220 bool encode_hdr_rgb = static_cast<bool>(blk.rgb_lns[0]); 1221 bool encode_hdr_alpha = static_cast<bool>(blk.alpha_lns[0]); 1222 1223 // Compute the errors that result from various encoding choices (such 
as using luminance instead 1224 // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on) 1225 encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]; 1226 compute_encoding_choice_errors(privateProfile, blk, pi, ep, eci); 1227 1228 float best_error[BLOCK_MAX_PARTITIONS][21][4]; 1229 uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4]; 1230 for (int i = 0; i < partition_count; i++) 1231 { 1232 compute_color_error_for_every_integer_count_and_quant_level( 1233 encode_hdr_rgb, encode_hdr_alpha, i, 1234 pi, eci[i], ep, blk.channel_weight, best_error[i], 1235 format_of_choice[i]); 1236 } 1237 1238 float* errors_of_best_combination = tmpbuf.errors_of_best_combination; 1239 uint8_t* best_quant_levels = tmpbuf.best_quant_levels; 1240 uint8_t* best_quant_levels_mod = tmpbuf.best_quant_levels_mod; 1241 uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats; 1242 1243 // Ensure that the first iteration understep contains data that will never be picked 1244 vfloat clear_error(ERROR_CALC_DEFAULT); 1245 vint clear_quant(0); 1246 1247 unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode); 1248 storea(clear_error, errors_of_best_combination + packed_start_block_mode); 1249 store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode); 1250 store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode); 1251 1252 // Ensure that last iteration overstep contains data that will never be picked 1253 unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1); 1254 storea(clear_error, errors_of_best_combination + packed_end_block_mode); 1255 store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode); 1256 store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode); 1257 1258 // Track a scalar best to avoid expensive search at least once ... 
1259 float error_of_best_combination = ERROR_CALC_DEFAULT; 1260 int index_of_best_combination = -1; 1261 1262 // The block contains 1 partition 1263 if (partition_count == 1) 1264 { 1265 for (unsigned int i = start_block_mode; i < end_block_mode; i++) 1266 { 1267 if (qwt_errors[i] >= ERROR_CALC_DEFAULT) 1268 { 1269 errors_of_best_combination[i] = ERROR_CALC_DEFAULT; 1270 continue; 1271 } 1272 1273 float error_of_best = one_partition_find_best_combination_for_bitcount( 1274 privateProfile, 1275 best_error[0], format_of_choice[0], qwt_bitcounts[i], 1276 best_quant_levels[i], best_ep_formats[i][0]); 1277 1278 float total_error = error_of_best + qwt_errors[i]; 1279 errors_of_best_combination[i] = total_error; 1280 best_quant_levels_mod[i] = best_quant_levels[i]; 1281 1282 if (total_error < error_of_best_combination) 1283 { 1284 error_of_best_combination = total_error; 1285 index_of_best_combination = i; 1286 } 1287 } 1288 } 1289 // The block contains 2 partitions 1290 else if (partition_count == 2) 1291 { 1292 float combined_best_error[21][7]; 1293 uint8_t formats_of_choice[21][7][2]; 1294 1295 two_partitions_find_best_combination_for_every_quantization_and_integer_count( 1296 best_error, format_of_choice, combined_best_error, formats_of_choice); 1297 1298 assert(start_block_mode == 0); 1299 for (unsigned int i = 0; i < end_block_mode; i++) 1300 { 1301 if (qwt_errors[i] >= ERROR_CALC_DEFAULT) 1302 { 1303 errors_of_best_combination[i] = ERROR_CALC_DEFAULT; 1304 continue; 1305 } 1306 1307 float error_of_best = two_partitions_find_best_combination_for_bitcount( 1308 privateProfile, 1309 combined_best_error, formats_of_choice, qwt_bitcounts[i], 1310 best_quant_levels[i], best_quant_levels_mod[i], 1311 best_ep_formats[i]); 1312 1313 float total_error = error_of_best + qwt_errors[i]; 1314 errors_of_best_combination[i] = total_error; 1315 1316 if (total_error < error_of_best_combination) 1317 { 1318 error_of_best_combination = total_error; 1319 index_of_best_combination = i; 
1320 } 1321 } 1322 } 1323 // The block contains 3 partitions 1324 else if (partition_count == 3) 1325 { 1326 float combined_best_error[21][10]; 1327 uint8_t formats_of_choice[21][10][3]; 1328 1329 three_partitions_find_best_combination_for_every_quantization_and_integer_count( 1330 best_error, format_of_choice, combined_best_error, formats_of_choice); 1331 1332 assert(start_block_mode == 0); 1333 for (unsigned int i = 0; i < end_block_mode; i++) 1334 { 1335 if (qwt_errors[i] >= ERROR_CALC_DEFAULT) 1336 { 1337 errors_of_best_combination[i] = ERROR_CALC_DEFAULT; 1338 continue; 1339 } 1340 1341 float error_of_best = three_partitions_find_best_combination_for_bitcount( 1342 combined_best_error, formats_of_choice, qwt_bitcounts[i], 1343 best_quant_levels[i], best_quant_levels_mod[i], 1344 best_ep_formats[i]); 1345 1346 float total_error = error_of_best + qwt_errors[i]; 1347 errors_of_best_combination[i] = total_error; 1348 1349 if (total_error < error_of_best_combination) 1350 { 1351 error_of_best_combination = total_error; 1352 index_of_best_combination = i; 1353 } 1354 } 1355 } 1356 // The block contains 4 partitions 1357 else // if (partition_count == 4) 1358 { 1359 assert(partition_count == 4); 1360 float combined_best_error[21][13]; 1361 uint8_t formats_of_choice[21][13][4]; 1362 1363 four_partitions_find_best_combination_for_every_quantization_and_integer_count( 1364 best_error, format_of_choice, combined_best_error, formats_of_choice); 1365 1366 assert(start_block_mode == 0); 1367 for (unsigned int i = 0; i < end_block_mode; i++) 1368 { 1369 if (qwt_errors[i] >= ERROR_CALC_DEFAULT) 1370 { 1371 errors_of_best_combination[i] = ERROR_CALC_DEFAULT; 1372 continue; 1373 } 1374 1375 float error_of_best = four_partitions_find_best_combination_for_bitcount( 1376 combined_best_error, formats_of_choice, qwt_bitcounts[i], 1377 best_quant_levels[i], best_quant_levels_mod[i], 1378 best_ep_formats[i]); 1379 1380 float total_error = error_of_best + qwt_errors[i]; 1381 
errors_of_best_combination[i] = total_error; 1382 1383 if (total_error < error_of_best_combination) 1384 { 1385 error_of_best_combination = total_error; 1386 index_of_best_combination = i; 1387 } 1388 } 1389 } 1390 1391 int best_error_weights[TUNE_MAX_TRIAL_CANDIDATES]; 1392 1393 // Fast path the first result and avoid the list search for trial 0 1394 best_error_weights[0] = index_of_best_combination; 1395 if (index_of_best_combination >= 0) 1396 { 1397 errors_of_best_combination[index_of_best_combination] = ERROR_CALC_DEFAULT; 1398 } 1399 1400 // Search the remaining results and pick the best candidate modes for trial 1+ 1401 for (unsigned int i = 1; i < tune_candidate_limit; i++) 1402 { 1403 vint vbest_error_index(-1); 1404 vfloat vbest_ep_error(ERROR_CALC_DEFAULT); 1405 1406 start_block_mode = round_down_to_simd_multiple_vla(start_block_mode); 1407 vint lane_ids = vint::lane_id() + vint(start_block_mode); 1408 for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH) 1409 { 1410 vfloat err = vfloat(errors_of_best_combination + j); 1411 vmask mask = err < vbest_ep_error; 1412 vbest_ep_error = select(vbest_ep_error, err, mask); 1413 vbest_error_index = select(vbest_error_index, lane_ids, mask); 1414 lane_ids += vint(ASTCENC_SIMD_WIDTH); 1415 } 1416 1417 // Pick best mode from the SIMD result, using lowest matching index to ensure invariance 1418 vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error); 1419 vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error); 1420 vbest_error_index = hmin(vbest_error_index); 1421 int best_error_index = vbest_error_index.lane<0>(); 1422 1423 best_error_weights[i] = best_error_index; 1424 1425 // Max the error for this candidate so we don't pick it again 1426 if (best_error_index >= 0) 1427 { 1428 errors_of_best_combination[best_error_index] = ERROR_CALC_DEFAULT; 1429 } 1430 // Early-out if no more candidates are valid 1431 else 1432 { 1433 break; 1434 } 1435 } 1436 1437 
for (unsigned int i = 0; i < tune_candidate_limit; i++) 1438 { 1439 if (best_error_weights[i] < 0) 1440 { 1441 return i; 1442 } 1443 1444 block_mode[i] = best_error_weights[i]; 1445 1446 quant_level[i] = static_cast<quant_method>(best_quant_levels[best_error_weights[i]]); 1447 quant_level_mod[i] = static_cast<quant_method>(best_quant_levels_mod[best_error_weights[i]]); 1448 1449 assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256); 1450 assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256); 1451 1452 for (int j = 0; j < partition_count; j++) 1453 { 1454 partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j]; 1455 } 1456 } 1457 1458 return tune_candidate_limit; 1459} 1460 1461#endif 1462