1// SPDX-License-Identifier: Apache-2.0 2// ---------------------------------------------------------------------------- 3// Copyright 2011-2024 Arm Limited 4// 5// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6// use this file except in compliance with the License. You may obtain a copy 7// of the License at: 8// 9// http://www.apache.org/licenses/LICENSE-2.0 10// 11// Unless required by applicable law or agreed to in writing, software 12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14// License for the specific language governing permissions and limitations 15// under the License. 16// ---------------------------------------------------------------------------- 17 18#if !defined(ASTCENC_DECOMPRESS_ONLY) 19 20/** 21 * @brief Functions for computing color endpoints and texel weights. 22 */ 23 24#include <cassert> 25 26#include "astcenc_internal.h" 27#include "astcenc_vecmathlib.h" 28 29/** 30 * @brief Compute the infilled weight for N texel indices in a decimated grid. 31 * 32 * @param di The weight grid decimation to use. 33 * @param weights The decimated weight values to use. 34 * @param index The first texel index to interpolate. 35 * 36 * @return The interpolated weight for the given set of SIMD_WIDTH texels. 37 */ 38static vfloat bilinear_infill_vla( 39 const decimation_info& di, 40 const float* weights, 41 unsigned int index 42) { 43 // Load the bilinear filter texel weight indexes in the decimated grid 44 vint weight_idx0 = vint(di.texel_weights_tr[0] + index); 45 vint weight_idx1 = vint(di.texel_weights_tr[1] + index); 46 vint weight_idx2 = vint(di.texel_weights_tr[2] + index); 47 vint weight_idx3 = vint(di.texel_weights_tr[3] + index); 48 49 // Load the bilinear filter weights from the decimated grid 50 vfloat weight_val0 = gatherf(weights, weight_idx0); 51 vfloat weight_val1 = gatherf(weights, weight_idx1); 52 vfloat weight_val2 = gatherf(weights, weight_idx2); 53 vfloat weight_val3 = gatherf(weights, weight_idx3); 54 55 // Load the weight contribution factors for each decimated weight 56 vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); 57 vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index); 58 vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index); 59 vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index); 60 61 // Compute the bilinear interpolation to generate the per-texel weight 62 return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) + 63 (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3); 64} 65 66/** 67 * @brief Compute the infilled weight for N texel indices in a decimated grid. 68 * 69 * This is specialized version which computes only two weights per texel for 70 * encodings that are only decimated in a single axis. 71 * 72 * @param di The weight grid decimation to use. 73 * @param weights The decimated weight values to use. 74 * @param index The first texel index to interpolate. 75 * 76 * @return The interpolated weight for the given set of SIMD_WIDTH texels. 77 */ 78static vfloat bilinear_infill_vla_2( 79 const decimation_info& di, 80 const float* weights, 81 unsigned int index 82) { 83 // Load the bilinear filter texel weight indexes in the decimated grid 84 vint weight_idx0 = vint(di.texel_weights_tr[0] + index); 85 vint weight_idx1 = vint(di.texel_weights_tr[1] + index); 86 87 // Load the bilinear filter weights from the decimated grid 88 vfloat weight_val0 = gatherf(weights, weight_idx0); 89 vfloat weight_val1 = gatherf(weights, weight_idx1); 90 91 // Load the weight contribution factors for each decimated weight 92 vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); 93 vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index); 94 95 // Compute the bilinear interpolation to generate the per-texel weight 96 return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1); 97} 98 99/** 100 * @brief Compute the ideal endpoints and weights for 1 color component. 101 * 102 * @param blk The image block color data to compress. 103 * @param pi The partition info for the current trial. 104 * @param[out] ei The computed ideal endpoints and weights. 105 * @param component The color component to compute. 106 */ 107static void compute_ideal_colors_and_weights_1_comp( 108 const image_block& blk, 109 const partition_info& pi, 110 endpoints_and_weights& ei, 111 unsigned int component 112) { 113 unsigned int partition_count = pi.partition_count; 114 ei.ep.partition_count = partition_count; 115 promise(partition_count > 0); 116 117 unsigned int texel_count = blk.texel_count; 118 promise(texel_count > 0); 119 120 float error_weight; 121 const float* data_vr = nullptr; 122 123 assert(component < BLOCK_MAX_COMPONENTS); 124 switch (component) 125 { 126 case 0: 127 error_weight = blk.channel_weight.lane<0>(); 128 data_vr = blk.data_r; 129 break; 130 case 1: 131 error_weight = blk.channel_weight.lane<1>(); 132 data_vr = blk.data_g; 133 break; 134 case 2: 135 error_weight = blk.channel_weight.lane<2>(); 136 data_vr = blk.data_b; 137 break; 138 default: 139 assert(component == 3); 140 error_weight = blk.channel_weight.lane<3>(); 141 data_vr = blk.data_a; 142 break; 143 } 144 145 vmask4 sep_mask = vint4::lane_id() == vint4(component); 146 bool is_constant_wes { true }; 147 float partition0_len_sq { 0.0f }; 148 149 for (unsigned int i = 0; i < partition_count; i++) 150 { 151 float lowvalue { 1e10f }; 152 float highvalue { -1e10f }; 153 154 unsigned int partition_texel_count = pi.partition_texel_count[i]; 155 for (unsigned int j = 0; j < partition_texel_count; j++) 156 { 157 unsigned int tix = pi.texels_of_partition[i][j]; 158 float value = data_vr[tix]; 159 lowvalue = astc::min(value, lowvalue); 160 highvalue = astc::max(value, highvalue); 161 } 162 163 if (highvalue <= lowvalue) 164 { 165 lowvalue = 0.0f; 166 highvalue = 1e-7f; 167 } 168 169 float length = highvalue - lowvalue; 170 float length_squared = length * length; 171 float scale = 1.0f / length; 172 173 if (i == 0) 174 { 175 partition0_len_sq = length_squared; 176 } 177 else 178 { 179 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; 180 } 181 182 for (unsigned int j = 0; j < partition_texel_count; j++) 183 { 184 unsigned int tix = pi.texels_of_partition[i][j]; 185 float value = (data_vr[tix] - lowvalue) * scale; 186 value = astc::clamp1f(value); 187 188 ei.weights[tix] = value; 189 ei.weight_error_scale[tix] = length_squared * error_weight; 190 assert(!astc::isnan(ei.weight_error_scale[tix])); 191 } 192 193 ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask); 194 ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask); 195 } 196 197 // Zero initialize any SIMD over-fetch 198 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); 199 for (unsigned int i = texel_count; i < texel_count_simd; i++) 200 { 201 ei.weights[i] = 0.0f; 202 ei.weight_error_scale[i] = 0.0f; 203 } 204 205 ei.is_constant_weight_error_scale = is_constant_wes; 206} 207 208/** 209 * @brief Compute the ideal endpoints and weights for 2 color components. 210 * 211 * @param blk The image block color data to compress. 212 * @param pi The partition info for the current trial. 213 * @param[out] ei The computed ideal endpoints and weights. 214 * @param component1 The first color component to compute. 215 * @param component2 The second color component to compute. 216 */ 217static void compute_ideal_colors_and_weights_2_comp( 218 const image_block& blk, 219 const partition_info& pi, 220 endpoints_and_weights& ei, 221 int component1, 222 int component2 223) { 224 unsigned int partition_count = pi.partition_count; 225 ei.ep.partition_count = partition_count; 226 promise(partition_count > 0); 227 228 unsigned int texel_count = blk.texel_count; 229 promise(texel_count > 0); 230 231 partition_metrics pms[BLOCK_MAX_PARTITIONS]; 232 233 float error_weight; 234 const float* data_vr = nullptr; 235 const float* data_vg = nullptr; 236 237 if (component1 == 0 && component2 == 1) 238 { 239 error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f; 240 241 data_vr = blk.data_r; 242 data_vg = blk.data_g; 243 } 244 else if (component1 == 0 && component2 == 2) 245 { 246 error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f; 247 248 data_vr = blk.data_r; 249 data_vg = blk.data_b; 250 } 251 else // (component1 == 1 && component2 == 2) 252 { 253 assert(component1 == 1 && component2 == 2); 254 255 error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f; 256 257 data_vr = blk.data_g; 258 data_vg = blk.data_b; 259 } 260 261 compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms); 262 263 bool is_constant_wes { true }; 264 float partition0_len_sq { 0.0f }; 265 266 vmask4 comp1_mask = vint4::lane_id() == vint4(component1); 267 vmask4 comp2_mask = vint4::lane_id() == vint4(component2); 268 269 for (unsigned int i = 0; i < partition_count; i++) 270 { 271 vfloat4 dir = pms[i].dir; 272 if (hadd_s(dir) < 0.0f) 273 { 274 dir = vfloat4::zero() - dir; 275 } 276 277 line2 line { pms[i].avg, normalize_safe(dir, unit2()) }; 278 float lowparam { 1e10f }; 279 float highparam { -1e10f }; 280 281 unsigned int partition_texel_count = pi.partition_texel_count[i]; 282 for (unsigned int j = 0; j < partition_texel_count; j++) 283 { 284 unsigned int tix = pi.texels_of_partition[i][j]; 285 vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]); 286 float param = dot_s(point - line.a, line.b); 287 ei.weights[tix] = param; 288 289 lowparam = astc::min(param, lowparam); 290 highparam = astc::max(param, highparam); 291 } 292 293 // It is possible for a uniform-color partition to produce length=0; 294 // this causes NaN issues so set to small value to avoid this problem 295 if (highparam <= lowparam) 296 { 297 lowparam = 0.0f; 298 highparam = 1e-7f; 299 } 300 301 float length = highparam - lowparam; 302 float length_squared = length * length; 303 float scale = 1.0f / length; 304 305 if (i == 0) 306 { 307 partition0_len_sq = length_squared; 308 } 309 else 310 { 311 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; 312 } 313 314 for (unsigned int j = 0; j < partition_texel_count; j++) 315 { 316 unsigned int tix = pi.texels_of_partition[i][j]; 317 float idx = (ei.weights[tix] - lowparam) * scale; 318 idx = astc::clamp1f(idx); 319 320 ei.weights[tix] = idx; 321 ei.weight_error_scale[tix] = length_squared * error_weight; 322 assert(!astc::isnan(ei.weight_error_scale[tix])); 323 } 324 325 vfloat4 lowvalue = line.a + line.b * lowparam; 326 vfloat4 highvalue = line.a + line.b * highparam; 327 328 vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask); 329 vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask); 330 331 ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask); 332 ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask); 333 } 334 335 // Zero initialize any SIMD over-fetch 336 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); 337 for (unsigned int i = texel_count; i < texel_count_simd; i++) 338 { 339 ei.weights[i] = 0.0f; 340 ei.weight_error_scale[i] = 0.0f; 341 } 342 343 ei.is_constant_weight_error_scale = is_constant_wes; 344} 345 346/** 347 * @brief Compute the ideal endpoints and weights for 3 color components. 348 * 349 * @param blk The image block color data to compress. 350 * @param pi The partition info for the current trial. 351 * @param[out] ei The computed ideal endpoints and weights. 352 * @param omitted_component The color component excluded from the calculation. 353 */ 354static void compute_ideal_colors_and_weights_3_comp( 355 const image_block& blk, 356 const partition_info& pi, 357 endpoints_and_weights& ei, 358 unsigned int omitted_component 359) { 360 unsigned int partition_count = pi.partition_count; 361 ei.ep.partition_count = partition_count; 362 promise(partition_count > 0); 363 364 unsigned int texel_count = blk.texel_count; 365 promise(texel_count > 0); 366 367 partition_metrics *pms = reinterpret_cast<partition_metrics *>(&blk.pms[0]); 368 369 float error_weight; 370 const float* data_vr = nullptr; 371 const float* data_vg = nullptr; 372 const float* data_vb = nullptr; 373 if (omitted_component == 0) 374 { 375 error_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()); 376 data_vr = blk.data_g; 377 data_vg = blk.data_b; 378 data_vb = blk.data_a; 379 } 380 else if (omitted_component == 1) 381 { 382 error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()); 383 data_vr = blk.data_r; 384 data_vg = blk.data_b; 385 data_vb = blk.data_a; 386 } 387 else if (omitted_component == 2) 388 { 389 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()); 390 data_vr = blk.data_r; 391 data_vg = blk.data_g; 392 data_vb = blk.data_a; 393 } 394 else 395 { 396 assert(omitted_component == 3); 397 398 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); 399 data_vr = blk.data_r; 400 data_vg = blk.data_g; 401 data_vb = blk.data_b; 402 } 403 404 error_weight = error_weight * (1.0f / 3.0f); 405 406 if (omitted_component == 3) 407 { 408 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); 409 } 410 else 411 { 412 compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms); 413 } 414 415 bool is_constant_wes { true }; 416 float partition0_len_sq { 0.0f }; 417 418 for (unsigned int i = 0; i < partition_count; i++) 419 { 420 vfloat4 dir = pms[i].dir; 421 if (hadd_rgb_s(dir) < 0.0f) 422 { 423 dir = vfloat4::zero() - dir; 424 } 425 426 line3 line { pms[i].avg, normalize_safe(dir, unit3()) }; 427 float lowparam { 1e10f }; 428 float highparam { -1e10f }; 429 430 unsigned int partition_texel_count = pi.partition_texel_count[i]; 431 432 vfloat4 lowparam_vec = vfloat4(1e10f, 1e10f, 1e10f, 1e10f); 433 vfloat4 highparam_vec = vfloat4(-1e10f, -1e10f, -1e10f, -1e10f); 434 435 unsigned int j = 0; 436 for (; j + ASTCENC_SIMD_WIDTH <= partition_texel_count; j += ASTCENC_SIMD_WIDTH) 437 { 438 unsigned int tix0 = pi.texels_of_partition[i][j]; 439 unsigned int tix1 = pi.texels_of_partition[i][j + 1]; 440 unsigned int tix2 = pi.texels_of_partition[i][j + 2]; 441 unsigned int tix3 = pi.texels_of_partition[i][j + 3]; 442 443 vfloat4 points0 = vfloat4(data_vr[tix0], data_vg[tix0], data_vb[tix0], 0.0f); 444 vfloat4 points1 = vfloat4(data_vr[tix1], data_vg[tix1], data_vb[tix1], 0.0f); 445 vfloat4 points2 = vfloat4(data_vr[tix2], data_vg[tix2], data_vb[tix2], 0.0f); 446 vfloat4 points3 = vfloat4(data_vr[tix3], data_vg[tix3], data_vb[tix3], 0.0f); 447 448 vfloat4 sub_v0 = points0 - line.a; 449 vfloat4 sub_v1 = points1 - line.a; 450 vfloat4 sub_v2 = points2 - line.a; 451 vfloat4 sub_v3 = points3 - line.a; 452 453 vfloat4 params0 = sub_v0 * line.b; 454 vfloat4 params1 = sub_v1 * line.b; 455 vfloat4 params2 = sub_v2 * line.b; 456 vfloat4 params3 = sub_v3 * line.b; 457 458 float param0 = hadd_rgba_s(params0); 459 float param1 = hadd_rgba_s(params1); 460 float param2 = hadd_rgba_s(params2); 461 float param3 = hadd_rgba_s(params3); 462 463 ei.weights[tix0] = param0; 464 ei.weights[tix1] = param1; 465 ei.weights[tix2] = param2; 466 ei.weights[tix3] = param3; 467 468 vfloat4 params_vec = vfloat4(param0, param1, param2, param3); 469 lowparam_vec = min(params_vec, lowparam_vec); 470 highparam_vec = max(params_vec, highparam_vec); 471 } 472 473 lowparam = hmin_s(vfloat4(lowparam_vec)); 474 highparam = hmax_s(vfloat4(highparam_vec)); 475 476 for (; j < partition_texel_count; j++) 477 { 478 unsigned int tix = pi.texels_of_partition[i][j]; 479 vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]); 480 float param = dot3_s(point - line.a, line.b); 481 ei.weights[tix] = param; 482 483 lowparam = astc::min(param, lowparam); 484 highparam = astc::max(param, highparam); 485 } 486 487 // It is possible for a uniform-color partition to produce length=0; 488 // this causes NaN issues so set to small value to avoid this problem 489 if (highparam <= lowparam) 490 { 491 lowparam = 0.0f; 492 highparam = 1e-7f; 493 } 494 495 float length = highparam - lowparam; 496 float length_squared = length * length; 497 float scale = 1.0f / length; 498 499 if (i == 0) 500 { 501 partition0_len_sq = length_squared; 502 } 503 else 504 { 505 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; 506 } 507 508 for (j = 0; j < partition_texel_count; j++) 509 { 510 unsigned int tix = pi.texels_of_partition[i][j]; 511 float idx = (ei.weights[tix] - lowparam) * scale; 512 idx = astc::clamp1f(idx); 513 514 ei.weights[tix] = idx; 515 ei.weight_error_scale[tix] = length_squared * error_weight; 516 assert(!astc::isnan(ei.weight_error_scale[tix])); 517 } 518 519 vfloat4 ep0 = line.a + line.b * lowparam; 520 vfloat4 ep1 = line.a + line.b * highparam; 521 522 vfloat4 bmin = blk.data_min; 523 vfloat4 bmax = blk.data_max; 524 525 assert(omitted_component < BLOCK_MAX_COMPONENTS); 526 switch (omitted_component) 527 { 528 case 0: 529 ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>()); 530 ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>()); 531 break; 532 case 1: 533 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>()); 534 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>()); 535 break; 536 case 2: 537 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>()); 538 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>()); 539 break; 540 default: 541 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>()); 542 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>()); 543 break; 544 } 545 } 546 547 // Zero initialize any SIMD over-fetch 548 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); 549 for (unsigned int i = texel_count; i < texel_count_simd; i++) 550 { 551 ei.weights[i] = 0.0f; 552 ei.weight_error_scale[i] = 0.0f; 553 } 554 555 ei.is_constant_weight_error_scale = is_constant_wes; 556} 557 558/** 559 * @brief Compute the ideal endpoints and weights for 4 color components. 560 * 561 * @param blk The image block color data to compress. 562 * @param pi The partition info for the current trial. 563 * @param[out] ei The computed ideal endpoints and weights. 564 */ 565static void compute_ideal_colors_and_weights_4_comp( 566 const image_block& blk, 567 const partition_info& pi, 568 endpoints_and_weights& ei 569) { 570 const float error_weight = hadd_s(blk.channel_weight) / 4.0f; 571 572 unsigned int partition_count = pi.partition_count; 573 574 unsigned int texel_count = blk.texel_count; 575 promise(texel_count > 0); 576 promise(partition_count > 0); 577 578 partition_metrics pms[BLOCK_MAX_PARTITIONS]; 579 580 compute_avgs_and_dirs_4_comp(pi, blk, pms); 581 582 bool is_constant_wes { true }; 583 float partition0_len_sq { 0.0f }; 584 585 for (unsigned int i = 0; i < partition_count; i++) 586 { 587 vfloat4 dir = pms[i].dir; 588 if (hadd_rgb_s(dir) < 0.0f) 589 { 590 dir = vfloat4::zero() - dir; 591 } 592 593 line4 line { pms[i].avg, normalize_safe(dir, unit4()) }; 594 float lowparam { 1e10f }; 595 float highparam { -1e10f }; 596 597 unsigned int partition_texel_count = pi.partition_texel_count[i]; 598 for (unsigned int j = 0; j < partition_texel_count; j++) 599 { 600 unsigned int tix = pi.texels_of_partition[i][j]; 601 vfloat4 point = blk.texel(tix); 602 float param = dot_s(point - line.a, line.b); 603 ei.weights[tix] = param; 604 605 lowparam = astc::min(param, lowparam); 606 highparam = astc::max(param, highparam); 607 } 608 609 // It is possible for a uniform-color partition to produce length=0; 610 // this causes NaN issues so set to small value to avoid this problem 611 if (highparam <= lowparam) 612 { 613 lowparam = 0.0f; 614 highparam = 1e-7f; 615 } 616 617 float length = highparam - lowparam; 618 float length_squared = length * length; 619 float scale = 1.0f / length; 620 621 if (i == 0) 622 { 623 partition0_len_sq = length_squared; 624 } 625 else 626 { 627 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; 628 } 629 630 ei.ep.endpt0[i] = line.a + line.b * lowparam; 631 ei.ep.endpt1[i] = line.a + line.b * highparam; 632 633 for (unsigned int j = 0; j < partition_texel_count; j++) 634 { 635 unsigned int tix = pi.texels_of_partition[i][j]; 636 float idx = (ei.weights[tix] - lowparam) * scale; 637 idx = astc::clamp1f(idx); 638 639 ei.weights[tix] = idx; 640 ei.weight_error_scale[tix] = length_squared * error_weight; 641 assert(!astc::isnan(ei.weight_error_scale[tix])); 642 } 643 } 644 645 // Zero initialize any SIMD over-fetch 646 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); 647 for (unsigned int i = texel_count; i < texel_count_simd; i++) 648 { 649 ei.weights[i] = 0.0f; 650 ei.weight_error_scale[i] = 0.0f; 651 } 652 653 ei.is_constant_weight_error_scale = is_constant_wes; 654} 655 656/* See header for documentation. */ 657void compute_ideal_colors_and_weights_1plane( 658 const image_block& blk, 659 const partition_info& pi, 660 endpoints_and_weights& ei 661) { 662 bool uses_alpha = !blk.is_constant_channel(3); 663 664 if (uses_alpha) 665 { 666 compute_ideal_colors_and_weights_4_comp(blk, pi, ei); 667 } 668 else 669 { 670 compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3); 671 } 672} 673 674/* See header for documentation. */ 675void compute_ideal_colors_and_weights_2planes( 676 const block_size_descriptor& bsd, 677 const image_block& blk, 678 unsigned int plane2_component, 679 endpoints_and_weights& ei1, 680 endpoints_and_weights& ei2 681) { 682 const auto& pi = bsd.get_partition_info(1, 0); 683 bool uses_alpha = !blk.is_constant_channel(3); 684 685 assert(plane2_component < BLOCK_MAX_COMPONENTS); 686 switch (plane2_component) 687 { 688 case 0: // Separate weights for red 689 if (uses_alpha) 690 { 691 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0); 692 } 693 else 694 { 695 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2); 696 } 697 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0); 698 break; 699 700 case 1: // Separate weights for green 701 if (uses_alpha) 702 { 703 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1); 704 } 705 else 706 { 707 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2); 708 } 709 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1); 710 break; 711 712 case 2: // Separate weights for blue 713 if (uses_alpha) 714 { 715 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2); 716 } 717 else 718 { 719 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1); 720 } 721 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2); 722 break; 723 724 default: // Separate weights for alpha 725 assert(uses_alpha); 726 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3); 727 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3); 728 break; 729 } 730} 731 732/* See header for documentation. */ 733float compute_error_of_weight_set_1plane( 734 const endpoints_and_weights& eai, 735 const decimation_info& di, 736 const float* dec_weight_quant_uvalue 737) { 738 vfloatacc error_summav = vfloatacc::zero(); 739 unsigned int texel_count = di.texel_count; 740 promise(texel_count > 0); 741 742 // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized 743 if (di.max_texel_weight_count > 2) 744 { 745 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 746 { 747 // Compute the bilinear interpolation of the decimated weight grid 748 vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i); 749 750 // Compute the error between the computed value and the ideal weight 751 vfloat actual_values = loada(eai.weights + i); 752 vfloat diff = current_values - actual_values; 753 vfloat significance = loada(eai.weight_error_scale + i); 754 vfloat error = diff * diff * significance; 755 756 haccumulate(error_summav, error); 757 } 758 } 759 else if (di.max_texel_weight_count > 1) 760 { 761 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 762 { 763 // Compute the bilinear interpolation of the decimated weight grid 764 vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i); 765 766 // Compute the error between the computed value and the ideal weight 767 vfloat actual_values = loada(eai.weights + i); 768 vfloat diff = current_values - actual_values; 769 vfloat significance = loada(eai.weight_error_scale + i); 770 vfloat error = diff * diff * significance; 771 772 haccumulate(error_summav, error); 773 } 774 } 775 else 776 { 777 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 778 { 779 // Load the weight set directly, without interpolation 780 vfloat current_values = loada(dec_weight_quant_uvalue + i); 781 782 // Compute the error between the computed value and the ideal weight 783 vfloat actual_values = loada(eai.weights + i); 784 vfloat diff = current_values - actual_values; 785 vfloat significance = loada(eai.weight_error_scale + i); 786 vfloat error = diff * diff * significance; 787 788 haccumulate(error_summav, error); 789 } 790 } 791 792 // Resolve the final scalar accumulator sum 793 return hadd_s(error_summav); 794} 795 796/* See header for documentation. */ 797float compute_error_of_weight_set_2planes( 798 const endpoints_and_weights& eai1, 799 const endpoints_and_weights& eai2, 800 const decimation_info& di, 801 const float* dec_weight_quant_uvalue_plane1, 802 const float* dec_weight_quant_uvalue_plane2 803) { 804 vfloatacc error_summav = vfloatacc::zero(); 805 unsigned int texel_count = di.texel_count; 806 promise(texel_count > 0); 807 808 // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized 809 if (di.max_texel_weight_count > 2) 810 { 811 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 812 { 813 // Plane 1 814 // Compute the bilinear interpolation of the decimated weight grid 815 vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i); 816 817 // Compute the error between the computed value and the ideal weight 818 vfloat actual_values1 = loada(eai1.weights + i); 819 vfloat diff = current_values1 - actual_values1; 820 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); 821 822 // Plane 2 823 // Compute the bilinear interpolation of the decimated weight grid 824 vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i); 825 826 // Compute the error between the computed value and the ideal weight 827 vfloat actual_values2 = loada(eai2.weights + i); 828 diff = current_values2 - actual_values2; 829 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); 830 831 haccumulate(error_summav, error1 + error2); 832 } 833 } 834 else if (di.max_texel_weight_count > 1) 835 { 836 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 837 { 838 // Plane 1 839 // Compute the bilinear interpolation of the decimated weight grid 840 vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i); 841 842 // Compute the error between the computed value and the ideal weight 843 vfloat actual_values1 = loada(eai1.weights + i); 844 vfloat diff = current_values1 - actual_values1; 845 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); 846 847 // Plane 2 848 // Compute the bilinear interpolation of the decimated weight grid 849 vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i); 850 851 // Compute the error between the computed value and the ideal weight 852 vfloat actual_values2 = loada(eai2.weights + i); 853 diff = current_values2 - actual_values2; 854 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); 855 856 haccumulate(error_summav, error1 + error2); 857 } 858 } 859 else 860 { 861 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 862 { 863 // Plane 1 864 // Load the weight set directly, without interpolation 865 vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i); 866 867 // Compute the error between the computed value and the ideal weight 868 vfloat actual_values1 = loada(eai1.weights + i); 869 vfloat diff = current_values1 - actual_values1; 870 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); 871 872 // Plane 2 873 // Load the weight set directly, without interpolation 874 vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i); 875 876 // Compute the error between the computed value and the ideal weight 877 vfloat actual_values2 = loada(eai2.weights + i); 878 diff = current_values2 - actual_values2; 879 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); 880 881 haccumulate(error_summav, error1 + error2); 882 } 883 } 884 885 // Resolve the final scalar accumulator sum 886 return hadd_s(error_summav); 887} 888 889/* See header for documentation. */ 890void compute_ideal_weights_for_decimation( 891 const endpoints_and_weights& ei, 892 const decimation_info& di, 893 float* dec_weight_ideal_value 894) { 895 unsigned int texel_count = di.texel_count; 896 unsigned int weight_count = di.weight_count; 897 bool is_direct = texel_count == weight_count; 898 promise(texel_count > 0); 899 promise(weight_count > 0); 900 901 // Ensure that the end of the output arrays that are used for SIMD paths later are filled so we 902 // can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight 903 // arrays always contain space for 64 elements 904 unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1); 905 storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd); 906 907 // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the 908 // zero-initialized SIMD over-fetch region 909 if (is_direct) 910 { 911 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 912 { 913 vfloat weight(ei.weights + i); 914 storea(weight, dec_weight_ideal_value + i); 915 } 916 917 return; 918 } 919 920 // Otherwise compute an estimate and perform single refinement iteration 921 ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS]; 922 923 // Compute an initial average for each decimated weight 924 bool constant_wes = ei.is_constant_weight_error_scale; 925 vfloat weight_error_scale(ei.weight_error_scale[0]); 926 927 // This overshoots - this is OK as we initialize the array tails in the 928 // decimation table structures to safe values ... 929 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 930 { 931 // Start with a small value to avoid div-by-zero later 932 vfloat weight_weight(1e-10f); 933 vfloat initial_weight = vfloat::zero(); 934 935 // Accumulate error weighting of all the texels using this weight 936 vint weight_texel_count(di.weight_texel_count + i); 937 unsigned int max_texel_count = hmax(weight_texel_count).lane<0>(); 938 promise(max_texel_count > 0); 939 940 for (unsigned int j = 0; j < max_texel_count; j++) 941 { 942#ifdef ASTCENC_USE_COMMON_GATHERF 943 const uint8_t* texel = di.weight_texels_tr[j] + i; 944#else 945 vint texel(di.weight_texels_tr[j] + i); 946#endif 947 vfloat weight = loada(di.weights_texel_contribs_tr[j] + i); 948 949 if (!constant_wes) 950 { 951 weight_error_scale = gatherf(ei.weight_error_scale, texel); 952 } 953 954 vfloat contrib_weight = weight * weight_error_scale; 955 956 weight_weight += contrib_weight; 957 initial_weight += gatherf(ei.weights, texel) * contrib_weight; 958 } 959 960 storea(initial_weight / weight_weight, dec_weight_ideal_value + i); 961 } 962 963 // Populate the interpolated weight grid based on the initial average 964 // Process SIMD-width texel coordinates at at time while we can. Safe to 965 // over-process full SIMD vectors - the tail is zeroed. 966 if (di.max_texel_weight_count <= 2) 967 { 968 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 969 { 970 vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i); 971 storea(weight, infilled_weights + i); 972 } 973 } 974 else 975 { 976 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 977 { 978 vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i); 979 storea(weight, infilled_weights + i); 980 } 981 } 982 983 // Perform a single iteration of refinement 984 // Empirically determined step size; larger values don't help but smaller drops image quality 985 constexpr float stepsize = 0.25f; 986 constexpr float chd_scale = -WEIGHTS_TEXEL_SUM; 987 988 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 989 { 990 vfloat weight_val = loada(dec_weight_ideal_value + i); 991 992 // Accumulate error weighting of all the texels using this weight 993 // Start with a small value to avoid div-by-zero later 994 vfloat error_change0(1e-10f); 995 vfloat error_change1(0.0f); 996 997 // Accumulate error weighting of all the texels using this weight 998 vint weight_texel_count(di.weight_texel_count + i); 999 unsigned int max_texel_count = hmax(weight_texel_count).lane<0>(); 1000 promise(max_texel_count > 0); 1001 1002 for (unsigned int j = 0; j < max_texel_count; j++) 1003 { 1004#ifdef ASTCENC_USE_COMMON_GATHERF 1005 const uint8_t* texel = di.weight_texels_tr[j] + i; 1006#else 1007 vint texel(di.weight_texels_tr[j] + i); 1008#endif 1009 vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i); 1010 1011 if (!constant_wes) 1012 { 1013 weight_error_scale = gatherf(ei.weight_error_scale, texel); 1014 } 1015 1016 vfloat scale = weight_error_scale * contrib_weight; 1017 vfloat old_weight = gatherf(infilled_weights, texel); 1018 vfloat ideal_weight = gatherf(ei.weights, texel); 1019 1020 error_change0 += contrib_weight * scale; 1021 error_change1 += (old_weight - ideal_weight) * scale; 1022 } 1023 1024 vfloat step = (error_change1 * chd_scale) / error_change0; 1025 step = clamp(-stepsize, stepsize, step); 1026 1027 // Update the weight; note this can store negative values 1028 storea(weight_val + step, dec_weight_ideal_value + i); 1029 } 1030} 1031 1032/* See header for documentation. */ 1033void compute_quantized_weights_for_decimation( 1034 const decimation_info& di, 1035 float low_bound, 1036 float high_bound, 1037 const float* dec_weight_ideal_value, 1038 float* weight_set_out, 1039 uint8_t* quantized_weight_set, 1040 quant_method quant_level 1041) { 1042 int weight_count = di.weight_count; 1043 promise(weight_count > 0); 1044 const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level]; 1045 1046 // The available quant levels, stored with a minus 1 bias 1047 static const float quant_levels_m1[12] { 1048 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f 1049 }; 1050 1051 vint steps_m1(get_quant_level(quant_level) - 1); 1052 float quant_level_m1 = quant_levels_m1[quant_level]; 1053 1054 // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds 1055 1056 // TODO: Oddity to investigate; triggered by test in issue #265. 1057 if (high_bound <= low_bound) 1058 { 1059 low_bound = 0.0f; 1060 high_bound = 1.0f; 1061 } 1062 1063 float rscale = high_bound - low_bound; 1064 float scale = 1.0f / rscale; 1065 1066 float scaled_low_bound = low_bound * scale; 1067 rscale *= 1.0f / 64.0f; 1068 1069 vfloat scalev(scale); 1070 vfloat scaled_low_boundv(scaled_low_bound); 1071 vfloat quant_level_m1v(quant_level_m1); 1072 vfloat rscalev(rscale); 1073 vfloat low_boundv(low_bound); 1074 1075 // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known 1076 // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements 1077 if (get_quant_level(quant_level) <= 16) 1078 { 1079 vint4 tab0 = vint4::load(qat.quant_to_unquant); 1080 vint tab0p; 1081 vtable_prepare(tab0, tab0p); 1082 1083 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 1084 { 1085 vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; 1086 ix = clampzo(ix); 1087 1088 // Look up the two closest indexes and return the one that was closest 1089 vfloat ix1 = ix * quant_level_m1v; 1090 1091 vint weightl = float_to_int(ix1); 1092 vint weighth = min(weightl + vint(1), steps_m1); 1093 1094 vint ixli = vtable_8bt_32bi(tab0p, weightl); 1095 vint ixhi = vtable_8bt_32bi(tab0p, weighth); 1096 1097 vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix); 1098 vint weight = select(ixli, ixhi, mask); 1099 vfloat ixl = int_to_float(weight); 1100 1101 // Invert the weight-scaling that was done initially 1102 storea(ixl * rscalev + low_boundv, weight_set_out + i); 1103 vint scn = pack_low_bytes(weight); 1104 store_nbytes(scn, quantized_weight_set + i); 1105 } 1106 } 1107 else 1108 { 1109 vint4 tab0 = vint4::load(qat.quant_to_unquant + 0); 1110 vint4 tab1 = vint4::load(qat.quant_to_unquant + 16); 1111 vint tab0p, tab1p; 1112 vtable_prepare(tab0, tab1, tab0p, tab1p); 1113 1114 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 1115 { 1116 vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; 1117 ix = clampzo(ix); 1118 1119 // Look up the two closest indexes and return the one that was closest 1120 vfloat ix1 = ix * quant_level_m1v; 1121 1122 vint weightl = float_to_int(ix1); 1123 vint weighth = min(weightl + vint(1), steps_m1); 1124 1125 vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); 1126 vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); 1127 1128 vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix); 1129 vint weight = select(ixli, ixhi, mask); 1130 vfloat ixl = int_to_float(weight); 1131 1132 // Invert the weight-scaling that was done initially 1133 storea(ixl * rscalev + low_boundv, weight_set_out + i); 1134 vint scn = pack_low_bytes(weight); 1135 store_nbytes(scn, quantized_weight_set + i); 1136 } 1137 } 1138} 1139 1140/** 1141 * @brief Compute the RGB + offset for a HDR endpoint mode #7. 1142 * 1143 * Since the matrix needed has a regular structure we can simplify the inverse calculation. This 1144 * gives us ~24 multiplications vs. 96 for a generic inverse. 1145 * 1146 * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x); 1147 * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y); 1148 * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z); 1149 * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum); 1150 * mat = invert(mat); 1151 * 1152 * @param rgba_weight_sum Sum of partition component error weights. 1153 * @param weight_weight_sum Sum of partition component error weights * texel weight. 1154 * @param rgbq_sum Sum of partition component error weights * texel weight * color data. 1155 * @param psum Sum of RGB color weights * texel weight^2. 1156 */ 1157static inline vfloat4 compute_rgbo_vector( 1158 vfloat4 rgba_weight_sum, 1159 vfloat4 weight_weight_sum, 1160 vfloat4 rgbq_sum, 1161 float psum 1162) { 1163 float X = rgba_weight_sum.lane<0>(); 1164 float Y = rgba_weight_sum.lane<1>(); 1165 float Z = rgba_weight_sum.lane<2>(); 1166 float P = weight_weight_sum.lane<0>(); 1167 float Q = weight_weight_sum.lane<1>(); 1168 float R = weight_weight_sum.lane<2>(); 1169 float S = psum; 1170 1171 float PP = P * P; 1172 float QQ = Q * Q; 1173 float RR = R * R; 1174 1175 float SZmRR = S * Z - RR; 1176 float DT = SZmRR * Y - Z * QQ; 1177 float YP = Y * P; 1178 float QX = Q * X; 1179 float YX = Y * X; 1180 float mZYP = -Z * YP; 1181 float mZQX = -Z * QX; 1182 float mRYX = -R * YX; 1183 float ZQP = Z * Q * P; 1184 float RYP = R * YP; 1185 float RQX = R * QX; 1186 1187 // Compute the reciprocal of matrix determinant 1188 float rdet = 1.0f / (DT * X + mZYP * P); 1189 1190 // Actually compute the adjugate, and then apply 1/det separately 1191 vfloat4 mat0(DT, ZQP, RYP, mZYP); 1192 vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX); 1193 vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX); 1194 vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX); 1195 vfloat4 vect = rgbq_sum * rdet; 1196 1197 return vfloat4(dot_s(mat0, vect), 1198 dot_s(mat1, vect), 1199 dot_s(mat2, vect), 1200 dot_s(mat3, vect)); 1201} 1202 1203/* See header for documentation. */ 1204void recompute_ideal_colors_1plane( 1205 const image_block& blk, 1206 const partition_info& pi, 1207 const decimation_info& di, 1208 const uint8_t* dec_weights_uquant, 1209 endpoints& ep, 1210 vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS], 1211 vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS] 1212) { 1213 unsigned int weight_count = di.weight_count; 1214 unsigned int total_texel_count = blk.texel_count; 1215 unsigned int partition_count = pi.partition_count; 1216 1217 promise(weight_count > 0); 1218 promise(total_texel_count > 0); 1219 promise(partition_count > 0); 1220 1221 ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS]; 1222 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 1223 { 1224 vint unquant_value(dec_weights_uquant + i); 1225 vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f); 1226 storea(unquant_valuef, dec_weight + i); 1227 } 1228 1229 ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS]; 1230 float* undec_weight_ref; 1231 if (di.max_texel_weight_count == 1) 1232 { 1233 undec_weight_ref = dec_weight; 1234 } 1235 else if (di.max_texel_weight_count <= 2) 1236 { 1237 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) 1238 { 1239 vfloat weight = bilinear_infill_vla_2(di, dec_weight, i); 1240 storea(weight, undec_weight + i); 1241 } 1242 1243 undec_weight_ref = undec_weight; 1244 } 1245 else 1246 { 1247 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) 1248 { 1249 vfloat weight = bilinear_infill_vla(di, dec_weight, i); 1250 storea(weight, undec_weight + i); 1251 } 1252 1253 undec_weight_ref = undec_weight; 1254 } 1255 1256 vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count)); 1257 1258 for (unsigned int i = 0; i < partition_count; i++) 1259 { 1260 unsigned int texel_count = pi.partition_texel_count[i]; 1261 const uint8_t *texel_indexes = pi.texels_of_partition[i]; 1262 1263 // Only compute a partition mean if more than one partition 1264 if (partition_count > 1) 1265 { 1266 rgba_sum = vfloat4::zero(); 1267 promise(texel_count > 0); 1268 for (unsigned int j = 0; j < texel_count; j++) 1269 { 1270 unsigned int tix = texel_indexes[j]; 1271 rgba_sum += blk.texel(tix); 1272 } 1273 } 1274 1275 rgba_sum = rgba_sum * blk.channel_weight; 1276 vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f); 1277 vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>()); 1278 1279 float scale_max = 0.0f; 1280 float scale_min = 1e10f; 1281 1282 float wmin1 = 1.0f; 1283 float wmax1 = 0.0f; 1284 1285 float left_sum_s = 0.0f; 1286 float middle_sum_s = 0.0f; 1287 float right_sum_s = 0.0f; 1288 1289 vfloat4 color_vec_x = vfloat4::zero(); 1290 vfloat4 color_vec_y = vfloat4::zero(); 1291 1292 vfloat4 scale_vec = vfloat4::zero(); 1293 1294 float weight_weight_sum_s = 1e-17f; 1295 1296 vfloat4 color_weight = blk.channel_weight; 1297 float ls_weight = hadd_rgb_s(color_weight); 1298 1299 for (unsigned int j = 0; j < texel_count; j++) 1300 { 1301 unsigned int tix = texel_indexes[j]; 1302 vfloat4 rgba = blk.texel(tix); 1303 1304 float idx0 = undec_weight_ref[tix]; 1305 1306 float om_idx0 = 1.0f - idx0; 1307 wmin1 = astc::min(idx0, wmin1); 1308 wmax1 = astc::max(idx0, wmax1); 1309 1310 float scale = dot3_s(scale_dir, rgba); 1311 scale_min = astc::min(scale, scale_min); 1312 scale_max = astc::max(scale, scale_max); 1313 1314 left_sum_s += om_idx0 * om_idx0; 1315 middle_sum_s += om_idx0 * idx0; 1316 right_sum_s += idx0 * idx0; 1317 weight_weight_sum_s += idx0; 1318 1319 vfloat4 color_idx(idx0); 1320 vfloat4 cwprod = rgba; 1321 vfloat4 cwiprod = cwprod * color_idx; 1322 1323 color_vec_y += cwiprod; 1324 color_vec_x += cwprod - cwiprod; 1325 1326 scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight); 1327 } 1328 1329 vfloat4 left_sum = vfloat4(left_sum_s) * color_weight; 1330 vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight; 1331 vfloat4 right_sum = vfloat4(right_sum_s) * color_weight; 1332 vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight; 1333 1334 color_vec_x = color_vec_x * color_weight; 1335 color_vec_y = color_vec_y * color_weight; 1336 1337 // Initialize the luminance and scale vectors with a reasonable default 1338 float scalediv = scale_min / astc::max(scale_max, 1e-10f); 1339 scalediv = astc::clamp1f(scalediv); 1340 1341 vfloat4 sds = scale_dir * scale_max; 1342 1343 rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); 1344 1345 if (wmin1 >= wmax1 * 0.999f) 1346 { 1347 // If all weights in the partition were equal, then just take average of all colors in 1348 // the partition and use that as both endpoint colors 1349 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; 1350 1351 vmask4 notnan_mask = avg == avg; 1352 ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask); 1353 ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask); 1354 1355 rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f); 1356 } 1357 else 1358 { 1359 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given 1360 // set of texel weights and pixel colors 1361 vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum); 1362 vfloat4 color_rdet1 = 1.0f / color_det1; 1363 1364 float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); 1365 float ls_rdet1 = 1.0f / ls_det1; 1366 1367 vfloat4 color_mss1 = (left_sum * left_sum) 1368 + (2.0f * middle_sum * middle_sum) 1369 + (right_sum * right_sum); 1370 1371 float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) 1372 + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) 1373 + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); 1374 1375 vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1; 1376 vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1; 1377 1378 vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f); 1379 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); 1380 vmask4 full_mask = det_mask & notnan_mask; 1381 1382 ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask); 1383 ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask); 1384 1385 float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; 1386 float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; 1387 1388 if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) 1389 { 1390 float scalediv2 = scale_ep0 / scale_ep1; 1391 vfloat4 sdsm = scale_dir * scale_ep1; 1392 rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); 1393 } 1394 } 1395 1396 // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR 1397 if (blk.rgb_lns[0] || blk.alpha_lns[0]) 1398 { 1399 vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight; 1400 float psum = right_sum_s * hadd_rgb_s(color_weight); 1401 1402 vfloat4 rgbq_sum = color_vec_x + color_vec_y; 1403 rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); 1404 1405 vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); 1406 rgbo_vectors[i] = rgbovec; 1407 1408 // We can get a failure due to the use of a singular (non-invertible) matrix 1409 // If it failed, compute rgbo_vectors[] with a different method ... 1410 if (astc::isnan(dot_s(rgbovec, rgbovec))) 1411 { 1412 vfloat4 v0 = ep.endpt0[i]; 1413 vfloat4 v1 = ep.endpt1[i]; 1414 1415 float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); 1416 avgdif = astc::max(avgdif, 0.0f); 1417 1418 vfloat4 avg = (v0 + v1) * 0.5f; 1419 vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; 1420 rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); 1421 } 1422 } 1423 } 1424} 1425 1426/* See header for documentation. */ 1427void recompute_ideal_colors_2planes( 1428 const image_block& blk, 1429 const block_size_descriptor& bsd, 1430 const decimation_info& di, 1431 const uint8_t* dec_weights_uquant_plane1, 1432 const uint8_t* dec_weights_uquant_plane2, 1433 endpoints& ep, 1434 vfloat4& rgbs_vector, 1435 vfloat4& rgbo_vector, 1436 int plane2_component 1437) { 1438 unsigned int weight_count = di.weight_count; 1439 unsigned int total_texel_count = blk.texel_count; 1440 1441 promise(total_texel_count > 0); 1442 promise(weight_count > 0); 1443 1444 ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; 1445 ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; 1446 1447 assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); 1448 1449 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 1450 { 1451 vint unquant_value1(dec_weights_uquant_plane1 + i); 1452 vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f); 1453 storea(unquant_value1f, dec_weight_plane1 + i); 1454 1455 vint unquant_value2(dec_weights_uquant_plane2 + i); 1456 vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f); 1457 storea(unquant_value2f, dec_weight_plane2 + i); 1458 } 1459 1460 ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS]; 1461 ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS]; 1462 1463 float* undec_weight_plane1_ref; 1464 float* undec_weight_plane2_ref; 1465 1466 if (di.max_texel_weight_count == 1) 1467 { 1468 undec_weight_plane1_ref = dec_weight_plane1; 1469 undec_weight_plane2_ref = dec_weight_plane2; 1470 } 1471 else if (di.max_texel_weight_count <= 2) 1472 { 1473 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) 1474 { 1475 vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i); 1476 storea(weight, undec_weight_plane1 + i); 1477 1478 weight = bilinear_infill_vla_2(di, dec_weight_plane2, i); 1479 storea(weight, undec_weight_plane2 + i); 1480 } 1481 1482 undec_weight_plane1_ref = undec_weight_plane1; 1483 undec_weight_plane2_ref = undec_weight_plane2; 1484 } 1485 else 1486 { 1487 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) 1488 { 1489 vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i); 1490 storea(weight, undec_weight_plane1 + i); 1491 1492 weight = bilinear_infill_vla(di, dec_weight_plane2, i); 1493 storea(weight, undec_weight_plane2 + i); 1494 } 1495 1496 undec_weight_plane1_ref = undec_weight_plane1; 1497 undec_weight_plane2_ref = undec_weight_plane2; 1498 } 1499 1500 unsigned int texel_count = bsd.texel_count; 1501 vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f); 1502 vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>()); 1503 1504 float scale_max = 0.0f; 1505 float scale_min = 1e10f; 1506 1507 float wmin1 = 1.0f; 1508 float wmax1 = 0.0f; 1509 1510 float wmin2 = 1.0f; 1511 float wmax2 = 0.0f; 1512 1513 float left1_sum_s = 0.0f; 1514 float middle1_sum_s = 0.0f; 1515 float right1_sum_s = 0.0f; 1516 1517 float left2_sum_s = 0.0f; 1518 float middle2_sum_s = 0.0f; 1519 float right2_sum_s = 0.0f; 1520 1521 vfloat4 color_vec_x = vfloat4::zero(); 1522 vfloat4 color_vec_y = vfloat4::zero(); 1523 1524 vfloat4 scale_vec = vfloat4::zero(); 1525 1526 vfloat4 weight_weight_sum = vfloat4(1e-17f); 1527 1528 vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component); 1529 vfloat4 color_weight = blk.channel_weight; 1530 float ls_weight = hadd_rgb_s(color_weight); 1531 1532 for (unsigned int j = 0; j < texel_count; j++) 1533 { 1534 vfloat4 rgba = blk.texel(j); 1535 1536 float idx0 = undec_weight_plane1_ref[j]; 1537 1538 float om_idx0 = 1.0f - idx0; 1539 wmin1 = astc::min(idx0, wmin1); 1540 wmax1 = astc::max(idx0, wmax1); 1541 1542 float scale = dot3_s(scale_dir, rgba); 1543 scale_min = astc::min(scale, scale_min); 1544 scale_max = astc::max(scale, scale_max); 1545 1546 left1_sum_s += om_idx0 * om_idx0; 1547 middle1_sum_s += om_idx0 * idx0; 1548 right1_sum_s += idx0 * idx0; 1549 1550 float idx1 = undec_weight_plane2_ref[j]; 1551 1552 float om_idx1 = 1.0f - idx1; 1553 wmin2 = astc::min(idx1, wmin2); 1554 wmax2 = astc::max(idx1, wmax2); 1555 1556 left2_sum_s += om_idx1 * om_idx1; 1557 middle2_sum_s += om_idx1 * idx1; 1558 right2_sum_s += idx1 * idx1; 1559 1560 vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask); 1561 1562 vfloat4 cwprod = rgba; 1563 vfloat4 cwiprod = cwprod * color_idx; 1564 1565 color_vec_y += cwiprod; 1566 color_vec_x += cwprod - cwiprod; 1567 1568 scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale); 1569 weight_weight_sum += color_idx; 1570 } 1571 1572 vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight; 1573 vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight; 1574 vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight; 1575 vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight; 1576 1577 vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight; 1578 vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight; 1579 vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight; 1580 1581 color_vec_x = color_vec_x * color_weight; 1582 color_vec_y = color_vec_y * color_weight; 1583 1584 // Initialize the luminance and scale vectors with a reasonable default 1585 float scalediv = scale_min / astc::max(scale_max, 1e-10f); 1586 scalediv = astc::clamp1f(scalediv); 1587 1588 vfloat4 sds = scale_dir * scale_max; 1589 1590 rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); 1591 1592 if (wmin1 >= wmax1 * 0.999f) 1593 { 1594 // If all weights in the partition were equal, then just take average of all colors in 1595 // the partition and use that as both endpoint colors 1596 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; 1597 1598 vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); 1599 vmask4 notnan_mask = avg == avg; 1600 vmask4 full_mask = p1_mask & notnan_mask; 1601 1602 ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask); 1603 ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask); 1604 1605 rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f); 1606 } 1607 else 1608 { 1609 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given 1610 // set of texel weights and pixel colors 1611 vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum); 1612 vfloat4 color_rdet1 = 1.0f / color_det1; 1613 1614 float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); 1615 float ls_rdet1 = 1.0f / ls_det1; 1616 1617 vfloat4 color_mss1 = (left1_sum * left1_sum) 1618 + (2.0f * middle1_sum * middle1_sum) 1619 + (right1_sum * right1_sum); 1620 1621 float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) 1622 + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) 1623 + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); 1624 1625 vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1; 1626 vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1; 1627 1628 float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; 1629 float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; 1630 1631 vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); 1632 vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f); 1633 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); 1634 vmask4 full_mask = p1_mask & det_mask & notnan_mask; 1635 1636 ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask); 1637 ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); 1638 1639 if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) 1640 { 1641 float scalediv2 = scale_ep0 / scale_ep1; 1642 vfloat4 sdsm = scale_dir * scale_ep1; 1643 rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); 1644 } 1645 } 1646 1647 if (wmin2 >= wmax2 * 0.999f) 1648 { 1649 // If all weights in the partition were equal, then just take average of all colors in 1650 // the partition and use that as both endpoint colors 1651 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; 1652 1653 vmask4 notnan_mask = avg == avg; 1654 vmask4 full_mask = p2_mask & notnan_mask; 1655 1656 ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask); 1657 ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask); 1658 } 1659 else 1660 { 1661 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given 1662 // set of texel weights and pixel colors 1663 vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum); 1664 vfloat4 color_rdet2 = 1.0f / color_det2; 1665 1666 vfloat4 color_mss2 = (left2_sum * left2_sum) 1667 + (2.0f * middle2_sum * middle2_sum) 1668 + (right2_sum * right2_sum); 1669 1670 vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2; 1671 vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2; 1672 1673 vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f); 1674 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); 1675 vmask4 full_mask = p2_mask & det_mask & notnan_mask; 1676 1677 ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask); 1678 ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); 1679 } 1680 1681 // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR 1682 if (blk.rgb_lns[0] || blk.alpha_lns[0]) 1683 { 1684 weight_weight_sum = weight_weight_sum * color_weight; 1685 float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight); 1686 1687 vfloat4 rgbq_sum = color_vec_x + color_vec_y; 1688 rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); 1689 1690 rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); 1691 1692 // We can get a failure due to the use of a singular (non-invertible) matrix 1693 // If it failed, compute rgbo_vectors[] with a different method ... 1694 if (astc::isnan(dot_s(rgbo_vector, rgbo_vector))) 1695 { 1696 vfloat4 v0 = ep.endpt0[0]; 1697 vfloat4 v1 = ep.endpt1[0]; 1698 1699 float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); 1700 avgdif = astc::max(avgdif, 0.0f); 1701 1702 vfloat4 avg = (v0 + v1) * 0.5f; 1703 vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; 1704 1705 rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); 1706 } 1707 } 1708} 1709 1710#endif 1711