1// SPDX-License-Identifier: Apache-2.0 2// ---------------------------------------------------------------------------- 3// Copyright 2011-2023 Arm Limited 4// 5// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6// use this file except in compliance with the License. You may obtain a copy 7// of the License at: 8// 9// http://www.apache.org/licenses/LICENSE-2.0 10// 11// Unless required by applicable law or agreed to in writing, software 12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14// License for the specific language governing permissions and limitations 15// under the License. 16// ---------------------------------------------------------------------------- 17 18#if !defined(ASTCENC_DECOMPRESS_ONLY) 19 20/** 21 * @brief Functions for finding best partition for a block. 22 * 23 * The partition search operates in two stages. The first pass uses kmeans clustering to group 24 * texels into an ideal partitioning for the requested partition count, and then compares that 25 * against the 1024 partitionings generated by the ASTC partition hash function. The generated 26 * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal 27 * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and 28 * partitionings that actually generate fewer than the requested partition count, but only the top 29 * N candidates are actually put through a more detailed search. N is determined by the compressor 30 * quality preset. 31 * 32 * For the detailed search, each candidate is checked against two possible encoding methods: 33 * 34 * - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints). 35 * - The best partitioning assuming same chroma colors (RGB + scale endpoints). 36 * 37 * This is implemented by computing the compute mean color and dominant direction for each 38 * partition. This defines two lines, both of which go through the mean color value. 39 * 40 * - One line has a direction defined by the dominant direction; this is used to assess the error 41 * from using an uncorrelated color representation. 42 * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma 43 * (RGB + scale) color representation. 44 * 45 * The best candidate is selected by computing the squared-errors that result from using these 46 * lines for endpoint selection. 47 */ 48 49#include <limits> 50#include "astcenc_internal.h" 51 52/** 53 * @brief Pick some initial kmeans cluster centers. 54 * 55 * @param blk The image block color data to compress. 56 * @param texel_count The number of texels in the block. 57 * @param partition_count The number of partitions in the block. 58 * @param[out] cluster_centers The initial partition cluster center colors. 59 */ 60static void kmeans_init( 61 const image_block& blk, 62 unsigned int texel_count, 63 unsigned int partition_count, 64 vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS] 65) { 66 promise(texel_count > 0); 67 promise(partition_count > 0); 68 69 unsigned int clusters_selected = 0; 70 float distances[BLOCK_MAX_TEXELS]; 71 72 // Pick a random sample as first cluster center; 145897 from random.org 73 unsigned int sample = 145897 % texel_count; 74 vfloat4 center_color = blk.texel(sample); 75 cluster_centers[clusters_selected] = center_color; 76 clusters_selected++; 77 78 // Compute the distance to the first cluster center 79 float distance_sum = 0.0f; 80 for (unsigned int i = 0; i < texel_count; i++) 81 { 82 vfloat4 color = blk.texel(i); 83 vfloat4 diff = color - center_color; 84 float distance = dot_s(diff * diff, blk.channel_weight); 85 distance_sum += distance; 86 distances[i] = distance; 87 } 88 89 // More numbers from random.org for weighted-random center selection 90 const float cluster_cutoffs[9] { 91 0.626220f, 0.932770f, 0.275454f, 92 0.318558f, 0.240113f, 0.009190f, 93 0.347661f, 0.731960f, 0.156391f 94 }; 95 96 unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2); 97 98 // Pick the remaining samples as needed 99 while (true) 100 { 101 // Pick the next center in a weighted-random fashion. 102 float summa = 0.0f; 103 float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++]; 104 for (sample = 0; sample < texel_count; sample++) 105 { 106 summa += distances[sample]; 107 if (summa >= distance_cutoff) 108 { 109 break; 110 } 111 } 112 113 // Clamp to a valid range and store the selected cluster center 114 sample = astc::min(sample, texel_count - 1); 115 116 center_color = blk.texel(sample); 117 cluster_centers[clusters_selected++] = center_color; 118 if (clusters_selected >= partition_count) 119 { 120 break; 121 } 122 123 // Compute the distance to the new cluster center, keep the min dist 124 distance_sum = 0.0f; 125 for (unsigned int i = 0; i < texel_count; i++) 126 { 127 vfloat4 color = blk.texel(i); 128 vfloat4 diff = color - center_color; 129 float distance = dot_s(diff * diff, blk.channel_weight); 130 distance = astc::min(distance, distances[i]); 131 distance_sum += distance; 132 distances[i] = distance; 133 } 134 } 135} 136 137/** 138 * @brief Assign texels to clusters, based on a set of chosen center points. 139 * 140 * @param blk The image block color data to compress. 141 * @param texel_count The number of texels in the block. 142 * @param partition_count The number of partitions in the block. 143 * @param cluster_centers The partition cluster center colors. 144 * @param[out] partition_of_texel The partition assigned for each texel. 145 */ 146static void kmeans_assign( 147 const image_block& blk, 148 unsigned int texel_count, 149 unsigned int partition_count, 150 const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], 151 uint8_t partition_of_texel[BLOCK_MAX_TEXELS] 152) { 153 promise(texel_count > 0); 154 promise(partition_count > 0); 155 156 uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; 157 158 // Find the best partition for every texel 159 for (unsigned int i = 0; i < texel_count; i++) 160 { 161 float best_distance = std::numeric_limits<float>::max(); 162 unsigned int best_partition = 0; 163 164 vfloat4 color = blk.texel(i); 165 for (unsigned int j = 0; j < partition_count; j++) 166 { 167 vfloat4 diff = color - cluster_centers[j]; 168 float distance = dot_s(diff * diff, blk.channel_weight); 169 if (distance < best_distance) 170 { 171 best_distance = distance; 172 best_partition = j; 173 } 174 } 175 176 partition_of_texel[i] = static_cast<uint8_t>(best_partition); 177 partition_texel_count[best_partition]++; 178 } 179 180 // It is possible to get a situation where a partition ends up without any texels. In this case, 181 // assign texel N to partition N. This is silly, but ensures that every partition retains at 182 // least one texel. Reassigning a texel in this manner may cause another partition to go empty, 183 // so if we actually did a reassignment, run the whole loop over again. 184 bool problem_case; 185 do 186 { 187 problem_case = false; 188 for (unsigned int i = 0; i < partition_count; i++) 189 { 190 if (partition_texel_count[i] == 0) 191 { 192 partition_texel_count[partition_of_texel[i]]--; 193 partition_texel_count[i]++; 194 partition_of_texel[i] = static_cast<uint8_t>(i); 195 problem_case = true; 196 } 197 } 198 } while (problem_case); 199} 200 201/** 202 * @brief Compute new cluster centers based on their center of gravity. 203 * 204 * @param blk The image block color data to compress. 205 * @param texel_count The number of texels in the block. 206 * @param partition_count The number of partitions in the block. 207 * @param[out] cluster_centers The new cluster center colors. 208 * @param partition_of_texel The partition assigned for each texel. 209 */ 210static void kmeans_update( 211 const image_block& blk, 212 unsigned int texel_count, 213 unsigned int partition_count, 214 vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], 215 const uint8_t partition_of_texel[BLOCK_MAX_TEXELS] 216) { 217 promise(texel_count > 0); 218 promise(partition_count > 0); 219 220 vfloat4 color_sum[BLOCK_MAX_PARTITIONS] { 221 vfloat4::zero(), 222 vfloat4::zero(), 223 vfloat4::zero(), 224 vfloat4::zero() 225 }; 226 227 uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; 228 229 // Find the center-of-gravity in each cluster 230 for (unsigned int i = 0; i < texel_count; i++) 231 { 232 uint8_t partition = partition_of_texel[i]; 233 color_sum[partition] += blk.texel(i); 234 partition_texel_count[partition]++; 235 } 236 237 // Set the center of gravity to be the new cluster center 238 for (unsigned int i = 0; i < partition_count; i++) 239 { 240 float scale = 1.0f / static_cast<float>(partition_texel_count[i]); 241 cluster_centers[i] = color_sum[i] * scale; 242 } 243} 244 245/** 246 * @brief Compute bit-mismatch for partitioning in 2-partition mode. 247 * 248 * @param a The texel assignment bitvector for the block. 249 * @param b The texel assignment bitvector for the partition table. 250 * 251 * @return The number of bit mismatches. 252 */ 253#if ASTCENC_NEON != 0 254static inline uint8_t partition_mismatch2( 255 const uint64_t a[2], 256 const uint64_t b[2] 257) { 258 uint64x2_t a01 = vld1q_u64(a); 259 uint64x2_t b01 = vld1q_u64(b); 260 uint64x2_t b10 = vextq_u64(b01, b01, 1); 261 uint8_t c1 = popcount(veorq_u64(a01, b01)); 262 uint8_t c2 = popcount(veorq_u64(a01, b10)); 263 return static_cast<uint8_t>(astc::min(c1, c2) / 2); // 2 is the number of partitions 264} 265#else 266static inline uint8_t partition_mismatch2( 267 const uint64_t a[2], 268 const uint64_t b[2] 269) { 270 int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]); 271 int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]); 272 273 // Divide by 2 because XOR always counts errors twice, once when missing 274 // in the expected position, and again when present in the wrong partition 275 return static_cast<uint8_t>(astc::min(v1, v2) / 2); 276} 277#endif 278 279/** 280 * @brief Compute bit-mismatch for partitioning in 3-partition mode. 281 * 282 * @param a The texel assignment bitvector for the block. 283 * @param b The texel assignment bitvector for the partition table. 284 * 285 * @return The number of bit mismatches. 286 */ 287static inline uint8_t partition_mismatch3( 288 const uint64_t a[3], 289 const uint64_t b[3] 290) { 291 int p00 = popcount(a[0] ^ b[0]); 292 int p01 = popcount(a[0] ^ b[1]); 293 int p02 = popcount(a[0] ^ b[2]); 294 295 int p10 = popcount(a[1] ^ b[0]); 296 int p11 = popcount(a[1] ^ b[1]); 297 int p12 = popcount(a[1] ^ b[2]); 298 299 int p20 = popcount(a[2] ^ b[0]); 300 int p21 = popcount(a[2] ^ b[1]); 301 int p22 = popcount(a[2] ^ b[2]); 302 303 int s0 = p11 + p22; 304 int s1 = p12 + p21; 305 int v0 = astc::min(s0, s1) + p00; 306 307 int s2 = p10 + p22; 308 int s3 = p12 + p20; 309 int v1 = astc::min(s2, s3) + p01; 310 311 int s4 = p10 + p21; 312 int s5 = p11 + p20; 313 int v2 = astc::min(s4, s5) + p02; 314 315 // Divide by 2 because XOR always counts errors twice, once when missing 316 // in the expected position, and again when present in the wrong partition 317 return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2); 318} 319 320/** 321 * @brief Compute bit-mismatch for partitioning in 4-partition mode. 322 * 323 * @param a The texel assignment bitvector for the block. 324 * @param b The texel assignment bitvector for the partition table. 325 * 326 * @return The number of bit mismatches. 327 */ 328static inline uint8_t partition_mismatch4( 329 const uint64_t a[4], 330 const uint64_t b[4] 331) { 332 int p00 = popcount(a[0] ^ b[0]); 333 int p01 = popcount(a[0] ^ b[1]); 334 int p02 = popcount(a[0] ^ b[2]); 335 int p03 = popcount(a[0] ^ b[3]); 336 337 int p10 = popcount(a[1] ^ b[0]); 338 int p11 = popcount(a[1] ^ b[1]); 339 int p12 = popcount(a[1] ^ b[2]); 340 int p13 = popcount(a[1] ^ b[3]); 341 342 int p20 = popcount(a[2] ^ b[0]); 343 int p21 = popcount(a[2] ^ b[1]); 344 int p22 = popcount(a[2] ^ b[2]); 345 int p23 = popcount(a[2] ^ b[3]); 346 347 int p30 = popcount(a[3] ^ b[0]); 348 int p31 = popcount(a[3] ^ b[1]); 349 int p32 = popcount(a[3] ^ b[2]); 350 int p33 = popcount(a[3] ^ b[3]); 351 352 int mx23 = astc::min(p22 + p33, p23 + p32); 353 int mx13 = astc::min(p21 + p33, p23 + p31); 354 int mx12 = astc::min(p21 + p32, p22 + p31); 355 int mx03 = astc::min(p20 + p33, p23 + p30); 356 int mx02 = astc::min(p20 + p32, p22 + p30); 357 int mx01 = astc::min(p21 + p30, p20 + p31); 358 359 int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12); 360 int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02); 361 int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01); 362 int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12); 363 364 // Divide by 2 because XOR always counts errors twice, once when missing 365 // in the expected position, and again when present in the wrong partition 366 return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2); 367} 368 369using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*); 370 371/** 372 * @brief Count the partition table mismatches vs the data clustering. 373 * 374 * @param bsd The block size information. 375 * @param partition_count The number of partitions in the block. 376 * @param bitmaps The block texel partition assignment patterns. 377 * @param[out] mismatch_counts The array storing per partitioning mismatch counts. 378 */ 379static void count_partition_mismatch_bits( 380 const block_size_descriptor& bsd, 381 unsigned int partition_count, 382 const uint64_t bitmaps[BLOCK_MAX_PARTITIONS], 383 uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS] 384) { 385 unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1]; 386 promise(active_count > 0); 387 388 if (partition_count == 2) 389 { 390 for (unsigned int i = 0; i < active_count; i++) 391 { 392 mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]); 393 assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); 394 assert(mismatch_counts[i] < bsd.texel_count); 395 } 396 } 397 else if (partition_count == 3) 398 { 399 for (unsigned int i = 0; i < active_count; i++) 400 { 401 mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]); 402 assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); 403 assert(mismatch_counts[i] < bsd.texel_count); 404 } 405 } 406 else 407 { 408 for (unsigned int i = 0; i < active_count; i++) 409 { 410 mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]); 411 assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); 412 assert(mismatch_counts[i] < bsd.texel_count); 413 } 414 } 415} 416 417/** 418 * @brief Use counting sort on the mismatch array to sort partition candidates. 419 * 420 * @param partitioning_count The number of packed partitionings. 421 * @param mismatch_count Partitioning mismatch counts, in index order. 422 * @param[out] partition_ordering Partition index values, in mismatch order. 423 * 424 * @return The number of active partitions in this selection. 425 */ 426static unsigned int get_partition_ordering_by_mismatch_bits( 427 unsigned int texel_count, 428 unsigned int partitioning_count, 429 const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS], 430 uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS] 431) { 432 promise(partitioning_count > 0); 433 uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 }; 434 435 // Create the histogram of mismatch counts 436 for (unsigned int i = 0; i < partitioning_count; i++) 437 { 438 mscount[mismatch_count[i]]++; 439 } 440 441 // Create a running sum from the histogram array 442 // Cells store previous values only; i.e. exclude self after sum 443 unsigned int sum = 0; 444 for (unsigned int i = 0; i < texel_count; i++) 445 { 446 uint16_t cnt = mscount[i]; 447 mscount[i] = sum; 448 sum += cnt; 449 } 450 451 // Use the running sum as the index, incrementing after read to allow 452 // sequential entries with the same count 453 for (unsigned int i = 0; i < partitioning_count; i++) 454 { 455 unsigned int idx = mscount[mismatch_count[i]]++; 456 partition_ordering[idx] = static_cast<uint16_t>(i); 457 } 458 459 return partitioning_count; 460} 461 462/** 463 * @brief Use k-means clustering to compute a partition ordering for a block.. 464 * 465 * @param bsd The block size information. 466 * @param blk The image block color data to compress. 467 * @param partition_count The desired number of partitions in the block. 468 * @param[out] partition_ordering The list of recommended partition indices, in priority order. 469 * 470 * @return The number of active partitionings in this selection. 471 */ 472static unsigned int compute_kmeans_partition_ordering( 473 const block_size_descriptor& bsd, 474 const image_block& blk, 475 unsigned int partition_count, 476 uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS] 477) { 478 vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]; 479 uint8_t texel_partitions[BLOCK_MAX_TEXELS]; 480 481 // Use three passes of k-means clustering to partition the block data 482 for (unsigned int i = 0; i < 3; i++) 483 { 484 if (i == 0) 485 { 486 kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers); 487 } 488 else 489 { 490 kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); 491 } 492 493 kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); 494 } 495 496 // Construct the block bitmaps of texel assignments to each partition 497 uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 }; 498 unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS); 499 promise(texels_to_process > 0); 500 for (unsigned int i = 0; i < texels_to_process; i++) 501 { 502 unsigned int idx = bsd.kmeans_texels[i]; 503 bitmaps[texel_partitions[idx]] |= 1ULL << i; 504 } 505 506 // Count the mismatch between the block and the format's partition tables 507 uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]; 508 count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts); 509 510 // Sort the partitions based on the number of mismatched bits 511 return get_partition_ordering_by_mismatch_bits( 512 texels_to_process, 513 bsd.partitioning_count_selected[partition_count - 1], 514 mismatch_counts, partition_ordering); 515} 516 517/** 518 * @brief Insert a partitioning into an order list of results, sorted by error. 519 * 520 * @param max_values The max number of entries in the best result arrays. 521 * @param this_error The error of the new entry. 522 * @param this_partition The partition ID of the new entry. 523 * @param[out] best_errors The array of best error values. 524 * @param[out] best_partitions The array of best partition values. 525 */ 526static void insert_result( 527 unsigned int max_values, 528 float this_error, 529 unsigned int this_partition, 530 float* best_errors, 531 unsigned int* best_partitions) 532{ 533 promise(max_values > 0); 534 535 // Don't bother searching if the current worst error beats the new error 536 if (this_error >= best_errors[max_values - 1]) 537 { 538 return; 539 } 540 541 // Else insert into the list in error-order 542 for (unsigned int i = 0; i < max_values; i++) 543 { 544 // Existing result is better - move on ... 545 if (this_error > best_errors[i]) 546 { 547 continue; 548 } 549 550 // Move existing results down one 551 for (unsigned int j = max_values - 1; j > i; j--) 552 { 553 best_errors[j] = best_errors[j - 1]; 554 best_partitions[j] = best_partitions[j - 1]; 555 } 556 557 // Insert new result 558 best_errors[i] = this_error; 559 best_partitions[i] = this_partition; 560 break; 561 } 562} 563 564/* See header for documentation. */ 565unsigned int find_best_partition_candidates( 566 const block_size_descriptor& bsd, 567 const image_block& blk, 568 unsigned int partition_count, 569 unsigned int partition_search_limit, 570 unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES], 571 unsigned int requested_candidates 572) { 573 // Constant used to estimate quantization error for a given partitioning; the optimal value for 574 // this depends on bitrate. These values have been determined empirically. 575 unsigned int texels_per_block = bsd.texel_count; 576 float weight_imprecision_estim = 0.055f; 577 if (texels_per_block <= 20) 578 { 579 weight_imprecision_estim = 0.03f; 580 } 581 else if (texels_per_block <= 31) 582 { 583 weight_imprecision_estim = 0.04f; 584 } 585 else if (texels_per_block <= 41) 586 { 587 weight_imprecision_estim = 0.05f; 588 } 589 590 promise(partition_count > 0); 591 promise(partition_search_limit > 0); 592 593 weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; 594 595 uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS]; 596 unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); 597 partition_search_limit = astc::min(partition_search_limit, sequence_len); 598 requested_candidates = astc::min(partition_search_limit, requested_candidates); 599 600 bool uses_alpha = !blk.is_constant_channel(3); 601 602 // Partitioning errors assuming uncorrelated-chrominance endpoints 603 float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; 604 unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES]; 605 606 // Partitioning errors assuming same-chrominance endpoints 607 float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; 608 unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES]; 609 610 for (unsigned int i = 0; i < requested_candidates; i++) 611 { 612 uncor_best_errors[i] = ERROR_CALC_DEFAULT; 613 samec_best_errors[i] = ERROR_CALC_DEFAULT; 614 } 615 616 if (uses_alpha) 617 { 618 for (unsigned int i = 0; i < partition_search_limit; i++) 619 { 620 unsigned int partition = partition_sequence[i]; 621 const auto& pi = bsd.get_raw_partition_info(partition_count, partition); 622 623 // Compute weighting to give to each component in each partition 624 partition_metrics pms[BLOCK_MAX_PARTITIONS]; 625 626 compute_avgs_and_dirs_4_comp(pi, blk, pms); 627 628 line4 uncor_lines[BLOCK_MAX_PARTITIONS]; 629 line4 samec_lines[BLOCK_MAX_PARTITIONS]; 630 631 processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS]; 632 processed_line4 samec_plines[BLOCK_MAX_PARTITIONS]; 633 634 float line_lengths[BLOCK_MAX_PARTITIONS]; 635 636 for (unsigned int j = 0; j < partition_count; j++) 637 { 638 partition_metrics& pm = pms[j]; 639 640 uncor_lines[j].a = pm.avg; 641 uncor_lines[j].b = normalize_safe(pm.dir, unit4()); 642 643 uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b); 644 uncor_plines[j].bs = uncor_lines[j].b; 645 646 samec_lines[j].a = vfloat4::zero(); 647 samec_lines[j].b = normalize_safe(pm.avg, unit4()); 648 649 samec_plines[j].amod = vfloat4::zero(); 650 samec_plines[j].bs = samec_lines[j].b; 651 } 652 653 float uncor_error = 0.0f; 654 float samec_error = 0.0f; 655 656 compute_error_squared_rgba(pi, 657 blk, 658 uncor_plines, 659 samec_plines, 660 line_lengths, 661 uncor_error, 662 samec_error); 663 664 // Compute an estimate of error introduced by weight quantization imprecision. 665 // This error is computed as follows, for each partition 666 // 1: compute the principal-axis vector (full length) in error-space 667 // 2: convert the principal-axis vector to regular RGB-space 668 // 3: scale the vector by a constant that estimates average quantization error 669 // 4: for each texel, square the vector, then do a dot-product with the texel's 670 // error weight; sum up the results across all texels. 671 // 4(optimized): square the vector once, then do a dot-product with the average 672 // texel error, then multiply by the number of texels. 673 674 for (unsigned int j = 0; j < partition_count; j++) 675 { 676 float tpp = static_cast<float>(pi.partition_texel_count[j]); 677 vfloat4 error_weights(tpp * weight_imprecision_estim); 678 679 vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j]; 680 vfloat4 samec_vector = samec_lines[j].b * line_lengths[j]; 681 682 uncor_error += dot_s(uncor_vector * uncor_vector, error_weights); 683 samec_error += dot_s(samec_vector * samec_vector, error_weights); 684 } 685 686 insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); 687 insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); 688 } 689 } 690 else 691 { 692 for (unsigned int i = 0; i < partition_search_limit; i++) 693 { 694 unsigned int partition = partition_sequence[i]; 695 const auto& pi = bsd.get_raw_partition_info(partition_count, partition); 696 697 // Compute weighting to give to each component in each partition 698 partition_metrics pms[BLOCK_MAX_PARTITIONS]; 699 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); 700 701 partition_lines3 plines[BLOCK_MAX_PARTITIONS]; 702 703 for (unsigned int j = 0; j < partition_count; j++) 704 { 705 partition_metrics& pm = pms[j]; 706 partition_lines3& pl = plines[j]; 707 708 pl.uncor_line.a = pm.avg; 709 pl.uncor_line.b = normalize_safe(pm.dir, unit3()); 710 711 pl.samec_line.a = vfloat4::zero(); 712 pl.samec_line.b = normalize_safe(pm.avg, unit3()); 713 714 pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b); 715 pl.uncor_pline.bs = pl.uncor_line.b; 716 717 pl.samec_pline.amod = vfloat4::zero(); 718 pl.samec_pline.bs = pl.samec_line.b; 719 } 720 721 float uncor_error = 0.0f; 722 float samec_error = 0.0f; 723 724 compute_error_squared_rgb(pi, 725 blk, 726 plines, 727 uncor_error, 728 samec_error); 729 730 // Compute an estimate of error introduced by weight quantization imprecision. 731 // This error is computed as follows, for each partition 732 // 1: compute the principal-axis vector (full length) in error-space 733 // 2: convert the principal-axis vector to regular RGB-space 734 // 3: scale the vector by a constant that estimates average quantization error 735 // 4: for each texel, square the vector, then do a dot-product with the texel's 736 // error weight; sum up the results across all texels. 737 // 4(optimized): square the vector once, then do a dot-product with the average 738 // texel error, then multiply by the number of texels. 739 740 for (unsigned int j = 0; j < partition_count; j++) 741 { 742 partition_lines3& pl = plines[j]; 743 744 float tpp = static_cast<float>(pi.partition_texel_count[j]); 745 vfloat4 error_weights(tpp * weight_imprecision_estim); 746 747 vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length; 748 vfloat4 samec_vector = pl.samec_line.b * pl.line_length; 749 750 uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights); 751 samec_error += dot3_s(samec_vector * samec_vector, error_weights); 752 } 753 754 insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); 755 insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); 756 } 757 } 758 759 unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES]; 760 for (unsigned int i = 0; i < requested_candidates; i++) 761 { 762 interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index; 763 interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index; 764 } 765 766 uint64_t bitmasks[1024/64] { 0 }; 767 unsigned int emitted = 0; 768 769 // Deduplicate the first "requested" entries 770 for (unsigned int i = 0; i < requested_candidates * 2; i++) 771 { 772 unsigned int partition = interleave[i]; 773 774 unsigned int word = partition / 64; 775 unsigned int bit = partition % 64; 776 777 bool written = bitmasks[word] & (1ull << bit); 778 779 if (!written) 780 { 781 best_partitions[emitted] = partition; 782 bitmasks[word] |= 1ull << bit; 783 emitted++; 784 785 if (emitted == requested_candidates) 786 { 787 break; 788 } 789 } 790 } 791 792 return emitted; 793} 794 795#endif 796