1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0 2cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 3cc1dc7a3Sopenharmony_ci// Copyright 2011-2023 Arm Limited 4cc1dc7a3Sopenharmony_ci// 5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy 7cc1dc7a3Sopenharmony_ci// of the License at: 8cc1dc7a3Sopenharmony_ci// 9cc1dc7a3Sopenharmony_ci// http://www.apache.org/licenses/LICENSE-2.0 10cc1dc7a3Sopenharmony_ci// 11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software 12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations 15cc1dc7a3Sopenharmony_ci// under the License. 16cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 17cc1dc7a3Sopenharmony_ci 18cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY) 19cc1dc7a3Sopenharmony_ci 20cc1dc7a3Sopenharmony_ci/** 21cc1dc7a3Sopenharmony_ci * @brief Functions for finding best partition for a block. 22cc1dc7a3Sopenharmony_ci * 23cc1dc7a3Sopenharmony_ci * The partition search operates in two stages. The first pass uses kmeans clustering to group 24cc1dc7a3Sopenharmony_ci * texels into an ideal partitioning for the requested partition count, and then compares that 25cc1dc7a3Sopenharmony_ci * against the 1024 partitionings generated by the ASTC partition hash function. The generated 26cc1dc7a3Sopenharmony_ci * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal 27cc1dc7a3Sopenharmony_ci * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and 28cc1dc7a3Sopenharmony_ci * partitionings that actually generate fewer than the requested partition count, but only the top 29cc1dc7a3Sopenharmony_ci * N candidates are actually put through a more detailed search. N is determined by the compressor 30cc1dc7a3Sopenharmony_ci * quality preset. 31cc1dc7a3Sopenharmony_ci * 32cc1dc7a3Sopenharmony_ci * For the detailed search, each candidate is checked against two possible encoding methods: 33cc1dc7a3Sopenharmony_ci * 34cc1dc7a3Sopenharmony_ci * - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints). 35cc1dc7a3Sopenharmony_ci * - The best partitioning assuming same chroma colors (RGB + scale endpoints). 36cc1dc7a3Sopenharmony_ci * 37cc1dc7a3Sopenharmony_ci * This is implemented by computing the compute mean color and dominant direction for each 38cc1dc7a3Sopenharmony_ci * partition. This defines two lines, both of which go through the mean color value. 39cc1dc7a3Sopenharmony_ci * 40cc1dc7a3Sopenharmony_ci * - One line has a direction defined by the dominant direction; this is used to assess the error 41cc1dc7a3Sopenharmony_ci * from using an uncorrelated color representation. 42cc1dc7a3Sopenharmony_ci * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma 43cc1dc7a3Sopenharmony_ci * (RGB + scale) color representation. 44cc1dc7a3Sopenharmony_ci * 45cc1dc7a3Sopenharmony_ci * The best candidate is selected by computing the squared-errors that result from using these 46cc1dc7a3Sopenharmony_ci * lines for endpoint selection. 47cc1dc7a3Sopenharmony_ci */ 48cc1dc7a3Sopenharmony_ci 49cc1dc7a3Sopenharmony_ci#include <limits> 50cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h" 51cc1dc7a3Sopenharmony_ci 52cc1dc7a3Sopenharmony_ci/** 53cc1dc7a3Sopenharmony_ci * @brief Pick some initial kmeans cluster centers. 54cc1dc7a3Sopenharmony_ci * 55cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 56cc1dc7a3Sopenharmony_ci * @param texel_count The number of texels in the block. 57cc1dc7a3Sopenharmony_ci * @param partition_count The number of partitions in the block. 58cc1dc7a3Sopenharmony_ci * @param[out] cluster_centers The initial partition cluster center colors. 59cc1dc7a3Sopenharmony_ci */ 60cc1dc7a3Sopenharmony_cistatic void kmeans_init( 61cc1dc7a3Sopenharmony_ci const image_block& blk, 62cc1dc7a3Sopenharmony_ci unsigned int texel_count, 63cc1dc7a3Sopenharmony_ci unsigned int partition_count, 64cc1dc7a3Sopenharmony_ci vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS] 65cc1dc7a3Sopenharmony_ci) { 66cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 67cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 68cc1dc7a3Sopenharmony_ci 69cc1dc7a3Sopenharmony_ci unsigned int clusters_selected = 0; 70cc1dc7a3Sopenharmony_ci float distances[BLOCK_MAX_TEXELS]; 71cc1dc7a3Sopenharmony_ci 72cc1dc7a3Sopenharmony_ci // Pick a random sample as first cluster center; 145897 from random.org 73cc1dc7a3Sopenharmony_ci unsigned int sample = 145897 % texel_count; 74cc1dc7a3Sopenharmony_ci vfloat4 center_color = blk.texel(sample); 75cc1dc7a3Sopenharmony_ci cluster_centers[clusters_selected] = center_color; 76cc1dc7a3Sopenharmony_ci clusters_selected++; 77cc1dc7a3Sopenharmony_ci 78cc1dc7a3Sopenharmony_ci // Compute the distance to the first cluster center 79cc1dc7a3Sopenharmony_ci float distance_sum = 0.0f; 80cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i++) 81cc1dc7a3Sopenharmony_ci { 82cc1dc7a3Sopenharmony_ci vfloat4 color = blk.texel(i); 83cc1dc7a3Sopenharmony_ci vfloat4 diff = color - center_color; 84cc1dc7a3Sopenharmony_ci float distance = dot_s(diff * diff, blk.channel_weight); 85cc1dc7a3Sopenharmony_ci distance_sum += distance; 86cc1dc7a3Sopenharmony_ci distances[i] = distance; 87cc1dc7a3Sopenharmony_ci } 88cc1dc7a3Sopenharmony_ci 89cc1dc7a3Sopenharmony_ci // More numbers from random.org for weighted-random center selection 90cc1dc7a3Sopenharmony_ci const float cluster_cutoffs[9] { 91cc1dc7a3Sopenharmony_ci 0.626220f, 0.932770f, 0.275454f, 92cc1dc7a3Sopenharmony_ci 0.318558f, 0.240113f, 0.009190f, 93cc1dc7a3Sopenharmony_ci 0.347661f, 0.731960f, 0.156391f 94cc1dc7a3Sopenharmony_ci }; 95cc1dc7a3Sopenharmony_ci 96cc1dc7a3Sopenharmony_ci unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2); 97cc1dc7a3Sopenharmony_ci 98cc1dc7a3Sopenharmony_ci // Pick the remaining samples as needed 99cc1dc7a3Sopenharmony_ci while (true) 100cc1dc7a3Sopenharmony_ci { 101cc1dc7a3Sopenharmony_ci // Pick the next center in a weighted-random fashion. 102cc1dc7a3Sopenharmony_ci float summa = 0.0f; 103cc1dc7a3Sopenharmony_ci float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++]; 104cc1dc7a3Sopenharmony_ci for (sample = 0; sample < texel_count; sample++) 105cc1dc7a3Sopenharmony_ci { 106cc1dc7a3Sopenharmony_ci summa += distances[sample]; 107cc1dc7a3Sopenharmony_ci if (summa >= distance_cutoff) 108cc1dc7a3Sopenharmony_ci { 109cc1dc7a3Sopenharmony_ci break; 110cc1dc7a3Sopenharmony_ci } 111cc1dc7a3Sopenharmony_ci } 112cc1dc7a3Sopenharmony_ci 113cc1dc7a3Sopenharmony_ci // Clamp to a valid range and store the selected cluster center 114cc1dc7a3Sopenharmony_ci sample = astc::min(sample, texel_count - 1); 115cc1dc7a3Sopenharmony_ci 116cc1dc7a3Sopenharmony_ci center_color = blk.texel(sample); 117cc1dc7a3Sopenharmony_ci cluster_centers[clusters_selected++] = center_color; 118cc1dc7a3Sopenharmony_ci if (clusters_selected >= partition_count) 119cc1dc7a3Sopenharmony_ci { 120cc1dc7a3Sopenharmony_ci break; 121cc1dc7a3Sopenharmony_ci } 122cc1dc7a3Sopenharmony_ci 123cc1dc7a3Sopenharmony_ci // Compute the distance to the new cluster center, keep the min dist 124cc1dc7a3Sopenharmony_ci distance_sum = 0.0f; 125cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i++) 126cc1dc7a3Sopenharmony_ci { 127cc1dc7a3Sopenharmony_ci vfloat4 color = blk.texel(i); 128cc1dc7a3Sopenharmony_ci vfloat4 diff = color - center_color; 129cc1dc7a3Sopenharmony_ci float distance = dot_s(diff * diff, blk.channel_weight); 130cc1dc7a3Sopenharmony_ci distance = astc::min(distance, distances[i]); 131cc1dc7a3Sopenharmony_ci distance_sum += distance; 132cc1dc7a3Sopenharmony_ci distances[i] = distance; 133cc1dc7a3Sopenharmony_ci } 134cc1dc7a3Sopenharmony_ci } 135cc1dc7a3Sopenharmony_ci} 136cc1dc7a3Sopenharmony_ci 137cc1dc7a3Sopenharmony_ci/** 138cc1dc7a3Sopenharmony_ci * @brief Assign texels to clusters, based on a set of chosen center points. 139cc1dc7a3Sopenharmony_ci * 140cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 141cc1dc7a3Sopenharmony_ci * @param texel_count The number of texels in the block. 142cc1dc7a3Sopenharmony_ci * @param partition_count The number of partitions in the block. 143cc1dc7a3Sopenharmony_ci * @param cluster_centers The partition cluster center colors. 144cc1dc7a3Sopenharmony_ci * @param[out] partition_of_texel The partition assigned for each texel. 145cc1dc7a3Sopenharmony_ci */ 146cc1dc7a3Sopenharmony_cistatic void kmeans_assign( 147cc1dc7a3Sopenharmony_ci const image_block& blk, 148cc1dc7a3Sopenharmony_ci unsigned int texel_count, 149cc1dc7a3Sopenharmony_ci unsigned int partition_count, 150cc1dc7a3Sopenharmony_ci const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], 151cc1dc7a3Sopenharmony_ci uint8_t partition_of_texel[BLOCK_MAX_TEXELS] 152cc1dc7a3Sopenharmony_ci) { 153cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 154cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 155cc1dc7a3Sopenharmony_ci 156cc1dc7a3Sopenharmony_ci uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; 157cc1dc7a3Sopenharmony_ci 158cc1dc7a3Sopenharmony_ci // Find the best partition for every texel 159cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i++) 160cc1dc7a3Sopenharmony_ci { 161cc1dc7a3Sopenharmony_ci float best_distance = std::numeric_limits<float>::max(); 162cc1dc7a3Sopenharmony_ci unsigned int best_partition = 0; 163cc1dc7a3Sopenharmony_ci 164cc1dc7a3Sopenharmony_ci vfloat4 color = blk.texel(i); 165cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_count; j++) 166cc1dc7a3Sopenharmony_ci { 167cc1dc7a3Sopenharmony_ci vfloat4 diff = color - cluster_centers[j]; 168cc1dc7a3Sopenharmony_ci float distance = dot_s(diff * diff, blk.channel_weight); 169cc1dc7a3Sopenharmony_ci if (distance < best_distance) 170cc1dc7a3Sopenharmony_ci { 171cc1dc7a3Sopenharmony_ci best_distance = distance; 172cc1dc7a3Sopenharmony_ci best_partition = j; 173cc1dc7a3Sopenharmony_ci } 174cc1dc7a3Sopenharmony_ci } 175cc1dc7a3Sopenharmony_ci 176cc1dc7a3Sopenharmony_ci partition_of_texel[i] = static_cast<uint8_t>(best_partition); 177cc1dc7a3Sopenharmony_ci partition_texel_count[best_partition]++; 178cc1dc7a3Sopenharmony_ci } 179cc1dc7a3Sopenharmony_ci 180cc1dc7a3Sopenharmony_ci // It is possible to get a situation where a partition ends up without any texels. In this case, 181cc1dc7a3Sopenharmony_ci // assign texel N to partition N. This is silly, but ensures that every partition retains at 182cc1dc7a3Sopenharmony_ci // least one texel. Reassigning a texel in this manner may cause another partition to go empty, 183cc1dc7a3Sopenharmony_ci // so if we actually did a reassignment, run the whole loop over again. 184cc1dc7a3Sopenharmony_ci bool problem_case; 185cc1dc7a3Sopenharmony_ci do 186cc1dc7a3Sopenharmony_ci { 187cc1dc7a3Sopenharmony_ci problem_case = false; 188cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 189cc1dc7a3Sopenharmony_ci { 190cc1dc7a3Sopenharmony_ci if (partition_texel_count[i] == 0) 191cc1dc7a3Sopenharmony_ci { 192cc1dc7a3Sopenharmony_ci partition_texel_count[partition_of_texel[i]]--; 193cc1dc7a3Sopenharmony_ci partition_texel_count[i]++; 194cc1dc7a3Sopenharmony_ci partition_of_texel[i] = static_cast<uint8_t>(i); 195cc1dc7a3Sopenharmony_ci problem_case = true; 196cc1dc7a3Sopenharmony_ci } 197cc1dc7a3Sopenharmony_ci } 198cc1dc7a3Sopenharmony_ci } while (problem_case); 199cc1dc7a3Sopenharmony_ci} 200cc1dc7a3Sopenharmony_ci 201cc1dc7a3Sopenharmony_ci/** 202cc1dc7a3Sopenharmony_ci * @brief Compute new cluster centers based on their center of gravity. 203cc1dc7a3Sopenharmony_ci * 204cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 205cc1dc7a3Sopenharmony_ci * @param texel_count The number of texels in the block. 206cc1dc7a3Sopenharmony_ci * @param partition_count The number of partitions in the block. 207cc1dc7a3Sopenharmony_ci * @param[out] cluster_centers The new cluster center colors. 208cc1dc7a3Sopenharmony_ci * @param partition_of_texel The partition assigned for each texel. 209cc1dc7a3Sopenharmony_ci */ 210cc1dc7a3Sopenharmony_cistatic void kmeans_update( 211cc1dc7a3Sopenharmony_ci const image_block& blk, 212cc1dc7a3Sopenharmony_ci unsigned int texel_count, 213cc1dc7a3Sopenharmony_ci unsigned int partition_count, 214cc1dc7a3Sopenharmony_ci vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], 215cc1dc7a3Sopenharmony_ci const uint8_t partition_of_texel[BLOCK_MAX_TEXELS] 216cc1dc7a3Sopenharmony_ci) { 217cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 218cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 219cc1dc7a3Sopenharmony_ci 220cc1dc7a3Sopenharmony_ci vfloat4 color_sum[BLOCK_MAX_PARTITIONS] { 221cc1dc7a3Sopenharmony_ci vfloat4::zero(), 222cc1dc7a3Sopenharmony_ci vfloat4::zero(), 223cc1dc7a3Sopenharmony_ci vfloat4::zero(), 224cc1dc7a3Sopenharmony_ci vfloat4::zero() 225cc1dc7a3Sopenharmony_ci }; 226cc1dc7a3Sopenharmony_ci 227cc1dc7a3Sopenharmony_ci uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; 228cc1dc7a3Sopenharmony_ci 229cc1dc7a3Sopenharmony_ci // Find the center-of-gravity in each cluster 230cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i++) 231cc1dc7a3Sopenharmony_ci { 232cc1dc7a3Sopenharmony_ci uint8_t partition = partition_of_texel[i]; 233cc1dc7a3Sopenharmony_ci color_sum[partition] += blk.texel(i); 234cc1dc7a3Sopenharmony_ci partition_texel_count[partition]++; 235cc1dc7a3Sopenharmony_ci } 236cc1dc7a3Sopenharmony_ci 237cc1dc7a3Sopenharmony_ci // Set the center of gravity to be the new cluster center 238cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 239cc1dc7a3Sopenharmony_ci { 240cc1dc7a3Sopenharmony_ci float scale = 1.0f / static_cast<float>(partition_texel_count[i]); 241cc1dc7a3Sopenharmony_ci cluster_centers[i] = color_sum[i] * scale; 242cc1dc7a3Sopenharmony_ci } 243cc1dc7a3Sopenharmony_ci} 244cc1dc7a3Sopenharmony_ci 245cc1dc7a3Sopenharmony_ci/** 246cc1dc7a3Sopenharmony_ci * @brief Compute bit-mismatch for partitioning in 2-partition mode. 247cc1dc7a3Sopenharmony_ci * 248cc1dc7a3Sopenharmony_ci * @param a The texel assignment bitvector for the block. 249cc1dc7a3Sopenharmony_ci * @param b The texel assignment bitvector for the partition table. 250cc1dc7a3Sopenharmony_ci * 251cc1dc7a3Sopenharmony_ci * @return The number of bit mismatches. 252cc1dc7a3Sopenharmony_ci */ 253cc1dc7a3Sopenharmony_ci#if ASTCENC_NEON != 0 254cc1dc7a3Sopenharmony_cistatic inline uint8_t partition_mismatch2( 255cc1dc7a3Sopenharmony_ci const uint64_t a[2], 256cc1dc7a3Sopenharmony_ci const uint64_t b[2] 257cc1dc7a3Sopenharmony_ci) { 258cc1dc7a3Sopenharmony_ci uint64x2_t a01 = vld1q_u64(a); 259cc1dc7a3Sopenharmony_ci uint64x2_t b01 = vld1q_u64(b); 260cc1dc7a3Sopenharmony_ci uint64x2_t b10 = vextq_u64(b01, b01, 1); 261cc1dc7a3Sopenharmony_ci uint8_t c1 = popcount(veorq_u64(a01, b01)); 262cc1dc7a3Sopenharmony_ci uint8_t c2 = popcount(veorq_u64(a01, b10)); 263cc1dc7a3Sopenharmony_ci return static_cast<uint8_t>(astc::min(c1, c2) / 2); // 2 is the number of partitions 264cc1dc7a3Sopenharmony_ci} 265cc1dc7a3Sopenharmony_ci#else 266cc1dc7a3Sopenharmony_cistatic inline uint8_t partition_mismatch2( 267cc1dc7a3Sopenharmony_ci const uint64_t a[2], 268cc1dc7a3Sopenharmony_ci const uint64_t b[2] 269cc1dc7a3Sopenharmony_ci) { 270cc1dc7a3Sopenharmony_ci int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]); 271cc1dc7a3Sopenharmony_ci int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]); 272cc1dc7a3Sopenharmony_ci 273cc1dc7a3Sopenharmony_ci // Divide by 2 because XOR always counts errors twice, once when missing 274cc1dc7a3Sopenharmony_ci // in the expected position, and again when present in the wrong partition 275cc1dc7a3Sopenharmony_ci return static_cast<uint8_t>(astc::min(v1, v2) / 2); 276cc1dc7a3Sopenharmony_ci} 277cc1dc7a3Sopenharmony_ci#endif 278cc1dc7a3Sopenharmony_ci 279cc1dc7a3Sopenharmony_ci/** 280cc1dc7a3Sopenharmony_ci * @brief Compute bit-mismatch for partitioning in 3-partition mode. 281cc1dc7a3Sopenharmony_ci * 282cc1dc7a3Sopenharmony_ci * @param a The texel assignment bitvector for the block. 283cc1dc7a3Sopenharmony_ci * @param b The texel assignment bitvector for the partition table. 284cc1dc7a3Sopenharmony_ci * 285cc1dc7a3Sopenharmony_ci * @return The number of bit mismatches. 286cc1dc7a3Sopenharmony_ci */ 287cc1dc7a3Sopenharmony_cistatic inline uint8_t partition_mismatch3( 288cc1dc7a3Sopenharmony_ci const uint64_t a[3], 289cc1dc7a3Sopenharmony_ci const uint64_t b[3] 290cc1dc7a3Sopenharmony_ci) { 291cc1dc7a3Sopenharmony_ci int p00 = popcount(a[0] ^ b[0]); 292cc1dc7a3Sopenharmony_ci int p01 = popcount(a[0] ^ b[1]); 293cc1dc7a3Sopenharmony_ci int p02 = popcount(a[0] ^ b[2]); 294cc1dc7a3Sopenharmony_ci 295cc1dc7a3Sopenharmony_ci int p10 = popcount(a[1] ^ b[0]); 296cc1dc7a3Sopenharmony_ci int p11 = popcount(a[1] ^ b[1]); 297cc1dc7a3Sopenharmony_ci int p12 = popcount(a[1] ^ b[2]); 298cc1dc7a3Sopenharmony_ci 299cc1dc7a3Sopenharmony_ci int p20 = popcount(a[2] ^ b[0]); 300cc1dc7a3Sopenharmony_ci int p21 = popcount(a[2] ^ b[1]); 301cc1dc7a3Sopenharmony_ci int p22 = popcount(a[2] ^ b[2]); 302cc1dc7a3Sopenharmony_ci 303cc1dc7a3Sopenharmony_ci int s0 = p11 + p22; 304cc1dc7a3Sopenharmony_ci int s1 = p12 + p21; 305cc1dc7a3Sopenharmony_ci int v0 = astc::min(s0, s1) + p00; 306cc1dc7a3Sopenharmony_ci 307cc1dc7a3Sopenharmony_ci int s2 = p10 + p22; 308cc1dc7a3Sopenharmony_ci int s3 = p12 + p20; 309cc1dc7a3Sopenharmony_ci int v1 = astc::min(s2, s3) + p01; 310cc1dc7a3Sopenharmony_ci 311cc1dc7a3Sopenharmony_ci int s4 = p10 + p21; 312cc1dc7a3Sopenharmony_ci int s5 = p11 + p20; 313cc1dc7a3Sopenharmony_ci int v2 = astc::min(s4, s5) + p02; 314cc1dc7a3Sopenharmony_ci 315cc1dc7a3Sopenharmony_ci // Divide by 2 because XOR always counts errors twice, once when missing 316cc1dc7a3Sopenharmony_ci // in the expected position, and again when present in the wrong partition 317cc1dc7a3Sopenharmony_ci return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2); 318cc1dc7a3Sopenharmony_ci} 319cc1dc7a3Sopenharmony_ci 320cc1dc7a3Sopenharmony_ci/** 321cc1dc7a3Sopenharmony_ci * @brief Compute bit-mismatch for partitioning in 4-partition mode. 322cc1dc7a3Sopenharmony_ci * 323cc1dc7a3Sopenharmony_ci * @param a The texel assignment bitvector for the block. 324cc1dc7a3Sopenharmony_ci * @param b The texel assignment bitvector for the partition table. 325cc1dc7a3Sopenharmony_ci * 326cc1dc7a3Sopenharmony_ci * @return The number of bit mismatches. 327cc1dc7a3Sopenharmony_ci */ 328cc1dc7a3Sopenharmony_cistatic inline uint8_t partition_mismatch4( 329cc1dc7a3Sopenharmony_ci const uint64_t a[4], 330cc1dc7a3Sopenharmony_ci const uint64_t b[4] 331cc1dc7a3Sopenharmony_ci) { 332cc1dc7a3Sopenharmony_ci int p00 = popcount(a[0] ^ b[0]); 333cc1dc7a3Sopenharmony_ci int p01 = popcount(a[0] ^ b[1]); 334cc1dc7a3Sopenharmony_ci int p02 = popcount(a[0] ^ b[2]); 335cc1dc7a3Sopenharmony_ci int p03 = popcount(a[0] ^ b[3]); 336cc1dc7a3Sopenharmony_ci 337cc1dc7a3Sopenharmony_ci int p10 = popcount(a[1] ^ b[0]); 338cc1dc7a3Sopenharmony_ci int p11 = popcount(a[1] ^ b[1]); 339cc1dc7a3Sopenharmony_ci int p12 = popcount(a[1] ^ b[2]); 340cc1dc7a3Sopenharmony_ci int p13 = popcount(a[1] ^ b[3]); 341cc1dc7a3Sopenharmony_ci 342cc1dc7a3Sopenharmony_ci int p20 = popcount(a[2] ^ b[0]); 343cc1dc7a3Sopenharmony_ci int p21 = popcount(a[2] ^ b[1]); 344cc1dc7a3Sopenharmony_ci int p22 = popcount(a[2] ^ b[2]); 345cc1dc7a3Sopenharmony_ci int p23 = popcount(a[2] ^ b[3]); 346cc1dc7a3Sopenharmony_ci 347cc1dc7a3Sopenharmony_ci int p30 = popcount(a[3] ^ b[0]); 348cc1dc7a3Sopenharmony_ci int p31 = popcount(a[3] ^ b[1]); 349cc1dc7a3Sopenharmony_ci int p32 = popcount(a[3] ^ b[2]); 350cc1dc7a3Sopenharmony_ci int p33 = popcount(a[3] ^ b[3]); 351cc1dc7a3Sopenharmony_ci 352cc1dc7a3Sopenharmony_ci int mx23 = astc::min(p22 + p33, p23 + p32); 353cc1dc7a3Sopenharmony_ci int mx13 = astc::min(p21 + p33, p23 + p31); 354cc1dc7a3Sopenharmony_ci int mx12 = astc::min(p21 + p32, p22 + p31); 355cc1dc7a3Sopenharmony_ci int mx03 = astc::min(p20 + p33, p23 + p30); 356cc1dc7a3Sopenharmony_ci int mx02 = astc::min(p20 + p32, p22 + p30); 357cc1dc7a3Sopenharmony_ci int mx01 = astc::min(p21 + p30, p20 + p31); 358cc1dc7a3Sopenharmony_ci 359cc1dc7a3Sopenharmony_ci int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12); 360cc1dc7a3Sopenharmony_ci int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02); 361cc1dc7a3Sopenharmony_ci int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01); 362cc1dc7a3Sopenharmony_ci int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12); 363cc1dc7a3Sopenharmony_ci 364cc1dc7a3Sopenharmony_ci // Divide by 2 because XOR always counts errors twice, once when missing 365cc1dc7a3Sopenharmony_ci // in the expected position, and again when present in the wrong partition 366cc1dc7a3Sopenharmony_ci return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2); 367cc1dc7a3Sopenharmony_ci} 368cc1dc7a3Sopenharmony_ci 369cc1dc7a3Sopenharmony_ciusing mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*); 370cc1dc7a3Sopenharmony_ci 371cc1dc7a3Sopenharmony_ci/** 372cc1dc7a3Sopenharmony_ci * @brief Count the partition table mismatches vs the data clustering. 373cc1dc7a3Sopenharmony_ci * 374cc1dc7a3Sopenharmony_ci * @param bsd The block size information. 375cc1dc7a3Sopenharmony_ci * @param partition_count The number of partitions in the block. 376cc1dc7a3Sopenharmony_ci * @param bitmaps The block texel partition assignment patterns. 377cc1dc7a3Sopenharmony_ci * @param[out] mismatch_counts The array storing per partitioning mismatch counts. 378cc1dc7a3Sopenharmony_ci */ 379cc1dc7a3Sopenharmony_cistatic void count_partition_mismatch_bits( 380cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 381cc1dc7a3Sopenharmony_ci unsigned int partition_count, 382cc1dc7a3Sopenharmony_ci const uint64_t bitmaps[BLOCK_MAX_PARTITIONS], 383cc1dc7a3Sopenharmony_ci uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS] 384cc1dc7a3Sopenharmony_ci) { 385cc1dc7a3Sopenharmony_ci unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1]; 386cc1dc7a3Sopenharmony_ci promise(active_count > 0); 387cc1dc7a3Sopenharmony_ci 388cc1dc7a3Sopenharmony_ci if (partition_count == 2) 389cc1dc7a3Sopenharmony_ci { 390cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < active_count; i++) 391cc1dc7a3Sopenharmony_ci { 392cc1dc7a3Sopenharmony_ci mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]); 393cc1dc7a3Sopenharmony_ci assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); 394cc1dc7a3Sopenharmony_ci assert(mismatch_counts[i] < bsd.texel_count); 395cc1dc7a3Sopenharmony_ci } 396cc1dc7a3Sopenharmony_ci } 397cc1dc7a3Sopenharmony_ci else if (partition_count == 3) 398cc1dc7a3Sopenharmony_ci { 399cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < active_count; i++) 400cc1dc7a3Sopenharmony_ci { 401cc1dc7a3Sopenharmony_ci mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]); 402cc1dc7a3Sopenharmony_ci assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); 403cc1dc7a3Sopenharmony_ci assert(mismatch_counts[i] < bsd.texel_count); 404cc1dc7a3Sopenharmony_ci } 405cc1dc7a3Sopenharmony_ci } 406cc1dc7a3Sopenharmony_ci else 407cc1dc7a3Sopenharmony_ci { 408cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < active_count; i++) 409cc1dc7a3Sopenharmony_ci { 410cc1dc7a3Sopenharmony_ci mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]); 411cc1dc7a3Sopenharmony_ci assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); 412cc1dc7a3Sopenharmony_ci assert(mismatch_counts[i] < bsd.texel_count); 413cc1dc7a3Sopenharmony_ci } 414cc1dc7a3Sopenharmony_ci } 415cc1dc7a3Sopenharmony_ci} 416cc1dc7a3Sopenharmony_ci 417cc1dc7a3Sopenharmony_ci/** 418cc1dc7a3Sopenharmony_ci * @brief Use counting sort on the mismatch array to sort partition candidates. 419cc1dc7a3Sopenharmony_ci * 420cc1dc7a3Sopenharmony_ci * @param partitioning_count The number of packed partitionings. 421cc1dc7a3Sopenharmony_ci * @param mismatch_count Partitioning mismatch counts, in index order. 422cc1dc7a3Sopenharmony_ci * @param[out] partition_ordering Partition index values, in mismatch order. 423cc1dc7a3Sopenharmony_ci * 424cc1dc7a3Sopenharmony_ci * @return The number of active partitions in this selection. 425cc1dc7a3Sopenharmony_ci */ 426cc1dc7a3Sopenharmony_cistatic unsigned int get_partition_ordering_by_mismatch_bits( 427cc1dc7a3Sopenharmony_ci unsigned int texel_count, 428cc1dc7a3Sopenharmony_ci unsigned int partitioning_count, 429cc1dc7a3Sopenharmony_ci const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS], 430cc1dc7a3Sopenharmony_ci uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS] 431cc1dc7a3Sopenharmony_ci) { 432cc1dc7a3Sopenharmony_ci promise(partitioning_count > 0); 433cc1dc7a3Sopenharmony_ci uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 }; 434cc1dc7a3Sopenharmony_ci 435cc1dc7a3Sopenharmony_ci // Create the histogram of mismatch counts 436cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partitioning_count; i++) 437cc1dc7a3Sopenharmony_ci { 438cc1dc7a3Sopenharmony_ci mscount[mismatch_count[i]]++; 439cc1dc7a3Sopenharmony_ci } 440cc1dc7a3Sopenharmony_ci 441cc1dc7a3Sopenharmony_ci // Create a running sum from the histogram array 442cc1dc7a3Sopenharmony_ci // Cells store previous values only; i.e. exclude self after sum 443cc1dc7a3Sopenharmony_ci unsigned int sum = 0; 444cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i++) 445cc1dc7a3Sopenharmony_ci { 446cc1dc7a3Sopenharmony_ci uint16_t cnt = mscount[i]; 447cc1dc7a3Sopenharmony_ci mscount[i] = sum; 448cc1dc7a3Sopenharmony_ci sum += cnt; 449cc1dc7a3Sopenharmony_ci } 450cc1dc7a3Sopenharmony_ci 451cc1dc7a3Sopenharmony_ci // Use the running sum as the index, incrementing after read to allow 452cc1dc7a3Sopenharmony_ci // sequential entries with the same count 453cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partitioning_count; i++) 454cc1dc7a3Sopenharmony_ci { 455cc1dc7a3Sopenharmony_ci unsigned int idx = mscount[mismatch_count[i]]++; 456cc1dc7a3Sopenharmony_ci partition_ordering[idx] = static_cast<uint16_t>(i); 457cc1dc7a3Sopenharmony_ci } 458cc1dc7a3Sopenharmony_ci 459cc1dc7a3Sopenharmony_ci return partitioning_count; 460cc1dc7a3Sopenharmony_ci} 461cc1dc7a3Sopenharmony_ci 462cc1dc7a3Sopenharmony_ci/** 463cc1dc7a3Sopenharmony_ci * @brief Use k-means clustering to compute a partition ordering for a block.. 464cc1dc7a3Sopenharmony_ci * 465cc1dc7a3Sopenharmony_ci * @param bsd The block size information. 466cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 467cc1dc7a3Sopenharmony_ci * @param partition_count The desired number of partitions in the block. 468cc1dc7a3Sopenharmony_ci * @param[out] partition_ordering The list of recommended partition indices, in priority order. 469cc1dc7a3Sopenharmony_ci * 470cc1dc7a3Sopenharmony_ci * @return The number of active partitionings in this selection. 471cc1dc7a3Sopenharmony_ci */ 472cc1dc7a3Sopenharmony_cistatic unsigned int compute_kmeans_partition_ordering( 473cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 474cc1dc7a3Sopenharmony_ci const image_block& blk, 475cc1dc7a3Sopenharmony_ci unsigned int partition_count, 476cc1dc7a3Sopenharmony_ci uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS] 477cc1dc7a3Sopenharmony_ci) { 478cc1dc7a3Sopenharmony_ci vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]; 479cc1dc7a3Sopenharmony_ci uint8_t texel_partitions[BLOCK_MAX_TEXELS]; 480cc1dc7a3Sopenharmony_ci 481cc1dc7a3Sopenharmony_ci // Use three passes of k-means clustering to partition the block data 482cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < 3; i++) 483cc1dc7a3Sopenharmony_ci { 484cc1dc7a3Sopenharmony_ci if (i == 0) 485cc1dc7a3Sopenharmony_ci { 486cc1dc7a3Sopenharmony_ci kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers); 487cc1dc7a3Sopenharmony_ci } 488cc1dc7a3Sopenharmony_ci else 489cc1dc7a3Sopenharmony_ci { 490cc1dc7a3Sopenharmony_ci kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); 491cc1dc7a3Sopenharmony_ci } 492cc1dc7a3Sopenharmony_ci 493cc1dc7a3Sopenharmony_ci kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); 494cc1dc7a3Sopenharmony_ci } 495cc1dc7a3Sopenharmony_ci 496cc1dc7a3Sopenharmony_ci // Construct the block bitmaps of texel assignments to each partition 497cc1dc7a3Sopenharmony_ci uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 }; 498cc1dc7a3Sopenharmony_ci unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS); 499cc1dc7a3Sopenharmony_ci promise(texels_to_process > 0); 500cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texels_to_process; i++) 501cc1dc7a3Sopenharmony_ci { 502cc1dc7a3Sopenharmony_ci unsigned int idx = bsd.kmeans_texels[i]; 503cc1dc7a3Sopenharmony_ci bitmaps[texel_partitions[idx]] |= 1ULL << i; 504cc1dc7a3Sopenharmony_ci } 505cc1dc7a3Sopenharmony_ci 506cc1dc7a3Sopenharmony_ci // Count the mismatch between the block and the format's partition tables 507cc1dc7a3Sopenharmony_ci uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]; 508cc1dc7a3Sopenharmony_ci count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts); 509cc1dc7a3Sopenharmony_ci 510cc1dc7a3Sopenharmony_ci // Sort the partitions based on the number of mismatched bits 511cc1dc7a3Sopenharmony_ci return get_partition_ordering_by_mismatch_bits( 512cc1dc7a3Sopenharmony_ci texels_to_process, 513cc1dc7a3Sopenharmony_ci bsd.partitioning_count_selected[partition_count - 1], 514cc1dc7a3Sopenharmony_ci mismatch_counts, partition_ordering); 515cc1dc7a3Sopenharmony_ci} 516cc1dc7a3Sopenharmony_ci 517cc1dc7a3Sopenharmony_ci/** 518cc1dc7a3Sopenharmony_ci * @brief Insert a partitioning into an order list of results, sorted by error. 519cc1dc7a3Sopenharmony_ci * 520cc1dc7a3Sopenharmony_ci * @param max_values The max number of entries in the best result arrays. 521cc1dc7a3Sopenharmony_ci * @param this_error The error of the new entry. 522cc1dc7a3Sopenharmony_ci * @param this_partition The partition ID of the new entry. 523cc1dc7a3Sopenharmony_ci * @param[out] best_errors The array of best error values. 524cc1dc7a3Sopenharmony_ci * @param[out] best_partitions The array of best partition values. 525cc1dc7a3Sopenharmony_ci */ 526cc1dc7a3Sopenharmony_cistatic void insert_result( 527cc1dc7a3Sopenharmony_ci unsigned int max_values, 528cc1dc7a3Sopenharmony_ci float this_error, 529cc1dc7a3Sopenharmony_ci unsigned int this_partition, 530cc1dc7a3Sopenharmony_ci float* best_errors, 531cc1dc7a3Sopenharmony_ci unsigned int* best_partitions) 532cc1dc7a3Sopenharmony_ci{ 533cc1dc7a3Sopenharmony_ci promise(max_values > 0); 534cc1dc7a3Sopenharmony_ci 535cc1dc7a3Sopenharmony_ci // Don't bother searching if the current worst error beats the new error 536cc1dc7a3Sopenharmony_ci if (this_error >= best_errors[max_values - 1]) 537cc1dc7a3Sopenharmony_ci { 538cc1dc7a3Sopenharmony_ci return; 539cc1dc7a3Sopenharmony_ci } 540cc1dc7a3Sopenharmony_ci 541cc1dc7a3Sopenharmony_ci // Else insert into the list in error-order 542cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < max_values; i++) 543cc1dc7a3Sopenharmony_ci { 544cc1dc7a3Sopenharmony_ci // Existing result is better - move on ... 545cc1dc7a3Sopenharmony_ci if (this_error > best_errors[i]) 546cc1dc7a3Sopenharmony_ci { 547cc1dc7a3Sopenharmony_ci continue; 548cc1dc7a3Sopenharmony_ci } 549cc1dc7a3Sopenharmony_ci 550cc1dc7a3Sopenharmony_ci // Move existing results down one 551cc1dc7a3Sopenharmony_ci for (unsigned int j = max_values - 1; j > i; j--) 552cc1dc7a3Sopenharmony_ci { 553cc1dc7a3Sopenharmony_ci best_errors[j] = best_errors[j - 1]; 554cc1dc7a3Sopenharmony_ci best_partitions[j] = best_partitions[j - 1]; 555cc1dc7a3Sopenharmony_ci } 556cc1dc7a3Sopenharmony_ci 557cc1dc7a3Sopenharmony_ci // Insert new result 558cc1dc7a3Sopenharmony_ci best_errors[i] = this_error; 559cc1dc7a3Sopenharmony_ci best_partitions[i] = this_partition; 560cc1dc7a3Sopenharmony_ci break; 561cc1dc7a3Sopenharmony_ci } 562cc1dc7a3Sopenharmony_ci} 563cc1dc7a3Sopenharmony_ci 564cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 565cc1dc7a3Sopenharmony_ciunsigned int find_best_partition_candidates( 566cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 567cc1dc7a3Sopenharmony_ci const image_block& blk, 568cc1dc7a3Sopenharmony_ci unsigned int partition_count, 569cc1dc7a3Sopenharmony_ci unsigned int partition_search_limit, 570cc1dc7a3Sopenharmony_ci unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES], 571cc1dc7a3Sopenharmony_ci unsigned int requested_candidates 572cc1dc7a3Sopenharmony_ci) { 573cc1dc7a3Sopenharmony_ci // Constant used to estimate quantization error for a given partitioning; the optimal value for 574cc1dc7a3Sopenharmony_ci // this depends on bitrate. These values have been determined empirically. 575cc1dc7a3Sopenharmony_ci unsigned int texels_per_block = bsd.texel_count; 576cc1dc7a3Sopenharmony_ci float weight_imprecision_estim = 0.055f; 577cc1dc7a3Sopenharmony_ci if (texels_per_block <= 20) 578cc1dc7a3Sopenharmony_ci { 579cc1dc7a3Sopenharmony_ci weight_imprecision_estim = 0.03f; 580cc1dc7a3Sopenharmony_ci } 581cc1dc7a3Sopenharmony_ci else if (texels_per_block <= 31) 582cc1dc7a3Sopenharmony_ci { 583cc1dc7a3Sopenharmony_ci weight_imprecision_estim = 0.04f; 584cc1dc7a3Sopenharmony_ci } 585cc1dc7a3Sopenharmony_ci else if (texels_per_block <= 41) 586cc1dc7a3Sopenharmony_ci { 587cc1dc7a3Sopenharmony_ci weight_imprecision_estim = 0.05f; 588cc1dc7a3Sopenharmony_ci } 589cc1dc7a3Sopenharmony_ci 590cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 591cc1dc7a3Sopenharmony_ci promise(partition_search_limit > 0); 592cc1dc7a3Sopenharmony_ci 593cc1dc7a3Sopenharmony_ci weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; 594cc1dc7a3Sopenharmony_ci 595cc1dc7a3Sopenharmony_ci uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS]; 596cc1dc7a3Sopenharmony_ci unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); 597cc1dc7a3Sopenharmony_ci partition_search_limit = astc::min(partition_search_limit, sequence_len); 598cc1dc7a3Sopenharmony_ci requested_candidates = astc::min(partition_search_limit, requested_candidates); 599cc1dc7a3Sopenharmony_ci 600cc1dc7a3Sopenharmony_ci bool uses_alpha = !blk.is_constant_channel(3); 601cc1dc7a3Sopenharmony_ci 602cc1dc7a3Sopenharmony_ci // Partitioning errors assuming uncorrelated-chrominance endpoints 603cc1dc7a3Sopenharmony_ci float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; 604cc1dc7a3Sopenharmony_ci unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES]; 605cc1dc7a3Sopenharmony_ci 606cc1dc7a3Sopenharmony_ci // Partitioning errors assuming same-chrominance endpoints 607cc1dc7a3Sopenharmony_ci float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; 608cc1dc7a3Sopenharmony_ci unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES]; 609cc1dc7a3Sopenharmony_ci 610cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < requested_candidates; i++) 611cc1dc7a3Sopenharmony_ci { 612cc1dc7a3Sopenharmony_ci uncor_best_errors[i] = ERROR_CALC_DEFAULT; 613cc1dc7a3Sopenharmony_ci samec_best_errors[i] = ERROR_CALC_DEFAULT; 614cc1dc7a3Sopenharmony_ci } 615cc1dc7a3Sopenharmony_ci 616cc1dc7a3Sopenharmony_ci if (uses_alpha) 617cc1dc7a3Sopenharmony_ci { 618cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_search_limit; i++) 619cc1dc7a3Sopenharmony_ci { 620cc1dc7a3Sopenharmony_ci unsigned int partition = partition_sequence[i]; 621cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_raw_partition_info(partition_count, partition); 622cc1dc7a3Sopenharmony_ci 623cc1dc7a3Sopenharmony_ci // Compute weighting to give to each component in each partition 624cc1dc7a3Sopenharmony_ci partition_metrics pms[BLOCK_MAX_PARTITIONS]; 625cc1dc7a3Sopenharmony_ci 626cc1dc7a3Sopenharmony_ci compute_avgs_and_dirs_4_comp(pi, blk, pms); 627cc1dc7a3Sopenharmony_ci 628cc1dc7a3Sopenharmony_ci line4 uncor_lines[BLOCK_MAX_PARTITIONS]; 629cc1dc7a3Sopenharmony_ci line4 samec_lines[BLOCK_MAX_PARTITIONS]; 630cc1dc7a3Sopenharmony_ci 631cc1dc7a3Sopenharmony_ci processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS]; 632cc1dc7a3Sopenharmony_ci processed_line4 samec_plines[BLOCK_MAX_PARTITIONS]; 633cc1dc7a3Sopenharmony_ci 634cc1dc7a3Sopenharmony_ci float line_lengths[BLOCK_MAX_PARTITIONS]; 635cc1dc7a3Sopenharmony_ci 636cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_count; j++) 637cc1dc7a3Sopenharmony_ci { 638cc1dc7a3Sopenharmony_ci partition_metrics& pm = pms[j]; 639cc1dc7a3Sopenharmony_ci 640cc1dc7a3Sopenharmony_ci uncor_lines[j].a = pm.avg; 641cc1dc7a3Sopenharmony_ci uncor_lines[j].b = normalize_safe(pm.dir, unit4()); 642cc1dc7a3Sopenharmony_ci 643cc1dc7a3Sopenharmony_ci uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b); 644cc1dc7a3Sopenharmony_ci uncor_plines[j].bs = uncor_lines[j].b; 645cc1dc7a3Sopenharmony_ci 646cc1dc7a3Sopenharmony_ci samec_lines[j].a = vfloat4::zero(); 647cc1dc7a3Sopenharmony_ci samec_lines[j].b = normalize_safe(pm.avg, unit4()); 648cc1dc7a3Sopenharmony_ci 649cc1dc7a3Sopenharmony_ci samec_plines[j].amod = vfloat4::zero(); 650cc1dc7a3Sopenharmony_ci samec_plines[j].bs = samec_lines[j].b; 651cc1dc7a3Sopenharmony_ci } 652cc1dc7a3Sopenharmony_ci 653cc1dc7a3Sopenharmony_ci float uncor_error = 0.0f; 654cc1dc7a3Sopenharmony_ci float samec_error = 0.0f; 655cc1dc7a3Sopenharmony_ci 656cc1dc7a3Sopenharmony_ci compute_error_squared_rgba(pi, 657cc1dc7a3Sopenharmony_ci blk, 658cc1dc7a3Sopenharmony_ci uncor_plines, 659cc1dc7a3Sopenharmony_ci samec_plines, 660cc1dc7a3Sopenharmony_ci line_lengths, 661cc1dc7a3Sopenharmony_ci uncor_error, 662cc1dc7a3Sopenharmony_ci samec_error); 663cc1dc7a3Sopenharmony_ci 664cc1dc7a3Sopenharmony_ci // Compute an estimate of error introduced by weight quantization imprecision. 665cc1dc7a3Sopenharmony_ci // This error is computed as follows, for each partition 666cc1dc7a3Sopenharmony_ci // 1: compute the principal-axis vector (full length) in error-space 667cc1dc7a3Sopenharmony_ci // 2: convert the principal-axis vector to regular RGB-space 668cc1dc7a3Sopenharmony_ci // 3: scale the vector by a constant that estimates average quantization error 669cc1dc7a3Sopenharmony_ci // 4: for each texel, square the vector, then do a dot-product with the texel's 670cc1dc7a3Sopenharmony_ci // error weight; sum up the results across all texels. 671cc1dc7a3Sopenharmony_ci // 4(optimized): square the vector once, then do a dot-product with the average 672cc1dc7a3Sopenharmony_ci // texel error, then multiply by the number of texels. 673cc1dc7a3Sopenharmony_ci 674cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_count; j++) 675cc1dc7a3Sopenharmony_ci { 676cc1dc7a3Sopenharmony_ci float tpp = static_cast<float>(pi.partition_texel_count[j]); 677cc1dc7a3Sopenharmony_ci vfloat4 error_weights(tpp * weight_imprecision_estim); 678cc1dc7a3Sopenharmony_ci 679cc1dc7a3Sopenharmony_ci vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j]; 680cc1dc7a3Sopenharmony_ci vfloat4 samec_vector = samec_lines[j].b * line_lengths[j]; 681cc1dc7a3Sopenharmony_ci 682cc1dc7a3Sopenharmony_ci uncor_error += dot_s(uncor_vector * uncor_vector, error_weights); 683cc1dc7a3Sopenharmony_ci samec_error += dot_s(samec_vector * samec_vector, error_weights); 684cc1dc7a3Sopenharmony_ci } 685cc1dc7a3Sopenharmony_ci 686cc1dc7a3Sopenharmony_ci insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); 687cc1dc7a3Sopenharmony_ci insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); 688cc1dc7a3Sopenharmony_ci } 689cc1dc7a3Sopenharmony_ci } 690cc1dc7a3Sopenharmony_ci else 691cc1dc7a3Sopenharmony_ci { 692cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_search_limit; i++) 693cc1dc7a3Sopenharmony_ci { 694cc1dc7a3Sopenharmony_ci unsigned int partition = partition_sequence[i]; 695cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_raw_partition_info(partition_count, partition); 696cc1dc7a3Sopenharmony_ci 697cc1dc7a3Sopenharmony_ci // Compute weighting to give to each component in each partition 698cc1dc7a3Sopenharmony_ci partition_metrics pms[BLOCK_MAX_PARTITIONS]; 699cc1dc7a3Sopenharmony_ci compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); 700cc1dc7a3Sopenharmony_ci 701cc1dc7a3Sopenharmony_ci partition_lines3 plines[BLOCK_MAX_PARTITIONS]; 702cc1dc7a3Sopenharmony_ci 703cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_count; j++) 704cc1dc7a3Sopenharmony_ci { 705cc1dc7a3Sopenharmony_ci partition_metrics& pm = pms[j]; 706cc1dc7a3Sopenharmony_ci partition_lines3& pl = plines[j]; 707cc1dc7a3Sopenharmony_ci 708cc1dc7a3Sopenharmony_ci pl.uncor_line.a = pm.avg; 709cc1dc7a3Sopenharmony_ci pl.uncor_line.b = normalize_safe(pm.dir, unit3()); 710cc1dc7a3Sopenharmony_ci 711cc1dc7a3Sopenharmony_ci pl.samec_line.a = vfloat4::zero(); 712cc1dc7a3Sopenharmony_ci pl.samec_line.b = normalize_safe(pm.avg, unit3()); 713cc1dc7a3Sopenharmony_ci 714cc1dc7a3Sopenharmony_ci pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b); 715cc1dc7a3Sopenharmony_ci pl.uncor_pline.bs = pl.uncor_line.b; 716cc1dc7a3Sopenharmony_ci 717cc1dc7a3Sopenharmony_ci pl.samec_pline.amod = vfloat4::zero(); 718cc1dc7a3Sopenharmony_ci pl.samec_pline.bs = pl.samec_line.b; 719cc1dc7a3Sopenharmony_ci } 720cc1dc7a3Sopenharmony_ci 721cc1dc7a3Sopenharmony_ci float uncor_error = 0.0f; 722cc1dc7a3Sopenharmony_ci float samec_error = 0.0f; 723cc1dc7a3Sopenharmony_ci 724cc1dc7a3Sopenharmony_ci compute_error_squared_rgb(pi, 725cc1dc7a3Sopenharmony_ci blk, 726cc1dc7a3Sopenharmony_ci plines, 727cc1dc7a3Sopenharmony_ci uncor_error, 728cc1dc7a3Sopenharmony_ci samec_error); 729cc1dc7a3Sopenharmony_ci 730cc1dc7a3Sopenharmony_ci // Compute an estimate of error introduced by weight quantization imprecision. 731cc1dc7a3Sopenharmony_ci // This error is computed as follows, for each partition 732cc1dc7a3Sopenharmony_ci // 1: compute the principal-axis vector (full length) in error-space 733cc1dc7a3Sopenharmony_ci // 2: convert the principal-axis vector to regular RGB-space 734cc1dc7a3Sopenharmony_ci // 3: scale the vector by a constant that estimates average quantization error 735cc1dc7a3Sopenharmony_ci // 4: for each texel, square the vector, then do a dot-product with the texel's 736cc1dc7a3Sopenharmony_ci // error weight; sum up the results across all texels. 737cc1dc7a3Sopenharmony_ci // 4(optimized): square the vector once, then do a dot-product with the average 738cc1dc7a3Sopenharmony_ci // texel error, then multiply by the number of texels. 739cc1dc7a3Sopenharmony_ci 740cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_count; j++) 741cc1dc7a3Sopenharmony_ci { 742cc1dc7a3Sopenharmony_ci partition_lines3& pl = plines[j]; 743cc1dc7a3Sopenharmony_ci 744cc1dc7a3Sopenharmony_ci float tpp = static_cast<float>(pi.partition_texel_count[j]); 745cc1dc7a3Sopenharmony_ci vfloat4 error_weights(tpp * weight_imprecision_estim); 746cc1dc7a3Sopenharmony_ci 747cc1dc7a3Sopenharmony_ci vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length; 748cc1dc7a3Sopenharmony_ci vfloat4 samec_vector = pl.samec_line.b * pl.line_length; 749cc1dc7a3Sopenharmony_ci 750cc1dc7a3Sopenharmony_ci uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights); 751cc1dc7a3Sopenharmony_ci samec_error += dot3_s(samec_vector * samec_vector, error_weights); 752cc1dc7a3Sopenharmony_ci } 753cc1dc7a3Sopenharmony_ci 754cc1dc7a3Sopenharmony_ci insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); 755cc1dc7a3Sopenharmony_ci insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); 756cc1dc7a3Sopenharmony_ci } 757cc1dc7a3Sopenharmony_ci } 758cc1dc7a3Sopenharmony_ci 759cc1dc7a3Sopenharmony_ci unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES]; 760cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < requested_candidates; i++) 761cc1dc7a3Sopenharmony_ci { 762cc1dc7a3Sopenharmony_ci interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index; 763cc1dc7a3Sopenharmony_ci interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index; 764cc1dc7a3Sopenharmony_ci } 765cc1dc7a3Sopenharmony_ci 766cc1dc7a3Sopenharmony_ci uint64_t bitmasks[1024/64] { 0 }; 767cc1dc7a3Sopenharmony_ci unsigned int emitted = 0; 768cc1dc7a3Sopenharmony_ci 769cc1dc7a3Sopenharmony_ci // Deduplicate the first "requested" entries 770cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < requested_candidates * 2; i++) 771cc1dc7a3Sopenharmony_ci { 772cc1dc7a3Sopenharmony_ci unsigned int partition = interleave[i]; 773cc1dc7a3Sopenharmony_ci 774cc1dc7a3Sopenharmony_ci unsigned int word = partition / 64; 775cc1dc7a3Sopenharmony_ci unsigned int bit = partition % 64; 776cc1dc7a3Sopenharmony_ci 777cc1dc7a3Sopenharmony_ci bool written = bitmasks[word] & (1ull << bit); 778cc1dc7a3Sopenharmony_ci 779cc1dc7a3Sopenharmony_ci if (!written) 780cc1dc7a3Sopenharmony_ci { 781cc1dc7a3Sopenharmony_ci best_partitions[emitted] = partition; 782cc1dc7a3Sopenharmony_ci bitmasks[word] |= 1ull << bit; 783cc1dc7a3Sopenharmony_ci emitted++; 784cc1dc7a3Sopenharmony_ci 785cc1dc7a3Sopenharmony_ci if (emitted == requested_candidates) 786cc1dc7a3Sopenharmony_ci { 787cc1dc7a3Sopenharmony_ci break; 788cc1dc7a3Sopenharmony_ci } 789cc1dc7a3Sopenharmony_ci } 790cc1dc7a3Sopenharmony_ci } 791cc1dc7a3Sopenharmony_ci 792cc1dc7a3Sopenharmony_ci return emitted; 793cc1dc7a3Sopenharmony_ci} 794cc1dc7a3Sopenharmony_ci 795cc1dc7a3Sopenharmony_ci#endif 796