1// SPDX-License-Identifier: Apache-2.0 2// ---------------------------------------------------------------------------- 3// Copyright 2011-2023 Arm Limited 4// 5// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6// use this file except in compliance with the License. You may obtain a copy 7// of the License at: 8// 9// http://www.apache.org/licenses/LICENSE-2.0 10// 11// Unless required by applicable law or agreed to in writing, software 12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14// License for the specific language governing permissions and limitations 15// under the License. 16// ---------------------------------------------------------------------------- 17 18/** 19 * @brief Functions for finding dominant direction of a set of colors. 20 */ 21#if !defined(ASTCENC_DECOMPRESS_ONLY) 22 23#include "astcenc_internal.h" 24 25#include <cassert> 26 27/** 28 * @brief Compute the average RGB color of each partition. 29 * 30 * The algorithm here uses a vectorized sequential scan and per-partition 31 * color accumulators, using select() to mask texel lanes in other partitions. 32 * 33 * We only accumulate sums for N-1 partitions during the scan; the value for 34 * the last partition can be computed given that we know the block-wide average 35 * already. 36 * 37 * Because of this we could reduce the loop iteration count so it "just" spans 38 * the max texel index needed for the N-1 partitions, which could need fewer 39 * iterations than the full block texel count. However, this makes the loop 40 * count erratic and causes more branch mispredictions so is a net loss. 41 * 42 * @param pi The partitioning to use. 43 * @param blk The block data to process. 44 * @param[out] averages The output averages. Unused partition indices will 45 * not be initialized, and lane<3> will be zero. 46 */ 47static void compute_partition_averages_rgb( 48 const partition_info& pi, 49 const image_block& blk, 50 vfloat4 averages[BLOCK_MAX_PARTITIONS] 51) { 52 unsigned int partition_count = pi.partition_count; 53 unsigned int texel_count = blk.texel_count; 54 promise(texel_count > 0); 55 56 // For 1 partition just use the precomputed mean 57 if (partition_count == 1) 58 { 59 averages[0] = blk.data_mean.swz<0, 1, 2>(); 60 } 61 // For 2 partitions scan results for partition 0, compute partition 1 62 else if (partition_count == 2) 63 { 64 vfloatacc pp_avg_rgb[3] {}; 65 66 vint lane_id = vint::lane_id(); 67 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 68 { 69 vint texel_partition(pi.partition_of_texel + i); 70 71 vmask lane_mask = lane_id < vint(texel_count); 72 lane_id += vint(ASTCENC_SIMD_WIDTH); 73 74 vmask p0_mask = lane_mask & (texel_partition == vint(0)); 75 76 vfloat data_r = loada(blk.data_r + i); 77 haccumulate(pp_avg_rgb[0], data_r, p0_mask); 78 79 vfloat data_g = loada(blk.data_g + i); 80 haccumulate(pp_avg_rgb[1], data_g, p0_mask); 81 82 vfloat data_b = loada(blk.data_b + i); 83 haccumulate(pp_avg_rgb[2], data_b, p0_mask); 84 } 85 86 vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); 87 88 vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]), 89 hadd_s(pp_avg_rgb[1]), 90 hadd_s(pp_avg_rgb[2])); 91 92 vfloat4 p1_total = block_total - p0_total; 93 94 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); 95 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); 96 } 97 // For 3 partitions scan results for partition 0/1, compute partition 2 98 else if (partition_count == 3) 99 { 100 vfloatacc pp_avg_rgb[2][3] {}; 101 102 vint lane_id = vint::lane_id(); 103 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 104 { 105 vint texel_partition(pi.partition_of_texel + i); 106 107 vmask lane_mask = lane_id < vint(texel_count); 108 lane_id += vint(ASTCENC_SIMD_WIDTH); 109 110 vmask p0_mask = lane_mask & (texel_partition == vint(0)); 111 vmask p1_mask = lane_mask & (texel_partition == vint(1)); 112 113 vfloat data_r = loada(blk.data_r + i); 114 haccumulate(pp_avg_rgb[0][0], data_r, p0_mask); 115 haccumulate(pp_avg_rgb[1][0], data_r, p1_mask); 116 117 vfloat data_g = loada(blk.data_g + i); 118 haccumulate(pp_avg_rgb[0][1], data_g, p0_mask); 119 haccumulate(pp_avg_rgb[1][1], data_g, p1_mask); 120 121 vfloat data_b = loada(blk.data_b + i); 122 haccumulate(pp_avg_rgb[0][2], data_b, p0_mask); 123 haccumulate(pp_avg_rgb[1][2], data_b, p1_mask); 124 } 125 126 vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); 127 128 vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]), 129 hadd_s(pp_avg_rgb[0][1]), 130 hadd_s(pp_avg_rgb[0][2])); 131 132 vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]), 133 hadd_s(pp_avg_rgb[1][1]), 134 hadd_s(pp_avg_rgb[1][2])); 135 136 vfloat4 p2_total = block_total - p0_total - p1_total; 137 138 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); 139 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); 140 averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); 141 } 142 else 143 { 144 // For 4 partitions scan results for partition 0/1/2, compute partition 3 145 vfloatacc pp_avg_rgb[3][3] {}; 146 147 vint lane_id = vint::lane_id(); 148 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 149 { 150 vint texel_partition(pi.partition_of_texel + i); 151 152 vmask lane_mask = lane_id < vint(texel_count); 153 lane_id += vint(ASTCENC_SIMD_WIDTH); 154 155 vmask p0_mask = lane_mask & (texel_partition == vint(0)); 156 vmask p1_mask = lane_mask & (texel_partition == vint(1)); 157 vmask p2_mask = lane_mask & (texel_partition == vint(2)); 158 159 vfloat data_r = loada(blk.data_r + i); 160 haccumulate(pp_avg_rgb[0][0], data_r, p0_mask); 161 haccumulate(pp_avg_rgb[1][0], data_r, p1_mask); 162 haccumulate(pp_avg_rgb[2][0], data_r, p2_mask); 163 164 vfloat data_g = loada(blk.data_g + i); 165 haccumulate(pp_avg_rgb[0][1], data_g, p0_mask); 166 haccumulate(pp_avg_rgb[1][1], data_g, p1_mask); 167 haccumulate(pp_avg_rgb[2][1], data_g, p2_mask); 168 169 vfloat data_b = loada(blk.data_b + i); 170 haccumulate(pp_avg_rgb[0][2], data_b, p0_mask); 171 haccumulate(pp_avg_rgb[1][2], data_b, p1_mask); 172 haccumulate(pp_avg_rgb[2][2], data_b, p2_mask); 173 } 174 175 vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); 176 177 vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]), 178 hadd_s(pp_avg_rgb[0][1]), 179 hadd_s(pp_avg_rgb[0][2])); 180 181 vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]), 182 hadd_s(pp_avg_rgb[1][1]), 183 hadd_s(pp_avg_rgb[1][2])); 184 185 vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]), 186 hadd_s(pp_avg_rgb[2][1]), 187 hadd_s(pp_avg_rgb[2][2])); 188 189 vfloat4 p3_total = block_total - p0_total - p1_total- p2_total; 190 191 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); 192 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); 193 averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); 194 averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]); 195 } 196} 197 198/** 199 * @brief Compute the average RGBA color of each partition. 200 * 201 * The algorithm here uses a vectorized sequential scan and per-partition 202 * color accumulators, using select() to mask texel lanes in other partitions. 203 * 204 * We only accumulate sums for N-1 partitions during the scan; the value for 205 * the last partition can be computed given that we know the block-wide average 206 * already. 207 * 208 * Because of this we could reduce the loop iteration count so it "just" spans 209 * the max texel index needed for the N-1 partitions, which could need fewer 210 * iterations than the full block texel count. However, this makes the loop 211 * count erratic and causes more branch mispredictions so is a net loss. 212 * 213 * @param pi The partitioning to use. 214 * @param blk The block data to process. 215 * @param[out] averages The output averages. Unused partition indices will 216 * not be initialized. 217 */ 218static void compute_partition_averages_rgba( 219 const partition_info& pi, 220 const image_block& blk, 221 vfloat4 averages[BLOCK_MAX_PARTITIONS] 222) { 223 unsigned int partition_count = pi.partition_count; 224 unsigned int texel_count = blk.texel_count; 225 promise(texel_count > 0); 226 227 // For 1 partition just use the precomputed mean 228 if (partition_count == 1) 229 { 230 averages[0] = blk.data_mean; 231 } 232 // For 2 partitions scan results for partition 0, compute partition 1 233 else if (partition_count == 2) 234 { 235 vfloat4 pp_avg_rgba[4] {}; 236 237 vint lane_id = vint::lane_id(); 238 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 239 { 240 vint texel_partition(pi.partition_of_texel + i); 241 242 vmask lane_mask = lane_id < vint(texel_count); 243 lane_id += vint(ASTCENC_SIMD_WIDTH); 244 245 vmask p0_mask = lane_mask & (texel_partition == vint(0)); 246 247 vfloat data_r = loada(blk.data_r + i); 248 haccumulate(pp_avg_rgba[0], data_r, p0_mask); 249 250 vfloat data_g = loada(blk.data_g + i); 251 haccumulate(pp_avg_rgba[1], data_g, p0_mask); 252 253 vfloat data_b = loada(blk.data_b + i); 254 haccumulate(pp_avg_rgba[2], data_b, p0_mask); 255 256 vfloat data_a = loada(blk.data_a + i); 257 haccumulate(pp_avg_rgba[3], data_a, p0_mask); 258 } 259 260 vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); 261 262 vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]), 263 hadd_s(pp_avg_rgba[1]), 264 hadd_s(pp_avg_rgba[2]), 265 hadd_s(pp_avg_rgba[3])); 266 267 vfloat4 p1_total = block_total - p0_total; 268 269 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); 270 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); 271 } 272 // For 3 partitions scan results for partition 0/1, compute partition 2 273 else if (partition_count == 3) 274 { 275 vfloat4 pp_avg_rgba[2][4] {}; 276 277 vint lane_id = vint::lane_id(); 278 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 279 { 280 vint texel_partition(pi.partition_of_texel + i); 281 282 vmask lane_mask = lane_id < vint(texel_count); 283 lane_id += vint(ASTCENC_SIMD_WIDTH); 284 285 vmask p0_mask = lane_mask & (texel_partition == vint(0)); 286 vmask p1_mask = lane_mask & (texel_partition == vint(1)); 287 288 vfloat data_r = loada(blk.data_r + i); 289 haccumulate(pp_avg_rgba[0][0], data_r, p0_mask); 290 haccumulate(pp_avg_rgba[1][0], data_r, p1_mask); 291 292 vfloat data_g = loada(blk.data_g + i); 293 haccumulate(pp_avg_rgba[0][1], data_g, p0_mask); 294 haccumulate(pp_avg_rgba[1][1], data_g, p1_mask); 295 296 vfloat data_b = loada(blk.data_b + i); 297 haccumulate(pp_avg_rgba[0][2], data_b, p0_mask); 298 haccumulate(pp_avg_rgba[1][2], data_b, p1_mask); 299 300 vfloat data_a = loada(blk.data_a + i); 301 haccumulate(pp_avg_rgba[0][3], data_a, p0_mask); 302 haccumulate(pp_avg_rgba[1][3], data_a, p1_mask); 303 } 304 305 vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); 306 307 vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]), 308 hadd_s(pp_avg_rgba[0][1]), 309 hadd_s(pp_avg_rgba[0][2]), 310 hadd_s(pp_avg_rgba[0][3])); 311 312 vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]), 313 hadd_s(pp_avg_rgba[1][1]), 314 hadd_s(pp_avg_rgba[1][2]), 315 hadd_s(pp_avg_rgba[1][3])); 316 317 vfloat4 p2_total = block_total - p0_total - p1_total; 318 319 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); 320 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); 321 averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); 322 } 323 else 324 { 325 // For 4 partitions scan results for partition 0/1/2, compute partition 3 326 vfloat4 pp_avg_rgba[3][4] {}; 327 328 vint lane_id = vint::lane_id(); 329 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 330 { 331 vint texel_partition(pi.partition_of_texel + i); 332 333 vmask lane_mask = lane_id < vint(texel_count); 334 lane_id += vint(ASTCENC_SIMD_WIDTH); 335 336 vmask p0_mask = lane_mask & (texel_partition == vint(0)); 337 vmask p1_mask = lane_mask & (texel_partition == vint(1)); 338 vmask p2_mask = lane_mask & (texel_partition == vint(2)); 339 340 vfloat data_r = loada(blk.data_r + i); 341 haccumulate(pp_avg_rgba[0][0], data_r, p0_mask); 342 haccumulate(pp_avg_rgba[1][0], data_r, p1_mask); 343 haccumulate(pp_avg_rgba[2][0], data_r, p2_mask); 344 345 vfloat data_g = loada(blk.data_g + i); 346 haccumulate(pp_avg_rgba[0][1], data_g, p0_mask); 347 haccumulate(pp_avg_rgba[1][1], data_g, p1_mask); 348 haccumulate(pp_avg_rgba[2][1], data_g, p2_mask); 349 350 vfloat data_b = loada(blk.data_b + i); 351 haccumulate(pp_avg_rgba[0][2], data_b, p0_mask); 352 haccumulate(pp_avg_rgba[1][2], data_b, p1_mask); 353 haccumulate(pp_avg_rgba[2][2], data_b, p2_mask); 354 355 vfloat data_a = loada(blk.data_a + i); 356 haccumulate(pp_avg_rgba[0][3], data_a, p0_mask); 357 haccumulate(pp_avg_rgba[1][3], data_a, p1_mask); 358 haccumulate(pp_avg_rgba[2][3], data_a, p2_mask); 359 } 360 361 vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); 362 363 vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]), 364 hadd_s(pp_avg_rgba[0][1]), 365 hadd_s(pp_avg_rgba[0][2]), 366 hadd_s(pp_avg_rgba[0][3])); 367 368 vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]), 369 hadd_s(pp_avg_rgba[1][1]), 370 hadd_s(pp_avg_rgba[1][2]), 371 hadd_s(pp_avg_rgba[1][3])); 372 373 vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]), 374 hadd_s(pp_avg_rgba[2][1]), 375 hadd_s(pp_avg_rgba[2][2]), 376 hadd_s(pp_avg_rgba[2][3])); 377 378 vfloat4 p3_total = block_total - p0_total - p1_total- p2_total; 379 380 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); 381 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); 382 averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); 383 averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]); 384 } 385} 386 387/* See header for documentation. */ 388void compute_avgs_and_dirs_4_comp( 389 const partition_info& pi, 390 const image_block& blk, 391 partition_metrics pm[BLOCK_MAX_PARTITIONS] 392) { 393 int partition_count = pi.partition_count; 394 promise(partition_count > 0); 395 396 // Pre-compute partition_averages 397 vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; 398 compute_partition_averages_rgba(pi, blk, partition_averages); 399 400 for (int partition = 0; partition < partition_count; partition++) 401 { 402 const uint8_t *texel_indexes = pi.texels_of_partition[partition]; 403 unsigned int texel_count = pi.partition_texel_count[partition]; 404 promise(texel_count > 0); 405 406 vfloat4 average = partition_averages[partition]; 407 pm[partition].avg = average; 408 409 vfloat4 sum_xp = vfloat4::zero(); 410 vfloat4 sum_yp = vfloat4::zero(); 411 vfloat4 sum_zp = vfloat4::zero(); 412 vfloat4 sum_wp = vfloat4::zero(); 413 414 for (unsigned int i = 0; i < texel_count; i++) 415 { 416 unsigned int iwt = texel_indexes[i]; 417 vfloat4 texel_datum = blk.texel(iwt); 418 texel_datum = texel_datum - average; 419 420 vfloat4 zero = vfloat4::zero(); 421 422 vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; 423 sum_xp += select(zero, texel_datum, tdm0); 424 425 vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; 426 sum_yp += select(zero, texel_datum, tdm1); 427 428 vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; 429 sum_zp += select(zero, texel_datum, tdm2); 430 431 vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero; 432 sum_wp += select(zero, texel_datum, tdm3); 433 } 434 435 vfloat4 prod_xp = dot(sum_xp, sum_xp); 436 vfloat4 prod_yp = dot(sum_yp, sum_yp); 437 vfloat4 prod_zp = dot(sum_zp, sum_zp); 438 vfloat4 prod_wp = dot(sum_wp, sum_wp); 439 440 vfloat4 best_vector = sum_xp; 441 vfloat4 best_sum = prod_xp; 442 443 vmask4 mask = prod_yp > best_sum; 444 best_vector = select(best_vector, sum_yp, mask); 445 best_sum = select(best_sum, prod_yp, mask); 446 447 mask = prod_zp > best_sum; 448 best_vector = select(best_vector, sum_zp, mask); 449 best_sum = select(best_sum, prod_zp, mask); 450 451 mask = prod_wp > best_sum; 452 best_vector = select(best_vector, sum_wp, mask); 453 454 pm[partition].dir = best_vector; 455 } 456} 457 458/* See header for documentation. */ 459void compute_avgs_and_dirs_3_comp( 460 const partition_info& pi, 461 const image_block& blk, 462 unsigned int omitted_component, 463 partition_metrics pm[BLOCK_MAX_PARTITIONS] 464) { 465 // Pre-compute partition_averages 466 vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; 467 compute_partition_averages_rgba(pi, blk, partition_averages); 468 469 const float* data_vr = blk.data_r; 470 const float* data_vg = blk.data_g; 471 const float* data_vb = blk.data_b; 472 473 // TODO: Data-driven permute would be useful to avoid this ... 474 if (omitted_component == 0) 475 { 476 partition_averages[0] = partition_averages[0].swz<1, 2, 3>(); 477 partition_averages[1] = partition_averages[1].swz<1, 2, 3>(); 478 partition_averages[2] = partition_averages[2].swz<1, 2, 3>(); 479 partition_averages[3] = partition_averages[3].swz<1, 2, 3>(); 480 481 data_vr = blk.data_g; 482 data_vg = blk.data_b; 483 data_vb = blk.data_a; 484 } 485 else if (omitted_component == 1) 486 { 487 partition_averages[0] = partition_averages[0].swz<0, 2, 3>(); 488 partition_averages[1] = partition_averages[1].swz<0, 2, 3>(); 489 partition_averages[2] = partition_averages[2].swz<0, 2, 3>(); 490 partition_averages[3] = partition_averages[3].swz<0, 2, 3>(); 491 492 data_vg = blk.data_b; 493 data_vb = blk.data_a; 494 } 495 else if (omitted_component == 2) 496 { 497 partition_averages[0] = partition_averages[0].swz<0, 1, 3>(); 498 partition_averages[1] = partition_averages[1].swz<0, 1, 3>(); 499 partition_averages[2] = partition_averages[2].swz<0, 1, 3>(); 500 partition_averages[3] = partition_averages[3].swz<0, 1, 3>(); 501 502 data_vb = blk.data_a; 503 } 504 else 505 { 506 partition_averages[0] = partition_averages[0].swz<0, 1, 2>(); 507 partition_averages[1] = partition_averages[1].swz<0, 1, 2>(); 508 partition_averages[2] = partition_averages[2].swz<0, 1, 2>(); 509 partition_averages[3] = partition_averages[3].swz<0, 1, 2>(); 510 } 511 512 unsigned int partition_count = pi.partition_count; 513 promise(partition_count > 0); 514 515 for (unsigned int partition = 0; partition < partition_count; partition++) 516 { 517 const uint8_t *texel_indexes = pi.texels_of_partition[partition]; 518 unsigned int texel_count = pi.partition_texel_count[partition]; 519 promise(texel_count > 0); 520 521 vfloat4 average = partition_averages[partition]; 522 pm[partition].avg = average; 523 524 vfloat4 sum_xp = vfloat4::zero(); 525 vfloat4 sum_yp = vfloat4::zero(); 526 vfloat4 sum_zp = vfloat4::zero(); 527 528 for (unsigned int i = 0; i < texel_count; i++) 529 { 530 unsigned int iwt = texel_indexes[i]; 531 532 vfloat4 texel_datum = vfloat3(data_vr[iwt], 533 data_vg[iwt], 534 data_vb[iwt]); 535 texel_datum = texel_datum - average; 536 537 vfloat4 zero = vfloat4::zero(); 538 539 vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; 540 sum_xp += select(zero, texel_datum, tdm0); 541 542 vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; 543 sum_yp += select(zero, texel_datum, tdm1); 544 545 vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; 546 sum_zp += select(zero, texel_datum, tdm2); 547 } 548 549 vfloat4 prod_xp = dot(sum_xp, sum_xp); 550 vfloat4 prod_yp = dot(sum_yp, sum_yp); 551 vfloat4 prod_zp = dot(sum_zp, sum_zp); 552 553 vfloat4 best_vector = sum_xp; 554 vfloat4 best_sum = prod_xp; 555 556 vmask4 mask = prod_yp > best_sum; 557 best_vector = select(best_vector, sum_yp, mask); 558 best_sum = select(best_sum, prod_yp, mask); 559 560 mask = prod_zp > best_sum; 561 best_vector = select(best_vector, sum_zp, mask); 562 563 pm[partition].dir = best_vector; 564 } 565} 566 567/* See header for documentation. */ 568void compute_avgs_and_dirs_3_comp_rgb( 569 const partition_info& pi, 570 const image_block& blk, 571 partition_metrics pm[BLOCK_MAX_PARTITIONS] 572) { 573 unsigned int partition_count = pi.partition_count; 574 promise(partition_count > 0); 575 576 // Pre-compute partition_averages 577 vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; 578 compute_partition_averages_rgb(pi, blk, partition_averages); 579 580 for (unsigned int partition = 0; partition < partition_count; partition++) 581 { 582 const uint8_t *texel_indexes = pi.texels_of_partition[partition]; 583 unsigned int texel_count = pi.partition_texel_count[partition]; 584 promise(texel_count > 0); 585 586 vfloat4 average = partition_averages[partition]; 587 pm[partition].avg = average; 588 589 vfloat4 sum_xp = vfloat4::zero(); 590 vfloat4 sum_yp = vfloat4::zero(); 591 vfloat4 sum_zp = vfloat4::zero(); 592 593 for (unsigned int i = 0; i < texel_count; i++) 594 { 595 unsigned int iwt = texel_indexes[i]; 596 597 vfloat4 texel_datum = blk.texel3(iwt); 598 texel_datum = texel_datum - average; 599 600 vfloat4 zero = vfloat4::zero(); 601 602 vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; 603 sum_xp += select(zero, texel_datum, tdm0); 604 605 vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; 606 sum_yp += select(zero, texel_datum, tdm1); 607 608 vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; 609 sum_zp += select(zero, texel_datum, tdm2); 610 } 611 612 vfloat4 prod_xp = dot(sum_xp, sum_xp); 613 vfloat4 prod_yp = dot(sum_yp, sum_yp); 614 vfloat4 prod_zp = dot(sum_zp, sum_zp); 615 616 vfloat4 best_vector = sum_xp; 617 vfloat4 best_sum = prod_xp; 618 619 vmask4 mask = prod_yp > best_sum; 620 best_vector = select(best_vector, sum_yp, mask); 621 best_sum = select(best_sum, prod_yp, mask); 622 623 mask = prod_zp > best_sum; 624 best_vector = select(best_vector, sum_zp, mask); 625 626 pm[partition].dir = best_vector; 627 } 628} 629 630/* See header for documentation. */ 631void compute_avgs_and_dirs_2_comp( 632 const partition_info& pt, 633 const image_block& blk, 634 unsigned int component1, 635 unsigned int component2, 636 partition_metrics pm[BLOCK_MAX_PARTITIONS] 637) { 638 vfloat4 average; 639 640 const float* data_vr = nullptr; 641 const float* data_vg = nullptr; 642 643 if (component1 == 0 && component2 == 1) 644 { 645 average = blk.data_mean.swz<0, 1>(); 646 647 data_vr = blk.data_r; 648 data_vg = blk.data_g; 649 } 650 else if (component1 == 0 && component2 == 2) 651 { 652 average = blk.data_mean.swz<0, 2>(); 653 654 data_vr = blk.data_r; 655 data_vg = blk.data_b; 656 } 657 else // (component1 == 1 && component2 == 2) 658 { 659 assert(component1 == 1 && component2 == 2); 660 661 average = blk.data_mean.swz<1, 2>(); 662 663 data_vr = blk.data_g; 664 data_vg = blk.data_b; 665 } 666 667 unsigned int partition_count = pt.partition_count; 668 promise(partition_count > 0); 669 670 for (unsigned int partition = 0; partition < partition_count; partition++) 671 { 672 const uint8_t *texel_indexes = pt.texels_of_partition[partition]; 673 unsigned int texel_count = pt.partition_texel_count[partition]; 674 promise(texel_count > 0); 675 676 // Only compute a partition mean if more than one partition 677 if (partition_count > 1) 678 { 679 average = vfloat4::zero(); 680 for (unsigned int i = 0; i < texel_count; i++) 681 { 682 unsigned int iwt = texel_indexes[i]; 683 average += vfloat2(data_vr[iwt], data_vg[iwt]); 684 } 685 686 average = average / static_cast<float>(texel_count); 687 } 688 689 pm[partition].avg = average; 690 691 vfloat4 sum_xp = vfloat4::zero(); 692 vfloat4 sum_yp = vfloat4::zero(); 693 694 for (unsigned int i = 0; i < texel_count; i++) 695 { 696 unsigned int iwt = texel_indexes[i]; 697 vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]); 698 texel_datum = texel_datum - average; 699 700 vfloat4 zero = vfloat4::zero(); 701 702 vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; 703 sum_xp += select(zero, texel_datum, tdm0); 704 705 vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; 706 sum_yp += select(zero, texel_datum, tdm1); 707 } 708 709 vfloat4 prod_xp = dot(sum_xp, sum_xp); 710 vfloat4 prod_yp = dot(sum_yp, sum_yp); 711 712 vfloat4 best_vector = sum_xp; 713 vfloat4 best_sum = prod_xp; 714 715 vmask4 mask = prod_yp > best_sum; 716 best_vector = select(best_vector, sum_yp, mask); 717 718 pm[partition].dir = best_vector; 719 } 720} 721 722/* See header for documentation. */ 723void compute_error_squared_rgba( 724 const partition_info& pi, 725 const image_block& blk, 726 const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS], 727 const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS], 728 float line_lengths[BLOCK_MAX_PARTITIONS], 729 float& uncor_error, 730 float& samec_error 731) { 732 unsigned int partition_count = pi.partition_count; 733 promise(partition_count > 0); 734 735 vfloatacc uncor_errorsumv = vfloatacc::zero(); 736 vfloatacc samec_errorsumv = vfloatacc::zero(); 737 738 for (unsigned int partition = 0; partition < partition_count; partition++) 739 { 740 const uint8_t *texel_indexes = pi.texels_of_partition[partition]; 741 742 processed_line4 l_uncor = uncor_plines[partition]; 743 processed_line4 l_samec = samec_plines[partition]; 744 745 unsigned int texel_count = pi.partition_texel_count[partition]; 746 promise(texel_count > 0); 747 748 // Vectorize some useful scalar inputs 749 vfloat l_uncor_bs0(l_uncor.bs.lane<0>()); 750 vfloat l_uncor_bs1(l_uncor.bs.lane<1>()); 751 vfloat l_uncor_bs2(l_uncor.bs.lane<2>()); 752 vfloat l_uncor_bs3(l_uncor.bs.lane<3>()); 753 754 vfloat l_uncor_amod0(l_uncor.amod.lane<0>()); 755 vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); 756 vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); 757 vfloat l_uncor_amod3(l_uncor.amod.lane<3>()); 758 759 vfloat l_samec_bs0(l_samec.bs.lane<0>()); 760 vfloat l_samec_bs1(l_samec.bs.lane<1>()); 761 vfloat l_samec_bs2(l_samec.bs.lane<2>()); 762 vfloat l_samec_bs3(l_samec.bs.lane<3>()); 763 764 assert(all(l_samec.amod == vfloat4(0.0f))); 765 766 vfloat uncor_loparamv(1e10f); 767 vfloat uncor_hiparamv(-1e10f); 768 769 vfloat ew_r(blk.channel_weight.lane<0>()); 770 vfloat ew_g(blk.channel_weight.lane<1>()); 771 vfloat ew_b(blk.channel_weight.lane<2>()); 772 vfloat ew_a(blk.channel_weight.lane<3>()); 773 774 // This implementation over-shoots, but this is safe as we initialize the texel_indexes 775 // array to extend the last value. This means min/max are not impacted, but we need to mask 776 // out the dummy values when we compute the line weighting. 777 vint lane_ids = vint::lane_id(); 778 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 779 { 780 vmask mask = lane_ids < vint(texel_count); 781 vint texel_idxs(texel_indexes + i); 782 783 vfloat data_r = gatherf(blk.data_r, texel_idxs); 784 vfloat data_g = gatherf(blk.data_g, texel_idxs); 785 vfloat data_b = gatherf(blk.data_b, texel_idxs); 786 vfloat data_a = gatherf(blk.data_a, texel_idxs); 787 788 vfloat uncor_param = (data_r * l_uncor_bs0) 789 + (data_g * l_uncor_bs1) 790 + (data_b * l_uncor_bs2) 791 + (data_a * l_uncor_bs3); 792 793 uncor_loparamv = min(uncor_param, uncor_loparamv); 794 uncor_hiparamv = max(uncor_param, uncor_hiparamv); 795 796 vfloat uncor_dist0 = (l_uncor_amod0 - data_r) 797 + (uncor_param * l_uncor_bs0); 798 vfloat uncor_dist1 = (l_uncor_amod1 - data_g) 799 + (uncor_param * l_uncor_bs1); 800 vfloat uncor_dist2 = (l_uncor_amod2 - data_b) 801 + (uncor_param * l_uncor_bs2); 802 vfloat uncor_dist3 = (l_uncor_amod3 - data_a) 803 + (uncor_param * l_uncor_bs3); 804 805 vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) 806 + (ew_g * uncor_dist1 * uncor_dist1) 807 + (ew_b * uncor_dist2 * uncor_dist2) 808 + (ew_a * uncor_dist3 * uncor_dist3); 809 810 haccumulate(uncor_errorsumv, uncor_err, mask); 811 812 // Process samechroma data 813 vfloat samec_param = (data_r * l_samec_bs0) 814 + (data_g * l_samec_bs1) 815 + (data_b * l_samec_bs2) 816 + (data_a * l_samec_bs3); 817 818 vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; 819 vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; 820 vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; 821 vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a; 822 823 vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) 824 + (ew_g * samec_dist1 * samec_dist1) 825 + (ew_b * samec_dist2 * samec_dist2) 826 + (ew_a * samec_dist3 * samec_dist3); 827 828 haccumulate(samec_errorsumv, samec_err, mask); 829 830 lane_ids += vint(ASTCENC_SIMD_WIDTH); 831 } 832 833 // Turn very small numbers and NaNs into a small number 834 float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv); 835 line_lengths[partition] = astc::max(uncor_linelen, 1e-7f); 836 } 837 838 uncor_error = hadd_s(uncor_errorsumv); 839 samec_error = hadd_s(samec_errorsumv); 840} 841 842/* See header for documentation. */ 843void compute_error_squared_rgb( 844 const partition_info& pi, 845 const image_block& blk, 846 partition_lines3 plines[BLOCK_MAX_PARTITIONS], 847 float& uncor_error, 848 float& samec_error 849) { 850 unsigned int partition_count = pi.partition_count; 851 promise(partition_count > 0); 852 853 vfloatacc uncor_errorsumv = vfloatacc::zero(); 854 vfloatacc samec_errorsumv = vfloatacc::zero(); 855 856 for (unsigned int partition = 0; partition < partition_count; partition++) 857 { 858 partition_lines3& pl = plines[partition]; 859 const uint8_t *texel_indexes = pi.texels_of_partition[partition]; 860 unsigned int texel_count = pi.partition_texel_count[partition]; 861 promise(texel_count > 0); 862 863 processed_line3 l_uncor = pl.uncor_pline; 864 processed_line3 l_samec = pl.samec_pline; 865 866 // Vectorize some useful scalar inputs 867 vfloat l_uncor_bs0(l_uncor.bs.lane<0>()); 868 vfloat l_uncor_bs1(l_uncor.bs.lane<1>()); 869 vfloat l_uncor_bs2(l_uncor.bs.lane<2>()); 870 871 vfloat l_uncor_amod0(l_uncor.amod.lane<0>()); 872 vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); 873 vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); 874 875 vfloat l_samec_bs0(l_samec.bs.lane<0>()); 876 vfloat l_samec_bs1(l_samec.bs.lane<1>()); 877 vfloat l_samec_bs2(l_samec.bs.lane<2>()); 878 879 assert(all(l_samec.amod == vfloat4(0.0f))); 880 881 vfloat uncor_loparamv(1e10f); 882 vfloat uncor_hiparamv(-1e10f); 883 884 vfloat ew_r(blk.channel_weight.lane<0>()); 885 vfloat ew_g(blk.channel_weight.lane<1>()); 886 vfloat ew_b(blk.channel_weight.lane<2>()); 887 888 // This implementation over-shoots, but this is safe as we initialize the weights array 889 // to extend the last value. This means min/max are not impacted, but we need to mask 890 // out the dummy values when we compute the line weighting. 891 vint lane_ids = vint::lane_id(); 892 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 893 { 894 vmask mask = lane_ids < vint(texel_count); 895 vint texel_idxs(texel_indexes + i); 896 897 vfloat data_r = gatherf(blk.data_r, texel_idxs); 898 vfloat data_g = gatherf(blk.data_g, texel_idxs); 899 vfloat data_b = gatherf(blk.data_b, texel_idxs); 900 901 vfloat uncor_param = (data_r * l_uncor_bs0) 902 + (data_g * l_uncor_bs1) 903 + (data_b * l_uncor_bs2); 904 905 uncor_loparamv = min(uncor_param, uncor_loparamv); 906 uncor_hiparamv = max(uncor_param, uncor_hiparamv); 907 908 vfloat uncor_dist0 = (l_uncor_amod0 - data_r) 909 + (uncor_param * l_uncor_bs0); 910 vfloat uncor_dist1 = (l_uncor_amod1 - data_g) 911 + (uncor_param * l_uncor_bs1); 912 vfloat uncor_dist2 = (l_uncor_amod2 - data_b) 913 + (uncor_param * l_uncor_bs2); 914 915 vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) 916 + (ew_g * uncor_dist1 * uncor_dist1) 917 + (ew_b * uncor_dist2 * uncor_dist2); 918 919 haccumulate(uncor_errorsumv, uncor_err, mask); 920 921 // Process samechroma data 922 vfloat samec_param = (data_r * l_samec_bs0) 923 + (data_g * l_samec_bs1) 924 + (data_b * l_samec_bs2); 925 926 vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; 927 vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; 928 vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; 929 930 vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) 931 + (ew_g * samec_dist1 * samec_dist1) 932 + (ew_b * samec_dist2 * samec_dist2); 933 934 haccumulate(samec_errorsumv, samec_err, mask); 935 936 lane_ids += vint(ASTCENC_SIMD_WIDTH); 937 } 938 939 // Turn very small numbers and NaNs into a small number 940 float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv); 941 pl.line_length = astc::max(uncor_linelen, 1e-7f); 942 } 943 944 uncor_error = hadd_s(uncor_errorsumv); 945 samec_error = hadd_s(samec_errorsumv); 946} 947 948#endif 949