1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0 2cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 3cc1dc7a3Sopenharmony_ci// Copyright 2011-2024 Arm Limited 4cc1dc7a3Sopenharmony_ci// 5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy 7cc1dc7a3Sopenharmony_ci// of the License at: 8cc1dc7a3Sopenharmony_ci// 9cc1dc7a3Sopenharmony_ci// http://www.apache.org/licenses/LICENSE-2.0 10cc1dc7a3Sopenharmony_ci// 11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software 12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations 15cc1dc7a3Sopenharmony_ci// under the License. 16cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 17cc1dc7a3Sopenharmony_ci 18cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY) 19cc1dc7a3Sopenharmony_ci 20cc1dc7a3Sopenharmony_ci/** 21cc1dc7a3Sopenharmony_ci * @brief Functions for computing color endpoints and texel weights. 22cc1dc7a3Sopenharmony_ci */ 23cc1dc7a3Sopenharmony_ci 24cc1dc7a3Sopenharmony_ci#include <cassert> 25cc1dc7a3Sopenharmony_ci 26cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h" 27cc1dc7a3Sopenharmony_ci#include "astcenc_vecmathlib.h" 28cc1dc7a3Sopenharmony_ci 29cc1dc7a3Sopenharmony_ci/** 30cc1dc7a3Sopenharmony_ci * @brief Compute the infilled weight for N texel indices in a decimated grid. 31cc1dc7a3Sopenharmony_ci * 32cc1dc7a3Sopenharmony_ci * @param di The weight grid decimation to use. 33cc1dc7a3Sopenharmony_ci * @param weights The decimated weight values to use. 34cc1dc7a3Sopenharmony_ci * @param index The first texel index to interpolate. 35cc1dc7a3Sopenharmony_ci * 36cc1dc7a3Sopenharmony_ci * @return The interpolated weight for the given set of SIMD_WIDTH texels. 37cc1dc7a3Sopenharmony_ci */ 38cc1dc7a3Sopenharmony_cistatic vfloat bilinear_infill_vla( 39cc1dc7a3Sopenharmony_ci const decimation_info& di, 40cc1dc7a3Sopenharmony_ci const float* weights, 41cc1dc7a3Sopenharmony_ci unsigned int index 42cc1dc7a3Sopenharmony_ci) { 43cc1dc7a3Sopenharmony_ci // Load the bilinear filter texel weight indexes in the decimated grid 44cc1dc7a3Sopenharmony_ci vint weight_idx0 = vint(di.texel_weights_tr[0] + index); 45cc1dc7a3Sopenharmony_ci vint weight_idx1 = vint(di.texel_weights_tr[1] + index); 46cc1dc7a3Sopenharmony_ci vint weight_idx2 = vint(di.texel_weights_tr[2] + index); 47cc1dc7a3Sopenharmony_ci vint weight_idx3 = vint(di.texel_weights_tr[3] + index); 48cc1dc7a3Sopenharmony_ci 49cc1dc7a3Sopenharmony_ci // Load the bilinear filter weights from the decimated grid 50cc1dc7a3Sopenharmony_ci vfloat weight_val0 = gatherf(weights, weight_idx0); 51cc1dc7a3Sopenharmony_ci vfloat weight_val1 = gatherf(weights, weight_idx1); 52cc1dc7a3Sopenharmony_ci vfloat weight_val2 = gatherf(weights, weight_idx2); 53cc1dc7a3Sopenharmony_ci vfloat weight_val3 = gatherf(weights, weight_idx3); 54cc1dc7a3Sopenharmony_ci 55cc1dc7a3Sopenharmony_ci // Load the weight contribution factors for each decimated weight 56cc1dc7a3Sopenharmony_ci vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); 57cc1dc7a3Sopenharmony_ci vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index); 58cc1dc7a3Sopenharmony_ci vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index); 59cc1dc7a3Sopenharmony_ci vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index); 60cc1dc7a3Sopenharmony_ci 61cc1dc7a3Sopenharmony_ci // Compute the bilinear interpolation to generate the per-texel weight 62cc1dc7a3Sopenharmony_ci return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) + 63cc1dc7a3Sopenharmony_ci (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3); 64cc1dc7a3Sopenharmony_ci} 65cc1dc7a3Sopenharmony_ci 66cc1dc7a3Sopenharmony_ci/** 67cc1dc7a3Sopenharmony_ci * @brief Compute the infilled weight for N texel indices in a decimated grid. 68cc1dc7a3Sopenharmony_ci * 69cc1dc7a3Sopenharmony_ci * This is specialized version which computes only two weights per texel for 70cc1dc7a3Sopenharmony_ci * encodings that are only decimated in a single axis. 71cc1dc7a3Sopenharmony_ci * 72cc1dc7a3Sopenharmony_ci * @param di The weight grid decimation to use. 73cc1dc7a3Sopenharmony_ci * @param weights The decimated weight values to use. 74cc1dc7a3Sopenharmony_ci * @param index The first texel index to interpolate. 75cc1dc7a3Sopenharmony_ci * 76cc1dc7a3Sopenharmony_ci * @return The interpolated weight for the given set of SIMD_WIDTH texels. 77cc1dc7a3Sopenharmony_ci */ 78cc1dc7a3Sopenharmony_cistatic vfloat bilinear_infill_vla_2( 79cc1dc7a3Sopenharmony_ci const decimation_info& di, 80cc1dc7a3Sopenharmony_ci const float* weights, 81cc1dc7a3Sopenharmony_ci unsigned int index 82cc1dc7a3Sopenharmony_ci) { 83cc1dc7a3Sopenharmony_ci // Load the bilinear filter texel weight indexes in the decimated grid 84cc1dc7a3Sopenharmony_ci vint weight_idx0 = vint(di.texel_weights_tr[0] + index); 85cc1dc7a3Sopenharmony_ci vint weight_idx1 = vint(di.texel_weights_tr[1] + index); 86cc1dc7a3Sopenharmony_ci 87cc1dc7a3Sopenharmony_ci // Load the bilinear filter weights from the decimated grid 88cc1dc7a3Sopenharmony_ci vfloat weight_val0 = gatherf(weights, weight_idx0); 89cc1dc7a3Sopenharmony_ci vfloat weight_val1 = gatherf(weights, weight_idx1); 90cc1dc7a3Sopenharmony_ci 91cc1dc7a3Sopenharmony_ci // Load the weight contribution factors for each decimated weight 92cc1dc7a3Sopenharmony_ci vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); 93cc1dc7a3Sopenharmony_ci vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index); 94cc1dc7a3Sopenharmony_ci 95cc1dc7a3Sopenharmony_ci // Compute the bilinear interpolation to generate the per-texel weight 96cc1dc7a3Sopenharmony_ci return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1); 97cc1dc7a3Sopenharmony_ci} 98cc1dc7a3Sopenharmony_ci 99cc1dc7a3Sopenharmony_ci/** 100cc1dc7a3Sopenharmony_ci * @brief Compute the ideal endpoints and weights for 1 color component. 101cc1dc7a3Sopenharmony_ci * 102cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 103cc1dc7a3Sopenharmony_ci * @param pi The partition info for the current trial. 104cc1dc7a3Sopenharmony_ci * @param[out] ei The computed ideal endpoints and weights. 105cc1dc7a3Sopenharmony_ci * @param component The color component to compute. 106cc1dc7a3Sopenharmony_ci */ 107cc1dc7a3Sopenharmony_cistatic void compute_ideal_colors_and_weights_1_comp( 108cc1dc7a3Sopenharmony_ci const image_block& blk, 109cc1dc7a3Sopenharmony_ci const partition_info& pi, 110cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei, 111cc1dc7a3Sopenharmony_ci unsigned int component 112cc1dc7a3Sopenharmony_ci) { 113cc1dc7a3Sopenharmony_ci unsigned int partition_count = pi.partition_count; 114cc1dc7a3Sopenharmony_ci ei.ep.partition_count = partition_count; 115cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 116cc1dc7a3Sopenharmony_ci 117cc1dc7a3Sopenharmony_ci unsigned int texel_count = blk.texel_count; 118cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 119cc1dc7a3Sopenharmony_ci 120cc1dc7a3Sopenharmony_ci float error_weight; 121cc1dc7a3Sopenharmony_ci const float* data_vr = nullptr; 122cc1dc7a3Sopenharmony_ci 123cc1dc7a3Sopenharmony_ci assert(component < BLOCK_MAX_COMPONENTS); 124cc1dc7a3Sopenharmony_ci switch (component) 125cc1dc7a3Sopenharmony_ci { 126cc1dc7a3Sopenharmony_ci case 0: 127cc1dc7a3Sopenharmony_ci error_weight = blk.channel_weight.lane<0>(); 128cc1dc7a3Sopenharmony_ci data_vr = blk.data_r; 129cc1dc7a3Sopenharmony_ci break; 130cc1dc7a3Sopenharmony_ci case 1: 131cc1dc7a3Sopenharmony_ci error_weight = blk.channel_weight.lane<1>(); 132cc1dc7a3Sopenharmony_ci data_vr = blk.data_g; 133cc1dc7a3Sopenharmony_ci break; 134cc1dc7a3Sopenharmony_ci case 2: 135cc1dc7a3Sopenharmony_ci error_weight = blk.channel_weight.lane<2>(); 136cc1dc7a3Sopenharmony_ci data_vr = blk.data_b; 137cc1dc7a3Sopenharmony_ci break; 138cc1dc7a3Sopenharmony_ci default: 139cc1dc7a3Sopenharmony_ci assert(component == 3); 140cc1dc7a3Sopenharmony_ci error_weight = blk.channel_weight.lane<3>(); 141cc1dc7a3Sopenharmony_ci data_vr = blk.data_a; 142cc1dc7a3Sopenharmony_ci break; 143cc1dc7a3Sopenharmony_ci } 144cc1dc7a3Sopenharmony_ci 145cc1dc7a3Sopenharmony_ci vmask4 sep_mask = vint4::lane_id() == vint4(component); 146cc1dc7a3Sopenharmony_ci bool is_constant_wes { true }; 147cc1dc7a3Sopenharmony_ci float partition0_len_sq { 0.0f }; 148cc1dc7a3Sopenharmony_ci 149cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 150cc1dc7a3Sopenharmony_ci { 151cc1dc7a3Sopenharmony_ci float lowvalue { 1e10f }; 152cc1dc7a3Sopenharmony_ci float highvalue { -1e10f }; 153cc1dc7a3Sopenharmony_ci 154cc1dc7a3Sopenharmony_ci unsigned int partition_texel_count = pi.partition_texel_count[i]; 155cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_texel_count; j++) 156cc1dc7a3Sopenharmony_ci { 157cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 158cc1dc7a3Sopenharmony_ci float value = data_vr[tix]; 159cc1dc7a3Sopenharmony_ci lowvalue = astc::min(value, lowvalue); 160cc1dc7a3Sopenharmony_ci highvalue = astc::max(value, highvalue); 161cc1dc7a3Sopenharmony_ci } 162cc1dc7a3Sopenharmony_ci 163cc1dc7a3Sopenharmony_ci if (highvalue <= lowvalue) 164cc1dc7a3Sopenharmony_ci { 165cc1dc7a3Sopenharmony_ci lowvalue = 0.0f; 166cc1dc7a3Sopenharmony_ci highvalue = 1e-7f; 167cc1dc7a3Sopenharmony_ci } 168cc1dc7a3Sopenharmony_ci 169cc1dc7a3Sopenharmony_ci float length = highvalue - lowvalue; 170cc1dc7a3Sopenharmony_ci float length_squared = length * length; 171cc1dc7a3Sopenharmony_ci float scale = 1.0f / length; 172cc1dc7a3Sopenharmony_ci 173cc1dc7a3Sopenharmony_ci if (i == 0) 174cc1dc7a3Sopenharmony_ci { 175cc1dc7a3Sopenharmony_ci partition0_len_sq = length_squared; 176cc1dc7a3Sopenharmony_ci } 177cc1dc7a3Sopenharmony_ci else 178cc1dc7a3Sopenharmony_ci { 179cc1dc7a3Sopenharmony_ci is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; 180cc1dc7a3Sopenharmony_ci } 181cc1dc7a3Sopenharmony_ci 182cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_texel_count; j++) 183cc1dc7a3Sopenharmony_ci { 184cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 185cc1dc7a3Sopenharmony_ci float value = (data_vr[tix] - lowvalue) * scale; 186cc1dc7a3Sopenharmony_ci value = astc::clamp1f(value); 187cc1dc7a3Sopenharmony_ci 188cc1dc7a3Sopenharmony_ci ei.weights[tix] = value; 189cc1dc7a3Sopenharmony_ci ei.weight_error_scale[tix] = length_squared * error_weight; 190cc1dc7a3Sopenharmony_ci assert(!astc::isnan(ei.weight_error_scale[tix])); 191cc1dc7a3Sopenharmony_ci } 192cc1dc7a3Sopenharmony_ci 193cc1dc7a3Sopenharmony_ci ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask); 194cc1dc7a3Sopenharmony_ci ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask); 195cc1dc7a3Sopenharmony_ci } 196cc1dc7a3Sopenharmony_ci 197cc1dc7a3Sopenharmony_ci // Zero initialize any SIMD over-fetch 198cc1dc7a3Sopenharmony_ci unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); 199cc1dc7a3Sopenharmony_ci for (unsigned int i = texel_count; i < texel_count_simd; i++) 200cc1dc7a3Sopenharmony_ci { 201cc1dc7a3Sopenharmony_ci ei.weights[i] = 0.0f; 202cc1dc7a3Sopenharmony_ci ei.weight_error_scale[i] = 0.0f; 203cc1dc7a3Sopenharmony_ci } 204cc1dc7a3Sopenharmony_ci 205cc1dc7a3Sopenharmony_ci ei.is_constant_weight_error_scale = is_constant_wes; 206cc1dc7a3Sopenharmony_ci} 207cc1dc7a3Sopenharmony_ci 208cc1dc7a3Sopenharmony_ci/** 209cc1dc7a3Sopenharmony_ci * @brief Compute the ideal endpoints and weights for 2 color components. 210cc1dc7a3Sopenharmony_ci * 211cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 212cc1dc7a3Sopenharmony_ci * @param pi The partition info for the current trial. 213cc1dc7a3Sopenharmony_ci * @param[out] ei The computed ideal endpoints and weights. 214cc1dc7a3Sopenharmony_ci * @param component1 The first color component to compute. 215cc1dc7a3Sopenharmony_ci * @param component2 The second color component to compute. 216cc1dc7a3Sopenharmony_ci */ 217cc1dc7a3Sopenharmony_cistatic void compute_ideal_colors_and_weights_2_comp( 218cc1dc7a3Sopenharmony_ci const image_block& blk, 219cc1dc7a3Sopenharmony_ci const partition_info& pi, 220cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei, 221cc1dc7a3Sopenharmony_ci int component1, 222cc1dc7a3Sopenharmony_ci int component2 223cc1dc7a3Sopenharmony_ci) { 224cc1dc7a3Sopenharmony_ci unsigned int partition_count = pi.partition_count; 225cc1dc7a3Sopenharmony_ci ei.ep.partition_count = partition_count; 226cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 227cc1dc7a3Sopenharmony_ci 228cc1dc7a3Sopenharmony_ci unsigned int texel_count = blk.texel_count; 229cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 230cc1dc7a3Sopenharmony_ci 231cc1dc7a3Sopenharmony_ci partition_metrics pms[BLOCK_MAX_PARTITIONS]; 232cc1dc7a3Sopenharmony_ci 233cc1dc7a3Sopenharmony_ci float error_weight; 234cc1dc7a3Sopenharmony_ci const float* data_vr = nullptr; 235cc1dc7a3Sopenharmony_ci const float* data_vg = nullptr; 236cc1dc7a3Sopenharmony_ci 237cc1dc7a3Sopenharmony_ci if (component1 == 0 && component2 == 1) 238cc1dc7a3Sopenharmony_ci { 239cc1dc7a3Sopenharmony_ci error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f; 240cc1dc7a3Sopenharmony_ci 241cc1dc7a3Sopenharmony_ci data_vr = blk.data_r; 242cc1dc7a3Sopenharmony_ci data_vg = blk.data_g; 243cc1dc7a3Sopenharmony_ci } 244cc1dc7a3Sopenharmony_ci else if (component1 == 0 && component2 == 2) 245cc1dc7a3Sopenharmony_ci { 246cc1dc7a3Sopenharmony_ci error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f; 247cc1dc7a3Sopenharmony_ci 248cc1dc7a3Sopenharmony_ci data_vr = blk.data_r; 249cc1dc7a3Sopenharmony_ci data_vg = blk.data_b; 250cc1dc7a3Sopenharmony_ci } 251cc1dc7a3Sopenharmony_ci else // (component1 == 1 && component2 == 2) 252cc1dc7a3Sopenharmony_ci { 253cc1dc7a3Sopenharmony_ci assert(component1 == 1 && component2 == 2); 254cc1dc7a3Sopenharmony_ci 255cc1dc7a3Sopenharmony_ci error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f; 256cc1dc7a3Sopenharmony_ci 257cc1dc7a3Sopenharmony_ci data_vr = blk.data_g; 258cc1dc7a3Sopenharmony_ci data_vg = blk.data_b; 259cc1dc7a3Sopenharmony_ci } 260cc1dc7a3Sopenharmony_ci 261cc1dc7a3Sopenharmony_ci compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms); 262cc1dc7a3Sopenharmony_ci 263cc1dc7a3Sopenharmony_ci bool is_constant_wes { true }; 264cc1dc7a3Sopenharmony_ci float partition0_len_sq { 0.0f }; 265cc1dc7a3Sopenharmony_ci 266cc1dc7a3Sopenharmony_ci vmask4 comp1_mask = vint4::lane_id() == vint4(component1); 267cc1dc7a3Sopenharmony_ci vmask4 comp2_mask = vint4::lane_id() == vint4(component2); 268cc1dc7a3Sopenharmony_ci 269cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 270cc1dc7a3Sopenharmony_ci { 271cc1dc7a3Sopenharmony_ci vfloat4 dir = pms[i].dir; 272cc1dc7a3Sopenharmony_ci if (hadd_s(dir) < 0.0f) 273cc1dc7a3Sopenharmony_ci { 274cc1dc7a3Sopenharmony_ci dir = vfloat4::zero() - dir; 275cc1dc7a3Sopenharmony_ci } 276cc1dc7a3Sopenharmony_ci 277cc1dc7a3Sopenharmony_ci line2 line { pms[i].avg, normalize_safe(dir, unit2()) }; 278cc1dc7a3Sopenharmony_ci float lowparam { 1e10f }; 279cc1dc7a3Sopenharmony_ci float highparam { -1e10f }; 280cc1dc7a3Sopenharmony_ci 281cc1dc7a3Sopenharmony_ci unsigned int partition_texel_count = pi.partition_texel_count[i]; 282cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_texel_count; j++) 283cc1dc7a3Sopenharmony_ci { 284cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 285cc1dc7a3Sopenharmony_ci vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]); 286cc1dc7a3Sopenharmony_ci float param = dot_s(point - line.a, line.b); 287cc1dc7a3Sopenharmony_ci ei.weights[tix] = param; 288cc1dc7a3Sopenharmony_ci 289cc1dc7a3Sopenharmony_ci lowparam = astc::min(param, lowparam); 290cc1dc7a3Sopenharmony_ci highparam = astc::max(param, highparam); 291cc1dc7a3Sopenharmony_ci } 292cc1dc7a3Sopenharmony_ci 293cc1dc7a3Sopenharmony_ci // It is possible for a uniform-color partition to produce length=0; 294cc1dc7a3Sopenharmony_ci // this causes NaN issues so set to small value to avoid this problem 295cc1dc7a3Sopenharmony_ci if (highparam <= lowparam) 296cc1dc7a3Sopenharmony_ci { 297cc1dc7a3Sopenharmony_ci lowparam = 0.0f; 298cc1dc7a3Sopenharmony_ci highparam = 1e-7f; 299cc1dc7a3Sopenharmony_ci } 300cc1dc7a3Sopenharmony_ci 301cc1dc7a3Sopenharmony_ci float length = highparam - lowparam; 302cc1dc7a3Sopenharmony_ci float length_squared = length * length; 303cc1dc7a3Sopenharmony_ci float scale = 1.0f / length; 304cc1dc7a3Sopenharmony_ci 305cc1dc7a3Sopenharmony_ci if (i == 0) 306cc1dc7a3Sopenharmony_ci { 307cc1dc7a3Sopenharmony_ci partition0_len_sq = length_squared; 308cc1dc7a3Sopenharmony_ci } 309cc1dc7a3Sopenharmony_ci else 310cc1dc7a3Sopenharmony_ci { 311cc1dc7a3Sopenharmony_ci is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; 312cc1dc7a3Sopenharmony_ci } 313cc1dc7a3Sopenharmony_ci 314cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_texel_count; j++) 315cc1dc7a3Sopenharmony_ci { 316cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 317cc1dc7a3Sopenharmony_ci float idx = (ei.weights[tix] - lowparam) * scale; 318cc1dc7a3Sopenharmony_ci idx = astc::clamp1f(idx); 319cc1dc7a3Sopenharmony_ci 320cc1dc7a3Sopenharmony_ci ei.weights[tix] = idx; 321cc1dc7a3Sopenharmony_ci ei.weight_error_scale[tix] = length_squared * error_weight; 322cc1dc7a3Sopenharmony_ci assert(!astc::isnan(ei.weight_error_scale[tix])); 323cc1dc7a3Sopenharmony_ci } 324cc1dc7a3Sopenharmony_ci 325cc1dc7a3Sopenharmony_ci vfloat4 lowvalue = line.a + line.b * lowparam; 326cc1dc7a3Sopenharmony_ci vfloat4 highvalue = line.a + line.b * highparam; 327cc1dc7a3Sopenharmony_ci 328cc1dc7a3Sopenharmony_ci vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask); 329cc1dc7a3Sopenharmony_ci vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask); 330cc1dc7a3Sopenharmony_ci 331cc1dc7a3Sopenharmony_ci ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask); 332cc1dc7a3Sopenharmony_ci ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask); 333cc1dc7a3Sopenharmony_ci } 334cc1dc7a3Sopenharmony_ci 335cc1dc7a3Sopenharmony_ci // Zero initialize any SIMD over-fetch 336cc1dc7a3Sopenharmony_ci unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); 337cc1dc7a3Sopenharmony_ci for (unsigned int i = texel_count; i < texel_count_simd; i++) 338cc1dc7a3Sopenharmony_ci { 339cc1dc7a3Sopenharmony_ci ei.weights[i] = 0.0f; 340cc1dc7a3Sopenharmony_ci ei.weight_error_scale[i] = 0.0f; 341cc1dc7a3Sopenharmony_ci } 342cc1dc7a3Sopenharmony_ci 343cc1dc7a3Sopenharmony_ci ei.is_constant_weight_error_scale = is_constant_wes; 344cc1dc7a3Sopenharmony_ci} 345cc1dc7a3Sopenharmony_ci 346cc1dc7a3Sopenharmony_ci/** 347cc1dc7a3Sopenharmony_ci * @brief Compute the ideal endpoints and weights for 3 color components. 348cc1dc7a3Sopenharmony_ci * 349cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 350cc1dc7a3Sopenharmony_ci * @param pi The partition info for the current trial. 351cc1dc7a3Sopenharmony_ci * @param[out] ei The computed ideal endpoints and weights. 352cc1dc7a3Sopenharmony_ci * @param omitted_component The color component excluded from the calculation. 353cc1dc7a3Sopenharmony_ci */ 354cc1dc7a3Sopenharmony_cistatic void compute_ideal_colors_and_weights_3_comp( 355cc1dc7a3Sopenharmony_ci const image_block& blk, 356cc1dc7a3Sopenharmony_ci const partition_info& pi, 357cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei, 358cc1dc7a3Sopenharmony_ci unsigned int omitted_component 359cc1dc7a3Sopenharmony_ci) { 360cc1dc7a3Sopenharmony_ci unsigned int partition_count = pi.partition_count; 361cc1dc7a3Sopenharmony_ci ei.ep.partition_count = partition_count; 362cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 363cc1dc7a3Sopenharmony_ci 364cc1dc7a3Sopenharmony_ci unsigned int texel_count = blk.texel_count; 365cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 366cc1dc7a3Sopenharmony_ci 367cc1dc7a3Sopenharmony_ci partition_metrics *pms = reinterpret_cast<partition_metrics *>(&blk.pms[0]); 368cc1dc7a3Sopenharmony_ci 369cc1dc7a3Sopenharmony_ci float error_weight; 370cc1dc7a3Sopenharmony_ci const float* data_vr = nullptr; 371cc1dc7a3Sopenharmony_ci const float* data_vg = nullptr; 372cc1dc7a3Sopenharmony_ci const float* data_vb = nullptr; 373cc1dc7a3Sopenharmony_ci if (omitted_component == 0) 374cc1dc7a3Sopenharmony_ci { 375cc1dc7a3Sopenharmony_ci error_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()); 376cc1dc7a3Sopenharmony_ci data_vr = blk.data_g; 377cc1dc7a3Sopenharmony_ci data_vg = blk.data_b; 378cc1dc7a3Sopenharmony_ci data_vb = blk.data_a; 379cc1dc7a3Sopenharmony_ci } 380cc1dc7a3Sopenharmony_ci else if (omitted_component == 1) 381cc1dc7a3Sopenharmony_ci { 382cc1dc7a3Sopenharmony_ci error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()); 383cc1dc7a3Sopenharmony_ci data_vr = blk.data_r; 384cc1dc7a3Sopenharmony_ci data_vg = blk.data_b; 385cc1dc7a3Sopenharmony_ci data_vb = blk.data_a; 386cc1dc7a3Sopenharmony_ci } 387cc1dc7a3Sopenharmony_ci else if (omitted_component == 2) 388cc1dc7a3Sopenharmony_ci { 389cc1dc7a3Sopenharmony_ci error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()); 390cc1dc7a3Sopenharmony_ci data_vr = blk.data_r; 391cc1dc7a3Sopenharmony_ci data_vg = blk.data_g; 392cc1dc7a3Sopenharmony_ci data_vb = blk.data_a; 393cc1dc7a3Sopenharmony_ci } 394cc1dc7a3Sopenharmony_ci else 395cc1dc7a3Sopenharmony_ci { 396cc1dc7a3Sopenharmony_ci assert(omitted_component == 3); 397cc1dc7a3Sopenharmony_ci 398cc1dc7a3Sopenharmony_ci error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); 399cc1dc7a3Sopenharmony_ci data_vr = blk.data_r; 400cc1dc7a3Sopenharmony_ci data_vg = blk.data_g; 401cc1dc7a3Sopenharmony_ci data_vb = blk.data_b; 402cc1dc7a3Sopenharmony_ci } 403cc1dc7a3Sopenharmony_ci 404cc1dc7a3Sopenharmony_ci error_weight = error_weight * (1.0f / 3.0f); 405cc1dc7a3Sopenharmony_ci 406cc1dc7a3Sopenharmony_ci if (omitted_component == 3) 407cc1dc7a3Sopenharmony_ci { 408cc1dc7a3Sopenharmony_ci compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); 409cc1dc7a3Sopenharmony_ci } 410cc1dc7a3Sopenharmony_ci else 411cc1dc7a3Sopenharmony_ci { 412cc1dc7a3Sopenharmony_ci compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms); 413cc1dc7a3Sopenharmony_ci } 414cc1dc7a3Sopenharmony_ci 415cc1dc7a3Sopenharmony_ci bool is_constant_wes { true }; 416cc1dc7a3Sopenharmony_ci float partition0_len_sq { 0.0f }; 417cc1dc7a3Sopenharmony_ci 418cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 419cc1dc7a3Sopenharmony_ci { 420cc1dc7a3Sopenharmony_ci vfloat4 dir = pms[i].dir; 421cc1dc7a3Sopenharmony_ci if (hadd_rgb_s(dir) < 0.0f) 422cc1dc7a3Sopenharmony_ci { 423cc1dc7a3Sopenharmony_ci dir = vfloat4::zero() - dir; 424cc1dc7a3Sopenharmony_ci } 425cc1dc7a3Sopenharmony_ci 426cc1dc7a3Sopenharmony_ci line3 line { pms[i].avg, normalize_safe(dir, unit3()) }; 427cc1dc7a3Sopenharmony_ci float lowparam { 1e10f }; 428cc1dc7a3Sopenharmony_ci float highparam { -1e10f }; 429cc1dc7a3Sopenharmony_ci 430cc1dc7a3Sopenharmony_ci unsigned int partition_texel_count = pi.partition_texel_count[i]; 431cc1dc7a3Sopenharmony_ci 432cc1dc7a3Sopenharmony_ci vfloat4 lowparam_vec = vfloat4(1e10f, 1e10f, 1e10f, 1e10f); 433cc1dc7a3Sopenharmony_ci vfloat4 highparam_vec = vfloat4(-1e10f, -1e10f, -1e10f, -1e10f); 434cc1dc7a3Sopenharmony_ci 435cc1dc7a3Sopenharmony_ci unsigned int j = 0; 436cc1dc7a3Sopenharmony_ci for (; j + ASTCENC_SIMD_WIDTH <= partition_texel_count; j += ASTCENC_SIMD_WIDTH) 437cc1dc7a3Sopenharmony_ci { 438cc1dc7a3Sopenharmony_ci unsigned int tix0 = pi.texels_of_partition[i][j]; 439cc1dc7a3Sopenharmony_ci unsigned int tix1 = pi.texels_of_partition[i][j + 1]; 440cc1dc7a3Sopenharmony_ci unsigned int tix2 = pi.texels_of_partition[i][j + 2]; 441cc1dc7a3Sopenharmony_ci unsigned int tix3 = pi.texels_of_partition[i][j + 3]; 442cc1dc7a3Sopenharmony_ci 443cc1dc7a3Sopenharmony_ci vfloat4 points0 = vfloat4(data_vr[tix0], data_vg[tix0], data_vb[tix0], 0.0f); 444cc1dc7a3Sopenharmony_ci vfloat4 points1 = vfloat4(data_vr[tix1], data_vg[tix1], data_vb[tix1], 0.0f); 445cc1dc7a3Sopenharmony_ci vfloat4 points2 = vfloat4(data_vr[tix2], data_vg[tix2], data_vb[tix2], 0.0f); 446cc1dc7a3Sopenharmony_ci vfloat4 points3 = vfloat4(data_vr[tix3], data_vg[tix3], data_vb[tix3], 0.0f); 447cc1dc7a3Sopenharmony_ci 448cc1dc7a3Sopenharmony_ci vfloat4 sub_v0 = points0 - line.a; 449cc1dc7a3Sopenharmony_ci vfloat4 sub_v1 = points1 - line.a; 450cc1dc7a3Sopenharmony_ci vfloat4 sub_v2 = points2 - line.a; 451cc1dc7a3Sopenharmony_ci vfloat4 sub_v3 = points3 - line.a; 452cc1dc7a3Sopenharmony_ci 453cc1dc7a3Sopenharmony_ci vfloat4 params0 = sub_v0 * line.b; 454cc1dc7a3Sopenharmony_ci vfloat4 params1 = sub_v1 * line.b; 455cc1dc7a3Sopenharmony_ci vfloat4 params2 = sub_v2 * line.b; 456cc1dc7a3Sopenharmony_ci vfloat4 params3 = sub_v3 * line.b; 457cc1dc7a3Sopenharmony_ci 458cc1dc7a3Sopenharmony_ci float param0 = hadd_rgba_s(params0); 459cc1dc7a3Sopenharmony_ci float param1 = hadd_rgba_s(params1); 460cc1dc7a3Sopenharmony_ci float param2 = hadd_rgba_s(params2); 461cc1dc7a3Sopenharmony_ci float param3 = hadd_rgba_s(params3); 462cc1dc7a3Sopenharmony_ci 463cc1dc7a3Sopenharmony_ci ei.weights[tix0] = param0; 464cc1dc7a3Sopenharmony_ci ei.weights[tix1] = param1; 465cc1dc7a3Sopenharmony_ci ei.weights[tix2] = param2; 466cc1dc7a3Sopenharmony_ci ei.weights[tix3] = param3; 467cc1dc7a3Sopenharmony_ci 468cc1dc7a3Sopenharmony_ci vfloat4 params_vec = vfloat4(param0, param1, param2, param3); 469cc1dc7a3Sopenharmony_ci lowparam_vec = min(params_vec, lowparam_vec); 470cc1dc7a3Sopenharmony_ci highparam_vec = max(params_vec, highparam_vec); 471cc1dc7a3Sopenharmony_ci } 472cc1dc7a3Sopenharmony_ci 473cc1dc7a3Sopenharmony_ci lowparam = hmin_s(vfloat4(lowparam_vec)); 474cc1dc7a3Sopenharmony_ci highparam = hmax_s(vfloat4(highparam_vec)); 475cc1dc7a3Sopenharmony_ci 476cc1dc7a3Sopenharmony_ci for (; j < partition_texel_count; j++) 477cc1dc7a3Sopenharmony_ci { 478cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 479cc1dc7a3Sopenharmony_ci vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]); 480cc1dc7a3Sopenharmony_ci float param = dot3_s(point - line.a, line.b); 481cc1dc7a3Sopenharmony_ci ei.weights[tix] = param; 482cc1dc7a3Sopenharmony_ci 483cc1dc7a3Sopenharmony_ci lowparam = astc::min(param, lowparam); 484cc1dc7a3Sopenharmony_ci highparam = astc::max(param, highparam); 485cc1dc7a3Sopenharmony_ci } 486cc1dc7a3Sopenharmony_ci 487cc1dc7a3Sopenharmony_ci // It is possible for a uniform-color partition to produce length=0; 488cc1dc7a3Sopenharmony_ci // this causes NaN issues so set to small value to avoid this problem 489cc1dc7a3Sopenharmony_ci if (highparam <= lowparam) 490cc1dc7a3Sopenharmony_ci { 491cc1dc7a3Sopenharmony_ci lowparam = 0.0f; 492cc1dc7a3Sopenharmony_ci highparam = 1e-7f; 493cc1dc7a3Sopenharmony_ci } 494cc1dc7a3Sopenharmony_ci 495cc1dc7a3Sopenharmony_ci float length = highparam - lowparam; 496cc1dc7a3Sopenharmony_ci float length_squared = length * length; 497cc1dc7a3Sopenharmony_ci float scale = 1.0f / length; 498cc1dc7a3Sopenharmony_ci 499cc1dc7a3Sopenharmony_ci if (i == 0) 500cc1dc7a3Sopenharmony_ci { 501cc1dc7a3Sopenharmony_ci partition0_len_sq = length_squared; 502cc1dc7a3Sopenharmony_ci } 503cc1dc7a3Sopenharmony_ci else 504cc1dc7a3Sopenharmony_ci { 505cc1dc7a3Sopenharmony_ci is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; 506cc1dc7a3Sopenharmony_ci } 507cc1dc7a3Sopenharmony_ci 508cc1dc7a3Sopenharmony_ci for (j = 0; j < partition_texel_count; j++) 509cc1dc7a3Sopenharmony_ci { 510cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 511cc1dc7a3Sopenharmony_ci float idx = (ei.weights[tix] - lowparam) * scale; 512cc1dc7a3Sopenharmony_ci idx = astc::clamp1f(idx); 513cc1dc7a3Sopenharmony_ci 514cc1dc7a3Sopenharmony_ci ei.weights[tix] = idx; 515cc1dc7a3Sopenharmony_ci ei.weight_error_scale[tix] = length_squared * error_weight; 516cc1dc7a3Sopenharmony_ci assert(!astc::isnan(ei.weight_error_scale[tix])); 517cc1dc7a3Sopenharmony_ci } 518cc1dc7a3Sopenharmony_ci 519cc1dc7a3Sopenharmony_ci vfloat4 ep0 = line.a + line.b * lowparam; 520cc1dc7a3Sopenharmony_ci vfloat4 ep1 = line.a + line.b * highparam; 521cc1dc7a3Sopenharmony_ci 522cc1dc7a3Sopenharmony_ci vfloat4 bmin = blk.data_min; 523cc1dc7a3Sopenharmony_ci vfloat4 bmax = blk.data_max; 524cc1dc7a3Sopenharmony_ci 525cc1dc7a3Sopenharmony_ci assert(omitted_component < BLOCK_MAX_COMPONENTS); 526cc1dc7a3Sopenharmony_ci switch (omitted_component) 527cc1dc7a3Sopenharmony_ci { 528cc1dc7a3Sopenharmony_ci case 0: 529cc1dc7a3Sopenharmony_ci ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>()); 530cc1dc7a3Sopenharmony_ci ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>()); 531cc1dc7a3Sopenharmony_ci break; 532cc1dc7a3Sopenharmony_ci case 1: 533cc1dc7a3Sopenharmony_ci ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>()); 534cc1dc7a3Sopenharmony_ci ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>()); 535cc1dc7a3Sopenharmony_ci break; 536cc1dc7a3Sopenharmony_ci case 2: 537cc1dc7a3Sopenharmony_ci ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>()); 538cc1dc7a3Sopenharmony_ci ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>()); 539cc1dc7a3Sopenharmony_ci break; 540cc1dc7a3Sopenharmony_ci default: 541cc1dc7a3Sopenharmony_ci ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>()); 542cc1dc7a3Sopenharmony_ci ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>()); 543cc1dc7a3Sopenharmony_ci break; 544cc1dc7a3Sopenharmony_ci } 545cc1dc7a3Sopenharmony_ci } 546cc1dc7a3Sopenharmony_ci 547cc1dc7a3Sopenharmony_ci // Zero initialize any SIMD over-fetch 548cc1dc7a3Sopenharmony_ci unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); 549cc1dc7a3Sopenharmony_ci for (unsigned int i = texel_count; i < texel_count_simd; i++) 550cc1dc7a3Sopenharmony_ci { 551cc1dc7a3Sopenharmony_ci ei.weights[i] = 0.0f; 552cc1dc7a3Sopenharmony_ci ei.weight_error_scale[i] = 0.0f; 553cc1dc7a3Sopenharmony_ci } 554cc1dc7a3Sopenharmony_ci 555cc1dc7a3Sopenharmony_ci ei.is_constant_weight_error_scale = is_constant_wes; 556cc1dc7a3Sopenharmony_ci} 557cc1dc7a3Sopenharmony_ci 558cc1dc7a3Sopenharmony_ci/** 559cc1dc7a3Sopenharmony_ci * @brief Compute the ideal endpoints and weights for 4 color components. 560cc1dc7a3Sopenharmony_ci * 561cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 562cc1dc7a3Sopenharmony_ci * @param pi The partition info for the current trial. 563cc1dc7a3Sopenharmony_ci * @param[out] ei The computed ideal endpoints and weights. 564cc1dc7a3Sopenharmony_ci */ 565cc1dc7a3Sopenharmony_cistatic void compute_ideal_colors_and_weights_4_comp( 566cc1dc7a3Sopenharmony_ci const image_block& blk, 567cc1dc7a3Sopenharmony_ci const partition_info& pi, 568cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei 569cc1dc7a3Sopenharmony_ci) { 570cc1dc7a3Sopenharmony_ci const float error_weight = hadd_s(blk.channel_weight) / 4.0f; 571cc1dc7a3Sopenharmony_ci 572cc1dc7a3Sopenharmony_ci unsigned int partition_count = pi.partition_count; 573cc1dc7a3Sopenharmony_ci 574cc1dc7a3Sopenharmony_ci unsigned int texel_count = blk.texel_count; 575cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 576cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 577cc1dc7a3Sopenharmony_ci 578cc1dc7a3Sopenharmony_ci partition_metrics pms[BLOCK_MAX_PARTITIONS]; 579cc1dc7a3Sopenharmony_ci 580cc1dc7a3Sopenharmony_ci compute_avgs_and_dirs_4_comp(pi, blk, pms); 581cc1dc7a3Sopenharmony_ci 582cc1dc7a3Sopenharmony_ci bool is_constant_wes { true }; 583cc1dc7a3Sopenharmony_ci float partition0_len_sq { 0.0f }; 584cc1dc7a3Sopenharmony_ci 585cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 586cc1dc7a3Sopenharmony_ci { 587cc1dc7a3Sopenharmony_ci vfloat4 dir = pms[i].dir; 588cc1dc7a3Sopenharmony_ci if (hadd_rgb_s(dir) < 0.0f) 589cc1dc7a3Sopenharmony_ci { 590cc1dc7a3Sopenharmony_ci dir = vfloat4::zero() - dir; 591cc1dc7a3Sopenharmony_ci } 592cc1dc7a3Sopenharmony_ci 593cc1dc7a3Sopenharmony_ci line4 line { pms[i].avg, normalize_safe(dir, unit4()) }; 594cc1dc7a3Sopenharmony_ci float lowparam { 1e10f }; 595cc1dc7a3Sopenharmony_ci float highparam { -1e10f }; 596cc1dc7a3Sopenharmony_ci 597cc1dc7a3Sopenharmony_ci unsigned int partition_texel_count = pi.partition_texel_count[i]; 598cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_texel_count; j++) 599cc1dc7a3Sopenharmony_ci { 600cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 601cc1dc7a3Sopenharmony_ci vfloat4 point = blk.texel(tix); 602cc1dc7a3Sopenharmony_ci float param = dot_s(point - line.a, line.b); 603cc1dc7a3Sopenharmony_ci ei.weights[tix] = param; 604cc1dc7a3Sopenharmony_ci 605cc1dc7a3Sopenharmony_ci lowparam = astc::min(param, lowparam); 606cc1dc7a3Sopenharmony_ci highparam = astc::max(param, highparam); 607cc1dc7a3Sopenharmony_ci } 608cc1dc7a3Sopenharmony_ci 609cc1dc7a3Sopenharmony_ci // It is possible for a uniform-color partition to produce length=0; 610cc1dc7a3Sopenharmony_ci // this causes NaN issues so set to small value to avoid this problem 611cc1dc7a3Sopenharmony_ci if (highparam <= lowparam) 612cc1dc7a3Sopenharmony_ci { 613cc1dc7a3Sopenharmony_ci lowparam = 0.0f; 614cc1dc7a3Sopenharmony_ci highparam = 1e-7f; 615cc1dc7a3Sopenharmony_ci } 616cc1dc7a3Sopenharmony_ci 617cc1dc7a3Sopenharmony_ci float length = highparam - lowparam; 618cc1dc7a3Sopenharmony_ci float length_squared = length * length; 619cc1dc7a3Sopenharmony_ci float scale = 1.0f / length; 620cc1dc7a3Sopenharmony_ci 621cc1dc7a3Sopenharmony_ci if (i == 0) 622cc1dc7a3Sopenharmony_ci { 623cc1dc7a3Sopenharmony_ci partition0_len_sq = length_squared; 624cc1dc7a3Sopenharmony_ci } 625cc1dc7a3Sopenharmony_ci else 626cc1dc7a3Sopenharmony_ci { 627cc1dc7a3Sopenharmony_ci is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; 628cc1dc7a3Sopenharmony_ci } 629cc1dc7a3Sopenharmony_ci 630cc1dc7a3Sopenharmony_ci ei.ep.endpt0[i] = line.a + line.b * lowparam; 631cc1dc7a3Sopenharmony_ci ei.ep.endpt1[i] = line.a + line.b * highparam; 632cc1dc7a3Sopenharmony_ci 633cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_texel_count; j++) 634cc1dc7a3Sopenharmony_ci { 635cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 636cc1dc7a3Sopenharmony_ci float idx = (ei.weights[tix] - lowparam) * scale; 637cc1dc7a3Sopenharmony_ci idx = astc::clamp1f(idx); 638cc1dc7a3Sopenharmony_ci 639cc1dc7a3Sopenharmony_ci ei.weights[tix] = idx; 640cc1dc7a3Sopenharmony_ci ei.weight_error_scale[tix] = length_squared * error_weight; 641cc1dc7a3Sopenharmony_ci assert(!astc::isnan(ei.weight_error_scale[tix])); 642cc1dc7a3Sopenharmony_ci } 643cc1dc7a3Sopenharmony_ci } 644cc1dc7a3Sopenharmony_ci 645cc1dc7a3Sopenharmony_ci // Zero initialize any SIMD over-fetch 646cc1dc7a3Sopenharmony_ci unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); 647cc1dc7a3Sopenharmony_ci for (unsigned int i = texel_count; i < texel_count_simd; i++) 648cc1dc7a3Sopenharmony_ci { 649cc1dc7a3Sopenharmony_ci ei.weights[i] = 0.0f; 650cc1dc7a3Sopenharmony_ci ei.weight_error_scale[i] = 0.0f; 651cc1dc7a3Sopenharmony_ci } 652cc1dc7a3Sopenharmony_ci 653cc1dc7a3Sopenharmony_ci ei.is_constant_weight_error_scale = is_constant_wes; 654cc1dc7a3Sopenharmony_ci} 655cc1dc7a3Sopenharmony_ci 656cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 657cc1dc7a3Sopenharmony_civoid compute_ideal_colors_and_weights_1plane( 658cc1dc7a3Sopenharmony_ci const image_block& blk, 659cc1dc7a3Sopenharmony_ci const partition_info& pi, 660cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei 661cc1dc7a3Sopenharmony_ci) { 662cc1dc7a3Sopenharmony_ci bool uses_alpha = !blk.is_constant_channel(3); 663cc1dc7a3Sopenharmony_ci 664cc1dc7a3Sopenharmony_ci if (uses_alpha) 665cc1dc7a3Sopenharmony_ci { 666cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_4_comp(blk, pi, ei); 667cc1dc7a3Sopenharmony_ci } 668cc1dc7a3Sopenharmony_ci else 669cc1dc7a3Sopenharmony_ci { 670cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3); 671cc1dc7a3Sopenharmony_ci } 672cc1dc7a3Sopenharmony_ci} 673cc1dc7a3Sopenharmony_ci 674cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 675cc1dc7a3Sopenharmony_civoid compute_ideal_colors_and_weights_2planes( 676cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 677cc1dc7a3Sopenharmony_ci const image_block& blk, 678cc1dc7a3Sopenharmony_ci unsigned int plane2_component, 679cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei1, 680cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei2 681cc1dc7a3Sopenharmony_ci) { 682cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_partition_info(1, 0); 683cc1dc7a3Sopenharmony_ci bool uses_alpha = !blk.is_constant_channel(3); 684cc1dc7a3Sopenharmony_ci 685cc1dc7a3Sopenharmony_ci assert(plane2_component < BLOCK_MAX_COMPONENTS); 686cc1dc7a3Sopenharmony_ci switch (plane2_component) 687cc1dc7a3Sopenharmony_ci { 688cc1dc7a3Sopenharmony_ci case 0: // Separate weights for red 689cc1dc7a3Sopenharmony_ci if (uses_alpha) 690cc1dc7a3Sopenharmony_ci { 691cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0); 692cc1dc7a3Sopenharmony_ci } 693cc1dc7a3Sopenharmony_ci else 694cc1dc7a3Sopenharmony_ci { 695cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2); 696cc1dc7a3Sopenharmony_ci } 697cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0); 698cc1dc7a3Sopenharmony_ci break; 699cc1dc7a3Sopenharmony_ci 700cc1dc7a3Sopenharmony_ci case 1: // Separate weights for green 701cc1dc7a3Sopenharmony_ci if (uses_alpha) 702cc1dc7a3Sopenharmony_ci { 703cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1); 704cc1dc7a3Sopenharmony_ci } 705cc1dc7a3Sopenharmony_ci else 706cc1dc7a3Sopenharmony_ci { 707cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2); 708cc1dc7a3Sopenharmony_ci } 709cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1); 710cc1dc7a3Sopenharmony_ci break; 711cc1dc7a3Sopenharmony_ci 712cc1dc7a3Sopenharmony_ci case 2: // Separate weights for blue 713cc1dc7a3Sopenharmony_ci if (uses_alpha) 714cc1dc7a3Sopenharmony_ci { 715cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2); 716cc1dc7a3Sopenharmony_ci } 717cc1dc7a3Sopenharmony_ci else 718cc1dc7a3Sopenharmony_ci { 719cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1); 720cc1dc7a3Sopenharmony_ci } 721cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2); 722cc1dc7a3Sopenharmony_ci break; 723cc1dc7a3Sopenharmony_ci 724cc1dc7a3Sopenharmony_ci default: // Separate weights for alpha 725cc1dc7a3Sopenharmony_ci assert(uses_alpha); 726cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3); 727cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3); 728cc1dc7a3Sopenharmony_ci break; 729cc1dc7a3Sopenharmony_ci } 730cc1dc7a3Sopenharmony_ci} 731cc1dc7a3Sopenharmony_ci 732cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 733cc1dc7a3Sopenharmony_cifloat compute_error_of_weight_set_1plane( 734cc1dc7a3Sopenharmony_ci const endpoints_and_weights& eai, 735cc1dc7a3Sopenharmony_ci const decimation_info& di, 736cc1dc7a3Sopenharmony_ci const float* dec_weight_quant_uvalue 737cc1dc7a3Sopenharmony_ci) { 738cc1dc7a3Sopenharmony_ci vfloatacc error_summav = vfloatacc::zero(); 739cc1dc7a3Sopenharmony_ci unsigned int texel_count = di.texel_count; 740cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 741cc1dc7a3Sopenharmony_ci 742cc1dc7a3Sopenharmony_ci // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized 743cc1dc7a3Sopenharmony_ci if (di.max_texel_weight_count > 2) 744cc1dc7a3Sopenharmony_ci { 745cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 746cc1dc7a3Sopenharmony_ci { 747cc1dc7a3Sopenharmony_ci // Compute the bilinear interpolation of the decimated weight grid 748cc1dc7a3Sopenharmony_ci vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i); 749cc1dc7a3Sopenharmony_ci 750cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 751cc1dc7a3Sopenharmony_ci vfloat actual_values = loada(eai.weights + i); 752cc1dc7a3Sopenharmony_ci vfloat diff = current_values - actual_values; 753cc1dc7a3Sopenharmony_ci vfloat significance = loada(eai.weight_error_scale + i); 754cc1dc7a3Sopenharmony_ci vfloat error = diff * diff * significance; 755cc1dc7a3Sopenharmony_ci 756cc1dc7a3Sopenharmony_ci haccumulate(error_summav, error); 757cc1dc7a3Sopenharmony_ci } 758cc1dc7a3Sopenharmony_ci } 759cc1dc7a3Sopenharmony_ci else if (di.max_texel_weight_count > 1) 760cc1dc7a3Sopenharmony_ci { 761cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 762cc1dc7a3Sopenharmony_ci { 763cc1dc7a3Sopenharmony_ci // Compute the bilinear interpolation of the decimated weight grid 764cc1dc7a3Sopenharmony_ci vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i); 765cc1dc7a3Sopenharmony_ci 766cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 767cc1dc7a3Sopenharmony_ci vfloat actual_values = loada(eai.weights + i); 768cc1dc7a3Sopenharmony_ci vfloat diff = current_values - actual_values; 769cc1dc7a3Sopenharmony_ci vfloat significance = loada(eai.weight_error_scale + i); 770cc1dc7a3Sopenharmony_ci vfloat error = diff * diff * significance; 771cc1dc7a3Sopenharmony_ci 772cc1dc7a3Sopenharmony_ci haccumulate(error_summav, error); 773cc1dc7a3Sopenharmony_ci } 774cc1dc7a3Sopenharmony_ci } 775cc1dc7a3Sopenharmony_ci else 776cc1dc7a3Sopenharmony_ci { 777cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 778cc1dc7a3Sopenharmony_ci { 779cc1dc7a3Sopenharmony_ci // Load the weight set directly, without interpolation 780cc1dc7a3Sopenharmony_ci vfloat current_values = loada(dec_weight_quant_uvalue + i); 781cc1dc7a3Sopenharmony_ci 782cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 783cc1dc7a3Sopenharmony_ci vfloat actual_values = loada(eai.weights + i); 784cc1dc7a3Sopenharmony_ci vfloat diff = current_values - actual_values; 785cc1dc7a3Sopenharmony_ci vfloat significance = loada(eai.weight_error_scale + i); 786cc1dc7a3Sopenharmony_ci vfloat error = diff * diff * significance; 787cc1dc7a3Sopenharmony_ci 788cc1dc7a3Sopenharmony_ci haccumulate(error_summav, error); 789cc1dc7a3Sopenharmony_ci } 790cc1dc7a3Sopenharmony_ci } 791cc1dc7a3Sopenharmony_ci 792cc1dc7a3Sopenharmony_ci // Resolve the final scalar accumulator sum 793cc1dc7a3Sopenharmony_ci return hadd_s(error_summav); 794cc1dc7a3Sopenharmony_ci} 795cc1dc7a3Sopenharmony_ci 796cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 797cc1dc7a3Sopenharmony_cifloat compute_error_of_weight_set_2planes( 798cc1dc7a3Sopenharmony_ci const endpoints_and_weights& eai1, 799cc1dc7a3Sopenharmony_ci const endpoints_and_weights& eai2, 800cc1dc7a3Sopenharmony_ci const decimation_info& di, 801cc1dc7a3Sopenharmony_ci const float* dec_weight_quant_uvalue_plane1, 802cc1dc7a3Sopenharmony_ci const float* dec_weight_quant_uvalue_plane2 803cc1dc7a3Sopenharmony_ci) { 804cc1dc7a3Sopenharmony_ci vfloatacc error_summav = vfloatacc::zero(); 805cc1dc7a3Sopenharmony_ci unsigned int texel_count = di.texel_count; 806cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 807cc1dc7a3Sopenharmony_ci 808cc1dc7a3Sopenharmony_ci // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized 809cc1dc7a3Sopenharmony_ci if (di.max_texel_weight_count > 2) 810cc1dc7a3Sopenharmony_ci { 811cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 812cc1dc7a3Sopenharmony_ci { 813cc1dc7a3Sopenharmony_ci // Plane 1 814cc1dc7a3Sopenharmony_ci // Compute the bilinear interpolation of the decimated weight grid 815cc1dc7a3Sopenharmony_ci vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i); 816cc1dc7a3Sopenharmony_ci 817cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 818cc1dc7a3Sopenharmony_ci vfloat actual_values1 = loada(eai1.weights + i); 819cc1dc7a3Sopenharmony_ci vfloat diff = current_values1 - actual_values1; 820cc1dc7a3Sopenharmony_ci vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); 821cc1dc7a3Sopenharmony_ci 822cc1dc7a3Sopenharmony_ci // Plane 2 823cc1dc7a3Sopenharmony_ci // Compute the bilinear interpolation of the decimated weight grid 824cc1dc7a3Sopenharmony_ci vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i); 825cc1dc7a3Sopenharmony_ci 826cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 827cc1dc7a3Sopenharmony_ci vfloat actual_values2 = loada(eai2.weights + i); 828cc1dc7a3Sopenharmony_ci diff = current_values2 - actual_values2; 829cc1dc7a3Sopenharmony_ci vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); 830cc1dc7a3Sopenharmony_ci 831cc1dc7a3Sopenharmony_ci haccumulate(error_summav, error1 + error2); 832cc1dc7a3Sopenharmony_ci } 833cc1dc7a3Sopenharmony_ci } 834cc1dc7a3Sopenharmony_ci else if (di.max_texel_weight_count > 1) 835cc1dc7a3Sopenharmony_ci { 836cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 837cc1dc7a3Sopenharmony_ci { 838cc1dc7a3Sopenharmony_ci // Plane 1 839cc1dc7a3Sopenharmony_ci // Compute the bilinear interpolation of the decimated weight grid 840cc1dc7a3Sopenharmony_ci vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i); 841cc1dc7a3Sopenharmony_ci 842cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 843cc1dc7a3Sopenharmony_ci vfloat actual_values1 = loada(eai1.weights + i); 844cc1dc7a3Sopenharmony_ci vfloat diff = current_values1 - actual_values1; 845cc1dc7a3Sopenharmony_ci vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); 846cc1dc7a3Sopenharmony_ci 847cc1dc7a3Sopenharmony_ci // Plane 2 848cc1dc7a3Sopenharmony_ci // Compute the bilinear interpolation of the decimated weight grid 849cc1dc7a3Sopenharmony_ci vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i); 850cc1dc7a3Sopenharmony_ci 851cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 852cc1dc7a3Sopenharmony_ci vfloat actual_values2 = loada(eai2.weights + i); 853cc1dc7a3Sopenharmony_ci diff = current_values2 - actual_values2; 854cc1dc7a3Sopenharmony_ci vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); 855cc1dc7a3Sopenharmony_ci 856cc1dc7a3Sopenharmony_ci haccumulate(error_summav, error1 + error2); 857cc1dc7a3Sopenharmony_ci } 858cc1dc7a3Sopenharmony_ci } 859cc1dc7a3Sopenharmony_ci else 860cc1dc7a3Sopenharmony_ci { 861cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 862cc1dc7a3Sopenharmony_ci { 863cc1dc7a3Sopenharmony_ci // Plane 1 864cc1dc7a3Sopenharmony_ci // Load the weight set directly, without interpolation 865cc1dc7a3Sopenharmony_ci vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i); 866cc1dc7a3Sopenharmony_ci 867cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 868cc1dc7a3Sopenharmony_ci vfloat actual_values1 = loada(eai1.weights + i); 869cc1dc7a3Sopenharmony_ci vfloat diff = current_values1 - actual_values1; 870cc1dc7a3Sopenharmony_ci vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); 871cc1dc7a3Sopenharmony_ci 872cc1dc7a3Sopenharmony_ci // Plane 2 873cc1dc7a3Sopenharmony_ci // Load the weight set directly, without interpolation 874cc1dc7a3Sopenharmony_ci vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i); 875cc1dc7a3Sopenharmony_ci 876cc1dc7a3Sopenharmony_ci // Compute the error between the computed value and the ideal weight 877cc1dc7a3Sopenharmony_ci vfloat actual_values2 = loada(eai2.weights + i); 878cc1dc7a3Sopenharmony_ci diff = current_values2 - actual_values2; 879cc1dc7a3Sopenharmony_ci vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); 880cc1dc7a3Sopenharmony_ci 881cc1dc7a3Sopenharmony_ci haccumulate(error_summav, error1 + error2); 882cc1dc7a3Sopenharmony_ci } 883cc1dc7a3Sopenharmony_ci } 884cc1dc7a3Sopenharmony_ci 885cc1dc7a3Sopenharmony_ci // Resolve the final scalar accumulator sum 886cc1dc7a3Sopenharmony_ci return hadd_s(error_summav); 887cc1dc7a3Sopenharmony_ci} 888cc1dc7a3Sopenharmony_ci 889cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 890cc1dc7a3Sopenharmony_civoid compute_ideal_weights_for_decimation( 891cc1dc7a3Sopenharmony_ci const endpoints_and_weights& ei, 892cc1dc7a3Sopenharmony_ci const decimation_info& di, 893cc1dc7a3Sopenharmony_ci float* dec_weight_ideal_value 894cc1dc7a3Sopenharmony_ci) { 895cc1dc7a3Sopenharmony_ci unsigned int texel_count = di.texel_count; 896cc1dc7a3Sopenharmony_ci unsigned int weight_count = di.weight_count; 897cc1dc7a3Sopenharmony_ci bool is_direct = texel_count == weight_count; 898cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 899cc1dc7a3Sopenharmony_ci promise(weight_count > 0); 900cc1dc7a3Sopenharmony_ci 901cc1dc7a3Sopenharmony_ci // Ensure that the end of the output arrays that are used for SIMD paths later are filled so we 902cc1dc7a3Sopenharmony_ci // can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight 903cc1dc7a3Sopenharmony_ci // arrays always contain space for 64 elements 904cc1dc7a3Sopenharmony_ci unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1); 905cc1dc7a3Sopenharmony_ci storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd); 906cc1dc7a3Sopenharmony_ci 907cc1dc7a3Sopenharmony_ci // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the 908cc1dc7a3Sopenharmony_ci // zero-initialized SIMD over-fetch region 909cc1dc7a3Sopenharmony_ci if (is_direct) 910cc1dc7a3Sopenharmony_ci { 911cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 912cc1dc7a3Sopenharmony_ci { 913cc1dc7a3Sopenharmony_ci vfloat weight(ei.weights + i); 914cc1dc7a3Sopenharmony_ci storea(weight, dec_weight_ideal_value + i); 915cc1dc7a3Sopenharmony_ci } 916cc1dc7a3Sopenharmony_ci 917cc1dc7a3Sopenharmony_ci return; 918cc1dc7a3Sopenharmony_ci } 919cc1dc7a3Sopenharmony_ci 920cc1dc7a3Sopenharmony_ci // Otherwise compute an estimate and perform single refinement iteration 921cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS]; 922cc1dc7a3Sopenharmony_ci 923cc1dc7a3Sopenharmony_ci // Compute an initial average for each decimated weight 924cc1dc7a3Sopenharmony_ci bool constant_wes = ei.is_constant_weight_error_scale; 925cc1dc7a3Sopenharmony_ci vfloat weight_error_scale(ei.weight_error_scale[0]); 926cc1dc7a3Sopenharmony_ci 927cc1dc7a3Sopenharmony_ci // This overshoots - this is OK as we initialize the array tails in the 928cc1dc7a3Sopenharmony_ci // decimation table structures to safe values ... 929cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 930cc1dc7a3Sopenharmony_ci { 931cc1dc7a3Sopenharmony_ci // Start with a small value to avoid div-by-zero later 932cc1dc7a3Sopenharmony_ci vfloat weight_weight(1e-10f); 933cc1dc7a3Sopenharmony_ci vfloat initial_weight = vfloat::zero(); 934cc1dc7a3Sopenharmony_ci 935cc1dc7a3Sopenharmony_ci // Accumulate error weighting of all the texels using this weight 936cc1dc7a3Sopenharmony_ci vint weight_texel_count(di.weight_texel_count + i); 937cc1dc7a3Sopenharmony_ci unsigned int max_texel_count = hmax(weight_texel_count).lane<0>(); 938cc1dc7a3Sopenharmony_ci promise(max_texel_count > 0); 939cc1dc7a3Sopenharmony_ci 940cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < max_texel_count; j++) 941cc1dc7a3Sopenharmony_ci { 942cc1dc7a3Sopenharmony_ci#ifdef ASTCENC_USE_COMMON_GATHERF 943cc1dc7a3Sopenharmony_ci const uint8_t* texel = di.weight_texels_tr[j] + i; 944cc1dc7a3Sopenharmony_ci#else 945cc1dc7a3Sopenharmony_ci vint texel(di.weight_texels_tr[j] + i); 946cc1dc7a3Sopenharmony_ci#endif 947cc1dc7a3Sopenharmony_ci vfloat weight = loada(di.weights_texel_contribs_tr[j] + i); 948cc1dc7a3Sopenharmony_ci 949cc1dc7a3Sopenharmony_ci if (!constant_wes) 950cc1dc7a3Sopenharmony_ci { 951cc1dc7a3Sopenharmony_ci weight_error_scale = gatherf(ei.weight_error_scale, texel); 952cc1dc7a3Sopenharmony_ci } 953cc1dc7a3Sopenharmony_ci 954cc1dc7a3Sopenharmony_ci vfloat contrib_weight = weight * weight_error_scale; 955cc1dc7a3Sopenharmony_ci 956cc1dc7a3Sopenharmony_ci weight_weight += contrib_weight; 957cc1dc7a3Sopenharmony_ci initial_weight += gatherf(ei.weights, texel) * contrib_weight; 958cc1dc7a3Sopenharmony_ci } 959cc1dc7a3Sopenharmony_ci 960cc1dc7a3Sopenharmony_ci storea(initial_weight / weight_weight, dec_weight_ideal_value + i); 961cc1dc7a3Sopenharmony_ci } 962cc1dc7a3Sopenharmony_ci 963cc1dc7a3Sopenharmony_ci // Populate the interpolated weight grid based on the initial average 964cc1dc7a3Sopenharmony_ci // Process SIMD-width texel coordinates at at time while we can. Safe to 965cc1dc7a3Sopenharmony_ci // over-process full SIMD vectors - the tail is zeroed. 966cc1dc7a3Sopenharmony_ci if (di.max_texel_weight_count <= 2) 967cc1dc7a3Sopenharmony_ci { 968cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 969cc1dc7a3Sopenharmony_ci { 970cc1dc7a3Sopenharmony_ci vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i); 971cc1dc7a3Sopenharmony_ci storea(weight, infilled_weights + i); 972cc1dc7a3Sopenharmony_ci } 973cc1dc7a3Sopenharmony_ci } 974cc1dc7a3Sopenharmony_ci else 975cc1dc7a3Sopenharmony_ci { 976cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 977cc1dc7a3Sopenharmony_ci { 978cc1dc7a3Sopenharmony_ci vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i); 979cc1dc7a3Sopenharmony_ci storea(weight, infilled_weights + i); 980cc1dc7a3Sopenharmony_ci } 981cc1dc7a3Sopenharmony_ci } 982cc1dc7a3Sopenharmony_ci 983cc1dc7a3Sopenharmony_ci // Perform a single iteration of refinement 984cc1dc7a3Sopenharmony_ci // Empirically determined step size; larger values don't help but smaller drops image quality 985cc1dc7a3Sopenharmony_ci constexpr float stepsize = 0.25f; 986cc1dc7a3Sopenharmony_ci constexpr float chd_scale = -WEIGHTS_TEXEL_SUM; 987cc1dc7a3Sopenharmony_ci 988cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 989cc1dc7a3Sopenharmony_ci { 990cc1dc7a3Sopenharmony_ci vfloat weight_val = loada(dec_weight_ideal_value + i); 991cc1dc7a3Sopenharmony_ci 992cc1dc7a3Sopenharmony_ci // Accumulate error weighting of all the texels using this weight 993cc1dc7a3Sopenharmony_ci // Start with a small value to avoid div-by-zero later 994cc1dc7a3Sopenharmony_ci vfloat error_change0(1e-10f); 995cc1dc7a3Sopenharmony_ci vfloat error_change1(0.0f); 996cc1dc7a3Sopenharmony_ci 997cc1dc7a3Sopenharmony_ci // Accumulate error weighting of all the texels using this weight 998cc1dc7a3Sopenharmony_ci vint weight_texel_count(di.weight_texel_count + i); 999cc1dc7a3Sopenharmony_ci unsigned int max_texel_count = hmax(weight_texel_count).lane<0>(); 1000cc1dc7a3Sopenharmony_ci promise(max_texel_count > 0); 1001cc1dc7a3Sopenharmony_ci 1002cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < max_texel_count; j++) 1003cc1dc7a3Sopenharmony_ci { 1004cc1dc7a3Sopenharmony_ci#ifdef ASTCENC_USE_COMMON_GATHERF 1005cc1dc7a3Sopenharmony_ci const uint8_t* texel = di.weight_texels_tr[j] + i; 1006cc1dc7a3Sopenharmony_ci#else 1007cc1dc7a3Sopenharmony_ci vint texel(di.weight_texels_tr[j] + i); 1008cc1dc7a3Sopenharmony_ci#endif 1009cc1dc7a3Sopenharmony_ci vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i); 1010cc1dc7a3Sopenharmony_ci 1011cc1dc7a3Sopenharmony_ci if (!constant_wes) 1012cc1dc7a3Sopenharmony_ci { 1013cc1dc7a3Sopenharmony_ci weight_error_scale = gatherf(ei.weight_error_scale, texel); 1014cc1dc7a3Sopenharmony_ci } 1015cc1dc7a3Sopenharmony_ci 1016cc1dc7a3Sopenharmony_ci vfloat scale = weight_error_scale * contrib_weight; 1017cc1dc7a3Sopenharmony_ci vfloat old_weight = gatherf(infilled_weights, texel); 1018cc1dc7a3Sopenharmony_ci vfloat ideal_weight = gatherf(ei.weights, texel); 1019cc1dc7a3Sopenharmony_ci 1020cc1dc7a3Sopenharmony_ci error_change0 += contrib_weight * scale; 1021cc1dc7a3Sopenharmony_ci error_change1 += (old_weight - ideal_weight) * scale; 1022cc1dc7a3Sopenharmony_ci } 1023cc1dc7a3Sopenharmony_ci 1024cc1dc7a3Sopenharmony_ci vfloat step = (error_change1 * chd_scale) / error_change0; 1025cc1dc7a3Sopenharmony_ci step = clamp(-stepsize, stepsize, step); 1026cc1dc7a3Sopenharmony_ci 1027cc1dc7a3Sopenharmony_ci // Update the weight; note this can store negative values 1028cc1dc7a3Sopenharmony_ci storea(weight_val + step, dec_weight_ideal_value + i); 1029cc1dc7a3Sopenharmony_ci } 1030cc1dc7a3Sopenharmony_ci} 1031cc1dc7a3Sopenharmony_ci 1032cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 1033cc1dc7a3Sopenharmony_civoid compute_quantized_weights_for_decimation( 1034cc1dc7a3Sopenharmony_ci const decimation_info& di, 1035cc1dc7a3Sopenharmony_ci float low_bound, 1036cc1dc7a3Sopenharmony_ci float high_bound, 1037cc1dc7a3Sopenharmony_ci const float* dec_weight_ideal_value, 1038cc1dc7a3Sopenharmony_ci float* weight_set_out, 1039cc1dc7a3Sopenharmony_ci uint8_t* quantized_weight_set, 1040cc1dc7a3Sopenharmony_ci quant_method quant_level 1041cc1dc7a3Sopenharmony_ci) { 1042cc1dc7a3Sopenharmony_ci int weight_count = di.weight_count; 1043cc1dc7a3Sopenharmony_ci promise(weight_count > 0); 1044cc1dc7a3Sopenharmony_ci const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level]; 1045cc1dc7a3Sopenharmony_ci 1046cc1dc7a3Sopenharmony_ci // The available quant levels, stored with a minus 1 bias 1047cc1dc7a3Sopenharmony_ci static const float quant_levels_m1[12] { 1048cc1dc7a3Sopenharmony_ci 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f 1049cc1dc7a3Sopenharmony_ci }; 1050cc1dc7a3Sopenharmony_ci 1051cc1dc7a3Sopenharmony_ci vint steps_m1(get_quant_level(quant_level) - 1); 1052cc1dc7a3Sopenharmony_ci float quant_level_m1 = quant_levels_m1[quant_level]; 1053cc1dc7a3Sopenharmony_ci 1054cc1dc7a3Sopenharmony_ci // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds 1055cc1dc7a3Sopenharmony_ci 1056cc1dc7a3Sopenharmony_ci // TODO: Oddity to investigate; triggered by test in issue #265. 1057cc1dc7a3Sopenharmony_ci if (high_bound <= low_bound) 1058cc1dc7a3Sopenharmony_ci { 1059cc1dc7a3Sopenharmony_ci low_bound = 0.0f; 1060cc1dc7a3Sopenharmony_ci high_bound = 1.0f; 1061cc1dc7a3Sopenharmony_ci } 1062cc1dc7a3Sopenharmony_ci 1063cc1dc7a3Sopenharmony_ci float rscale = high_bound - low_bound; 1064cc1dc7a3Sopenharmony_ci float scale = 1.0f / rscale; 1065cc1dc7a3Sopenharmony_ci 1066cc1dc7a3Sopenharmony_ci float scaled_low_bound = low_bound * scale; 1067cc1dc7a3Sopenharmony_ci rscale *= 1.0f / 64.0f; 1068cc1dc7a3Sopenharmony_ci 1069cc1dc7a3Sopenharmony_ci vfloat scalev(scale); 1070cc1dc7a3Sopenharmony_ci vfloat scaled_low_boundv(scaled_low_bound); 1071cc1dc7a3Sopenharmony_ci vfloat quant_level_m1v(quant_level_m1); 1072cc1dc7a3Sopenharmony_ci vfloat rscalev(rscale); 1073cc1dc7a3Sopenharmony_ci vfloat low_boundv(low_bound); 1074cc1dc7a3Sopenharmony_ci 1075cc1dc7a3Sopenharmony_ci // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known 1076cc1dc7a3Sopenharmony_ci // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements 1077cc1dc7a3Sopenharmony_ci if (get_quant_level(quant_level) <= 16) 1078cc1dc7a3Sopenharmony_ci { 1079cc1dc7a3Sopenharmony_ci vint4 tab0 = vint4::load(qat.quant_to_unquant); 1080cc1dc7a3Sopenharmony_ci vint tab0p; 1081cc1dc7a3Sopenharmony_ci vtable_prepare(tab0, tab0p); 1082cc1dc7a3Sopenharmony_ci 1083cc1dc7a3Sopenharmony_ci for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 1084cc1dc7a3Sopenharmony_ci { 1085cc1dc7a3Sopenharmony_ci vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; 1086cc1dc7a3Sopenharmony_ci ix = clampzo(ix); 1087cc1dc7a3Sopenharmony_ci 1088cc1dc7a3Sopenharmony_ci // Look up the two closest indexes and return the one that was closest 1089cc1dc7a3Sopenharmony_ci vfloat ix1 = ix * quant_level_m1v; 1090cc1dc7a3Sopenharmony_ci 1091cc1dc7a3Sopenharmony_ci vint weightl = float_to_int(ix1); 1092cc1dc7a3Sopenharmony_ci vint weighth = min(weightl + vint(1), steps_m1); 1093cc1dc7a3Sopenharmony_ci 1094cc1dc7a3Sopenharmony_ci vint ixli = vtable_8bt_32bi(tab0p, weightl); 1095cc1dc7a3Sopenharmony_ci vint ixhi = vtable_8bt_32bi(tab0p, weighth); 1096cc1dc7a3Sopenharmony_ci 1097cc1dc7a3Sopenharmony_ci vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix); 1098cc1dc7a3Sopenharmony_ci vint weight = select(ixli, ixhi, mask); 1099cc1dc7a3Sopenharmony_ci vfloat ixl = int_to_float(weight); 1100cc1dc7a3Sopenharmony_ci 1101cc1dc7a3Sopenharmony_ci // Invert the weight-scaling that was done initially 1102cc1dc7a3Sopenharmony_ci storea(ixl * rscalev + low_boundv, weight_set_out + i); 1103cc1dc7a3Sopenharmony_ci vint scn = pack_low_bytes(weight); 1104cc1dc7a3Sopenharmony_ci store_nbytes(scn, quantized_weight_set + i); 1105cc1dc7a3Sopenharmony_ci } 1106cc1dc7a3Sopenharmony_ci } 1107cc1dc7a3Sopenharmony_ci else 1108cc1dc7a3Sopenharmony_ci { 1109cc1dc7a3Sopenharmony_ci vint4 tab0 = vint4::load(qat.quant_to_unquant + 0); 1110cc1dc7a3Sopenharmony_ci vint4 tab1 = vint4::load(qat.quant_to_unquant + 16); 1111cc1dc7a3Sopenharmony_ci vint tab0p, tab1p; 1112cc1dc7a3Sopenharmony_ci vtable_prepare(tab0, tab1, tab0p, tab1p); 1113cc1dc7a3Sopenharmony_ci 1114cc1dc7a3Sopenharmony_ci for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 1115cc1dc7a3Sopenharmony_ci { 1116cc1dc7a3Sopenharmony_ci vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; 1117cc1dc7a3Sopenharmony_ci ix = clampzo(ix); 1118cc1dc7a3Sopenharmony_ci 1119cc1dc7a3Sopenharmony_ci // Look up the two closest indexes and return the one that was closest 1120cc1dc7a3Sopenharmony_ci vfloat ix1 = ix * quant_level_m1v; 1121cc1dc7a3Sopenharmony_ci 1122cc1dc7a3Sopenharmony_ci vint weightl = float_to_int(ix1); 1123cc1dc7a3Sopenharmony_ci vint weighth = min(weightl + vint(1), steps_m1); 1124cc1dc7a3Sopenharmony_ci 1125cc1dc7a3Sopenharmony_ci vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); 1126cc1dc7a3Sopenharmony_ci vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); 1127cc1dc7a3Sopenharmony_ci 1128cc1dc7a3Sopenharmony_ci vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix); 1129cc1dc7a3Sopenharmony_ci vint weight = select(ixli, ixhi, mask); 1130cc1dc7a3Sopenharmony_ci vfloat ixl = int_to_float(weight); 1131cc1dc7a3Sopenharmony_ci 1132cc1dc7a3Sopenharmony_ci // Invert the weight-scaling that was done initially 1133cc1dc7a3Sopenharmony_ci storea(ixl * rscalev + low_boundv, weight_set_out + i); 1134cc1dc7a3Sopenharmony_ci vint scn = pack_low_bytes(weight); 1135cc1dc7a3Sopenharmony_ci store_nbytes(scn, quantized_weight_set + i); 1136cc1dc7a3Sopenharmony_ci } 1137cc1dc7a3Sopenharmony_ci } 1138cc1dc7a3Sopenharmony_ci} 1139cc1dc7a3Sopenharmony_ci 1140cc1dc7a3Sopenharmony_ci/** 1141cc1dc7a3Sopenharmony_ci * @brief Compute the RGB + offset for a HDR endpoint mode #7. 1142cc1dc7a3Sopenharmony_ci * 1143cc1dc7a3Sopenharmony_ci * Since the matrix needed has a regular structure we can simplify the inverse calculation. This 1144cc1dc7a3Sopenharmony_ci * gives us ~24 multiplications vs. 96 for a generic inverse. 1145cc1dc7a3Sopenharmony_ci * 1146cc1dc7a3Sopenharmony_ci * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x); 1147cc1dc7a3Sopenharmony_ci * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y); 1148cc1dc7a3Sopenharmony_ci * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z); 1149cc1dc7a3Sopenharmony_ci * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum); 1150cc1dc7a3Sopenharmony_ci * mat = invert(mat); 1151cc1dc7a3Sopenharmony_ci * 1152cc1dc7a3Sopenharmony_ci * @param rgba_weight_sum Sum of partition component error weights. 1153cc1dc7a3Sopenharmony_ci * @param weight_weight_sum Sum of partition component error weights * texel weight. 1154cc1dc7a3Sopenharmony_ci * @param rgbq_sum Sum of partition component error weights * texel weight * color data. 1155cc1dc7a3Sopenharmony_ci * @param psum Sum of RGB color weights * texel weight^2. 1156cc1dc7a3Sopenharmony_ci */ 1157cc1dc7a3Sopenharmony_cistatic inline vfloat4 compute_rgbo_vector( 1158cc1dc7a3Sopenharmony_ci vfloat4 rgba_weight_sum, 1159cc1dc7a3Sopenharmony_ci vfloat4 weight_weight_sum, 1160cc1dc7a3Sopenharmony_ci vfloat4 rgbq_sum, 1161cc1dc7a3Sopenharmony_ci float psum 1162cc1dc7a3Sopenharmony_ci) { 1163cc1dc7a3Sopenharmony_ci float X = rgba_weight_sum.lane<0>(); 1164cc1dc7a3Sopenharmony_ci float Y = rgba_weight_sum.lane<1>(); 1165cc1dc7a3Sopenharmony_ci float Z = rgba_weight_sum.lane<2>(); 1166cc1dc7a3Sopenharmony_ci float P = weight_weight_sum.lane<0>(); 1167cc1dc7a3Sopenharmony_ci float Q = weight_weight_sum.lane<1>(); 1168cc1dc7a3Sopenharmony_ci float R = weight_weight_sum.lane<2>(); 1169cc1dc7a3Sopenharmony_ci float S = psum; 1170cc1dc7a3Sopenharmony_ci 1171cc1dc7a3Sopenharmony_ci float PP = P * P; 1172cc1dc7a3Sopenharmony_ci float QQ = Q * Q; 1173cc1dc7a3Sopenharmony_ci float RR = R * R; 1174cc1dc7a3Sopenharmony_ci 1175cc1dc7a3Sopenharmony_ci float SZmRR = S * Z - RR; 1176cc1dc7a3Sopenharmony_ci float DT = SZmRR * Y - Z * QQ; 1177cc1dc7a3Sopenharmony_ci float YP = Y * P; 1178cc1dc7a3Sopenharmony_ci float QX = Q * X; 1179cc1dc7a3Sopenharmony_ci float YX = Y * X; 1180cc1dc7a3Sopenharmony_ci float mZYP = -Z * YP; 1181cc1dc7a3Sopenharmony_ci float mZQX = -Z * QX; 1182cc1dc7a3Sopenharmony_ci float mRYX = -R * YX; 1183cc1dc7a3Sopenharmony_ci float ZQP = Z * Q * P; 1184cc1dc7a3Sopenharmony_ci float RYP = R * YP; 1185cc1dc7a3Sopenharmony_ci float RQX = R * QX; 1186cc1dc7a3Sopenharmony_ci 1187cc1dc7a3Sopenharmony_ci // Compute the reciprocal of matrix determinant 1188cc1dc7a3Sopenharmony_ci float rdet = 1.0f / (DT * X + mZYP * P); 1189cc1dc7a3Sopenharmony_ci 1190cc1dc7a3Sopenharmony_ci // Actually compute the adjugate, and then apply 1/det separately 1191cc1dc7a3Sopenharmony_ci vfloat4 mat0(DT, ZQP, RYP, mZYP); 1192cc1dc7a3Sopenharmony_ci vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX); 1193cc1dc7a3Sopenharmony_ci vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX); 1194cc1dc7a3Sopenharmony_ci vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX); 1195cc1dc7a3Sopenharmony_ci vfloat4 vect = rgbq_sum * rdet; 1196cc1dc7a3Sopenharmony_ci 1197cc1dc7a3Sopenharmony_ci return vfloat4(dot_s(mat0, vect), 1198cc1dc7a3Sopenharmony_ci dot_s(mat1, vect), 1199cc1dc7a3Sopenharmony_ci dot_s(mat2, vect), 1200cc1dc7a3Sopenharmony_ci dot_s(mat3, vect)); 1201cc1dc7a3Sopenharmony_ci} 1202cc1dc7a3Sopenharmony_ci 1203cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 1204cc1dc7a3Sopenharmony_civoid recompute_ideal_colors_1plane( 1205cc1dc7a3Sopenharmony_ci const image_block& blk, 1206cc1dc7a3Sopenharmony_ci const partition_info& pi, 1207cc1dc7a3Sopenharmony_ci const decimation_info& di, 1208cc1dc7a3Sopenharmony_ci const uint8_t* dec_weights_uquant, 1209cc1dc7a3Sopenharmony_ci endpoints& ep, 1210cc1dc7a3Sopenharmony_ci vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS], 1211cc1dc7a3Sopenharmony_ci vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS] 1212cc1dc7a3Sopenharmony_ci) { 1213cc1dc7a3Sopenharmony_ci unsigned int weight_count = di.weight_count; 1214cc1dc7a3Sopenharmony_ci unsigned int total_texel_count = blk.texel_count; 1215cc1dc7a3Sopenharmony_ci unsigned int partition_count = pi.partition_count; 1216cc1dc7a3Sopenharmony_ci 1217cc1dc7a3Sopenharmony_ci promise(weight_count > 0); 1218cc1dc7a3Sopenharmony_ci promise(total_texel_count > 0); 1219cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 1220cc1dc7a3Sopenharmony_ci 1221cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS]; 1222cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 1223cc1dc7a3Sopenharmony_ci { 1224cc1dc7a3Sopenharmony_ci vint unquant_value(dec_weights_uquant + i); 1225cc1dc7a3Sopenharmony_ci vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f); 1226cc1dc7a3Sopenharmony_ci storea(unquant_valuef, dec_weight + i); 1227cc1dc7a3Sopenharmony_ci } 1228cc1dc7a3Sopenharmony_ci 1229cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS]; 1230cc1dc7a3Sopenharmony_ci float* undec_weight_ref; 1231cc1dc7a3Sopenharmony_ci if (di.max_texel_weight_count == 1) 1232cc1dc7a3Sopenharmony_ci { 1233cc1dc7a3Sopenharmony_ci undec_weight_ref = dec_weight; 1234cc1dc7a3Sopenharmony_ci } 1235cc1dc7a3Sopenharmony_ci else if (di.max_texel_weight_count <= 2) 1236cc1dc7a3Sopenharmony_ci { 1237cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) 1238cc1dc7a3Sopenharmony_ci { 1239cc1dc7a3Sopenharmony_ci vfloat weight = bilinear_infill_vla_2(di, dec_weight, i); 1240cc1dc7a3Sopenharmony_ci storea(weight, undec_weight + i); 1241cc1dc7a3Sopenharmony_ci } 1242cc1dc7a3Sopenharmony_ci 1243cc1dc7a3Sopenharmony_ci undec_weight_ref = undec_weight; 1244cc1dc7a3Sopenharmony_ci } 1245cc1dc7a3Sopenharmony_ci else 1246cc1dc7a3Sopenharmony_ci { 1247cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) 1248cc1dc7a3Sopenharmony_ci { 1249cc1dc7a3Sopenharmony_ci vfloat weight = bilinear_infill_vla(di, dec_weight, i); 1250cc1dc7a3Sopenharmony_ci storea(weight, undec_weight + i); 1251cc1dc7a3Sopenharmony_ci } 1252cc1dc7a3Sopenharmony_ci 1253cc1dc7a3Sopenharmony_ci undec_weight_ref = undec_weight; 1254cc1dc7a3Sopenharmony_ci } 1255cc1dc7a3Sopenharmony_ci 1256cc1dc7a3Sopenharmony_ci vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count)); 1257cc1dc7a3Sopenharmony_ci 1258cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 1259cc1dc7a3Sopenharmony_ci { 1260cc1dc7a3Sopenharmony_ci unsigned int texel_count = pi.partition_texel_count[i]; 1261cc1dc7a3Sopenharmony_ci const uint8_t *texel_indexes = pi.texels_of_partition[i]; 1262cc1dc7a3Sopenharmony_ci 1263cc1dc7a3Sopenharmony_ci // Only compute a partition mean if more than one partition 1264cc1dc7a3Sopenharmony_ci if (partition_count > 1) 1265cc1dc7a3Sopenharmony_ci { 1266cc1dc7a3Sopenharmony_ci rgba_sum = vfloat4::zero(); 1267cc1dc7a3Sopenharmony_ci promise(texel_count > 0); 1268cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < texel_count; j++) 1269cc1dc7a3Sopenharmony_ci { 1270cc1dc7a3Sopenharmony_ci unsigned int tix = texel_indexes[j]; 1271cc1dc7a3Sopenharmony_ci rgba_sum += blk.texel(tix); 1272cc1dc7a3Sopenharmony_ci } 1273cc1dc7a3Sopenharmony_ci } 1274cc1dc7a3Sopenharmony_ci 1275cc1dc7a3Sopenharmony_ci rgba_sum = rgba_sum * blk.channel_weight; 1276cc1dc7a3Sopenharmony_ci vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f); 1277cc1dc7a3Sopenharmony_ci vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>()); 1278cc1dc7a3Sopenharmony_ci 1279cc1dc7a3Sopenharmony_ci float scale_max = 0.0f; 1280cc1dc7a3Sopenharmony_ci float scale_min = 1e10f; 1281cc1dc7a3Sopenharmony_ci 1282cc1dc7a3Sopenharmony_ci float wmin1 = 1.0f; 1283cc1dc7a3Sopenharmony_ci float wmax1 = 0.0f; 1284cc1dc7a3Sopenharmony_ci 1285cc1dc7a3Sopenharmony_ci float left_sum_s = 0.0f; 1286cc1dc7a3Sopenharmony_ci float middle_sum_s = 0.0f; 1287cc1dc7a3Sopenharmony_ci float right_sum_s = 0.0f; 1288cc1dc7a3Sopenharmony_ci 1289cc1dc7a3Sopenharmony_ci vfloat4 color_vec_x = vfloat4::zero(); 1290cc1dc7a3Sopenharmony_ci vfloat4 color_vec_y = vfloat4::zero(); 1291cc1dc7a3Sopenharmony_ci 1292cc1dc7a3Sopenharmony_ci vfloat4 scale_vec = vfloat4::zero(); 1293cc1dc7a3Sopenharmony_ci 1294cc1dc7a3Sopenharmony_ci float weight_weight_sum_s = 1e-17f; 1295cc1dc7a3Sopenharmony_ci 1296cc1dc7a3Sopenharmony_ci vfloat4 color_weight = blk.channel_weight; 1297cc1dc7a3Sopenharmony_ci float ls_weight = hadd_rgb_s(color_weight); 1298cc1dc7a3Sopenharmony_ci 1299cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < texel_count; j++) 1300cc1dc7a3Sopenharmony_ci { 1301cc1dc7a3Sopenharmony_ci unsigned int tix = texel_indexes[j]; 1302cc1dc7a3Sopenharmony_ci vfloat4 rgba = blk.texel(tix); 1303cc1dc7a3Sopenharmony_ci 1304cc1dc7a3Sopenharmony_ci float idx0 = undec_weight_ref[tix]; 1305cc1dc7a3Sopenharmony_ci 1306cc1dc7a3Sopenharmony_ci float om_idx0 = 1.0f - idx0; 1307cc1dc7a3Sopenharmony_ci wmin1 = astc::min(idx0, wmin1); 1308cc1dc7a3Sopenharmony_ci wmax1 = astc::max(idx0, wmax1); 1309cc1dc7a3Sopenharmony_ci 1310cc1dc7a3Sopenharmony_ci float scale = dot3_s(scale_dir, rgba); 1311cc1dc7a3Sopenharmony_ci scale_min = astc::min(scale, scale_min); 1312cc1dc7a3Sopenharmony_ci scale_max = astc::max(scale, scale_max); 1313cc1dc7a3Sopenharmony_ci 1314cc1dc7a3Sopenharmony_ci left_sum_s += om_idx0 * om_idx0; 1315cc1dc7a3Sopenharmony_ci middle_sum_s += om_idx0 * idx0; 1316cc1dc7a3Sopenharmony_ci right_sum_s += idx0 * idx0; 1317cc1dc7a3Sopenharmony_ci weight_weight_sum_s += idx0; 1318cc1dc7a3Sopenharmony_ci 1319cc1dc7a3Sopenharmony_ci vfloat4 color_idx(idx0); 1320cc1dc7a3Sopenharmony_ci vfloat4 cwprod = rgba; 1321cc1dc7a3Sopenharmony_ci vfloat4 cwiprod = cwprod * color_idx; 1322cc1dc7a3Sopenharmony_ci 1323cc1dc7a3Sopenharmony_ci color_vec_y += cwiprod; 1324cc1dc7a3Sopenharmony_ci color_vec_x += cwprod - cwiprod; 1325cc1dc7a3Sopenharmony_ci 1326cc1dc7a3Sopenharmony_ci scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight); 1327cc1dc7a3Sopenharmony_ci } 1328cc1dc7a3Sopenharmony_ci 1329cc1dc7a3Sopenharmony_ci vfloat4 left_sum = vfloat4(left_sum_s) * color_weight; 1330cc1dc7a3Sopenharmony_ci vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight; 1331cc1dc7a3Sopenharmony_ci vfloat4 right_sum = vfloat4(right_sum_s) * color_weight; 1332cc1dc7a3Sopenharmony_ci vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight; 1333cc1dc7a3Sopenharmony_ci 1334cc1dc7a3Sopenharmony_ci color_vec_x = color_vec_x * color_weight; 1335cc1dc7a3Sopenharmony_ci color_vec_y = color_vec_y * color_weight; 1336cc1dc7a3Sopenharmony_ci 1337cc1dc7a3Sopenharmony_ci // Initialize the luminance and scale vectors with a reasonable default 1338cc1dc7a3Sopenharmony_ci float scalediv = scale_min / astc::max(scale_max, 1e-10f); 1339cc1dc7a3Sopenharmony_ci scalediv = astc::clamp1f(scalediv); 1340cc1dc7a3Sopenharmony_ci 1341cc1dc7a3Sopenharmony_ci vfloat4 sds = scale_dir * scale_max; 1342cc1dc7a3Sopenharmony_ci 1343cc1dc7a3Sopenharmony_ci rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); 1344cc1dc7a3Sopenharmony_ci 1345cc1dc7a3Sopenharmony_ci if (wmin1 >= wmax1 * 0.999f) 1346cc1dc7a3Sopenharmony_ci { 1347cc1dc7a3Sopenharmony_ci // If all weights in the partition were equal, then just take average of all colors in 1348cc1dc7a3Sopenharmony_ci // the partition and use that as both endpoint colors 1349cc1dc7a3Sopenharmony_ci vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; 1350cc1dc7a3Sopenharmony_ci 1351cc1dc7a3Sopenharmony_ci vmask4 notnan_mask = avg == avg; 1352cc1dc7a3Sopenharmony_ci ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask); 1353cc1dc7a3Sopenharmony_ci ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask); 1354cc1dc7a3Sopenharmony_ci 1355cc1dc7a3Sopenharmony_ci rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f); 1356cc1dc7a3Sopenharmony_ci } 1357cc1dc7a3Sopenharmony_ci else 1358cc1dc7a3Sopenharmony_ci { 1359cc1dc7a3Sopenharmony_ci // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given 1360cc1dc7a3Sopenharmony_ci // set of texel weights and pixel colors 1361cc1dc7a3Sopenharmony_ci vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum); 1362cc1dc7a3Sopenharmony_ci vfloat4 color_rdet1 = 1.0f / color_det1; 1363cc1dc7a3Sopenharmony_ci 1364cc1dc7a3Sopenharmony_ci float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); 1365cc1dc7a3Sopenharmony_ci float ls_rdet1 = 1.0f / ls_det1; 1366cc1dc7a3Sopenharmony_ci 1367cc1dc7a3Sopenharmony_ci vfloat4 color_mss1 = (left_sum * left_sum) 1368cc1dc7a3Sopenharmony_ci + (2.0f * middle_sum * middle_sum) 1369cc1dc7a3Sopenharmony_ci + (right_sum * right_sum); 1370cc1dc7a3Sopenharmony_ci 1371cc1dc7a3Sopenharmony_ci float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) 1372cc1dc7a3Sopenharmony_ci + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) 1373cc1dc7a3Sopenharmony_ci + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); 1374cc1dc7a3Sopenharmony_ci 1375cc1dc7a3Sopenharmony_ci vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1; 1376cc1dc7a3Sopenharmony_ci vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1; 1377cc1dc7a3Sopenharmony_ci 1378cc1dc7a3Sopenharmony_ci vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f); 1379cc1dc7a3Sopenharmony_ci vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); 1380cc1dc7a3Sopenharmony_ci vmask4 full_mask = det_mask & notnan_mask; 1381cc1dc7a3Sopenharmony_ci 1382cc1dc7a3Sopenharmony_ci ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask); 1383cc1dc7a3Sopenharmony_ci ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask); 1384cc1dc7a3Sopenharmony_ci 1385cc1dc7a3Sopenharmony_ci float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; 1386cc1dc7a3Sopenharmony_ci float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; 1387cc1dc7a3Sopenharmony_ci 1388cc1dc7a3Sopenharmony_ci if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) 1389cc1dc7a3Sopenharmony_ci { 1390cc1dc7a3Sopenharmony_ci float scalediv2 = scale_ep0 / scale_ep1; 1391cc1dc7a3Sopenharmony_ci vfloat4 sdsm = scale_dir * scale_ep1; 1392cc1dc7a3Sopenharmony_ci rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); 1393cc1dc7a3Sopenharmony_ci } 1394cc1dc7a3Sopenharmony_ci } 1395cc1dc7a3Sopenharmony_ci 1396cc1dc7a3Sopenharmony_ci // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR 1397cc1dc7a3Sopenharmony_ci if (blk.rgb_lns[0] || blk.alpha_lns[0]) 1398cc1dc7a3Sopenharmony_ci { 1399cc1dc7a3Sopenharmony_ci vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight; 1400cc1dc7a3Sopenharmony_ci float psum = right_sum_s * hadd_rgb_s(color_weight); 1401cc1dc7a3Sopenharmony_ci 1402cc1dc7a3Sopenharmony_ci vfloat4 rgbq_sum = color_vec_x + color_vec_y; 1403cc1dc7a3Sopenharmony_ci rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); 1404cc1dc7a3Sopenharmony_ci 1405cc1dc7a3Sopenharmony_ci vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); 1406cc1dc7a3Sopenharmony_ci rgbo_vectors[i] = rgbovec; 1407cc1dc7a3Sopenharmony_ci 1408cc1dc7a3Sopenharmony_ci // We can get a failure due to the use of a singular (non-invertible) matrix 1409cc1dc7a3Sopenharmony_ci // If it failed, compute rgbo_vectors[] with a different method ... 1410cc1dc7a3Sopenharmony_ci if (astc::isnan(dot_s(rgbovec, rgbovec))) 1411cc1dc7a3Sopenharmony_ci { 1412cc1dc7a3Sopenharmony_ci vfloat4 v0 = ep.endpt0[i]; 1413cc1dc7a3Sopenharmony_ci vfloat4 v1 = ep.endpt1[i]; 1414cc1dc7a3Sopenharmony_ci 1415cc1dc7a3Sopenharmony_ci float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); 1416cc1dc7a3Sopenharmony_ci avgdif = astc::max(avgdif, 0.0f); 1417cc1dc7a3Sopenharmony_ci 1418cc1dc7a3Sopenharmony_ci vfloat4 avg = (v0 + v1) * 0.5f; 1419cc1dc7a3Sopenharmony_ci vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; 1420cc1dc7a3Sopenharmony_ci rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); 1421cc1dc7a3Sopenharmony_ci } 1422cc1dc7a3Sopenharmony_ci } 1423cc1dc7a3Sopenharmony_ci } 1424cc1dc7a3Sopenharmony_ci} 1425cc1dc7a3Sopenharmony_ci 1426cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 1427cc1dc7a3Sopenharmony_civoid recompute_ideal_colors_2planes( 1428cc1dc7a3Sopenharmony_ci const image_block& blk, 1429cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 1430cc1dc7a3Sopenharmony_ci const decimation_info& di, 1431cc1dc7a3Sopenharmony_ci const uint8_t* dec_weights_uquant_plane1, 1432cc1dc7a3Sopenharmony_ci const uint8_t* dec_weights_uquant_plane2, 1433cc1dc7a3Sopenharmony_ci endpoints& ep, 1434cc1dc7a3Sopenharmony_ci vfloat4& rgbs_vector, 1435cc1dc7a3Sopenharmony_ci vfloat4& rgbo_vector, 1436cc1dc7a3Sopenharmony_ci int plane2_component 1437cc1dc7a3Sopenharmony_ci) { 1438cc1dc7a3Sopenharmony_ci unsigned int weight_count = di.weight_count; 1439cc1dc7a3Sopenharmony_ci unsigned int total_texel_count = blk.texel_count; 1440cc1dc7a3Sopenharmony_ci 1441cc1dc7a3Sopenharmony_ci promise(total_texel_count > 0); 1442cc1dc7a3Sopenharmony_ci promise(weight_count > 0); 1443cc1dc7a3Sopenharmony_ci 1444cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; 1445cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; 1446cc1dc7a3Sopenharmony_ci 1447cc1dc7a3Sopenharmony_ci assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); 1448cc1dc7a3Sopenharmony_ci 1449cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) 1450cc1dc7a3Sopenharmony_ci { 1451cc1dc7a3Sopenharmony_ci vint unquant_value1(dec_weights_uquant_plane1 + i); 1452cc1dc7a3Sopenharmony_ci vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f); 1453cc1dc7a3Sopenharmony_ci storea(unquant_value1f, dec_weight_plane1 + i); 1454cc1dc7a3Sopenharmony_ci 1455cc1dc7a3Sopenharmony_ci vint unquant_value2(dec_weights_uquant_plane2 + i); 1456cc1dc7a3Sopenharmony_ci vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f); 1457cc1dc7a3Sopenharmony_ci storea(unquant_value2f, dec_weight_plane2 + i); 1458cc1dc7a3Sopenharmony_ci } 1459cc1dc7a3Sopenharmony_ci 1460cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS]; 1461cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS]; 1462cc1dc7a3Sopenharmony_ci 1463cc1dc7a3Sopenharmony_ci float* undec_weight_plane1_ref; 1464cc1dc7a3Sopenharmony_ci float* undec_weight_plane2_ref; 1465cc1dc7a3Sopenharmony_ci 1466cc1dc7a3Sopenharmony_ci if (di.max_texel_weight_count == 1) 1467cc1dc7a3Sopenharmony_ci { 1468cc1dc7a3Sopenharmony_ci undec_weight_plane1_ref = dec_weight_plane1; 1469cc1dc7a3Sopenharmony_ci undec_weight_plane2_ref = dec_weight_plane2; 1470cc1dc7a3Sopenharmony_ci } 1471cc1dc7a3Sopenharmony_ci else if (di.max_texel_weight_count <= 2) 1472cc1dc7a3Sopenharmony_ci { 1473cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) 1474cc1dc7a3Sopenharmony_ci { 1475cc1dc7a3Sopenharmony_ci vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i); 1476cc1dc7a3Sopenharmony_ci storea(weight, undec_weight_plane1 + i); 1477cc1dc7a3Sopenharmony_ci 1478cc1dc7a3Sopenharmony_ci weight = bilinear_infill_vla_2(di, dec_weight_plane2, i); 1479cc1dc7a3Sopenharmony_ci storea(weight, undec_weight_plane2 + i); 1480cc1dc7a3Sopenharmony_ci } 1481cc1dc7a3Sopenharmony_ci 1482cc1dc7a3Sopenharmony_ci undec_weight_plane1_ref = undec_weight_plane1; 1483cc1dc7a3Sopenharmony_ci undec_weight_plane2_ref = undec_weight_plane2; 1484cc1dc7a3Sopenharmony_ci } 1485cc1dc7a3Sopenharmony_ci else 1486cc1dc7a3Sopenharmony_ci { 1487cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) 1488cc1dc7a3Sopenharmony_ci { 1489cc1dc7a3Sopenharmony_ci vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i); 1490cc1dc7a3Sopenharmony_ci storea(weight, undec_weight_plane1 + i); 1491cc1dc7a3Sopenharmony_ci 1492cc1dc7a3Sopenharmony_ci weight = bilinear_infill_vla(di, dec_weight_plane2, i); 1493cc1dc7a3Sopenharmony_ci storea(weight, undec_weight_plane2 + i); 1494cc1dc7a3Sopenharmony_ci } 1495cc1dc7a3Sopenharmony_ci 1496cc1dc7a3Sopenharmony_ci undec_weight_plane1_ref = undec_weight_plane1; 1497cc1dc7a3Sopenharmony_ci undec_weight_plane2_ref = undec_weight_plane2; 1498cc1dc7a3Sopenharmony_ci } 1499cc1dc7a3Sopenharmony_ci 1500cc1dc7a3Sopenharmony_ci unsigned int texel_count = bsd.texel_count; 1501cc1dc7a3Sopenharmony_ci vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f); 1502cc1dc7a3Sopenharmony_ci vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>()); 1503cc1dc7a3Sopenharmony_ci 1504cc1dc7a3Sopenharmony_ci float scale_max = 0.0f; 1505cc1dc7a3Sopenharmony_ci float scale_min = 1e10f; 1506cc1dc7a3Sopenharmony_ci 1507cc1dc7a3Sopenharmony_ci float wmin1 = 1.0f; 1508cc1dc7a3Sopenharmony_ci float wmax1 = 0.0f; 1509cc1dc7a3Sopenharmony_ci 1510cc1dc7a3Sopenharmony_ci float wmin2 = 1.0f; 1511cc1dc7a3Sopenharmony_ci float wmax2 = 0.0f; 1512cc1dc7a3Sopenharmony_ci 1513cc1dc7a3Sopenharmony_ci float left1_sum_s = 0.0f; 1514cc1dc7a3Sopenharmony_ci float middle1_sum_s = 0.0f; 1515cc1dc7a3Sopenharmony_ci float right1_sum_s = 0.0f; 1516cc1dc7a3Sopenharmony_ci 1517cc1dc7a3Sopenharmony_ci float left2_sum_s = 0.0f; 1518cc1dc7a3Sopenharmony_ci float middle2_sum_s = 0.0f; 1519cc1dc7a3Sopenharmony_ci float right2_sum_s = 0.0f; 1520cc1dc7a3Sopenharmony_ci 1521cc1dc7a3Sopenharmony_ci vfloat4 color_vec_x = vfloat4::zero(); 1522cc1dc7a3Sopenharmony_ci vfloat4 color_vec_y = vfloat4::zero(); 1523cc1dc7a3Sopenharmony_ci 1524cc1dc7a3Sopenharmony_ci vfloat4 scale_vec = vfloat4::zero(); 1525cc1dc7a3Sopenharmony_ci 1526cc1dc7a3Sopenharmony_ci vfloat4 weight_weight_sum = vfloat4(1e-17f); 1527cc1dc7a3Sopenharmony_ci 1528cc1dc7a3Sopenharmony_ci vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component); 1529cc1dc7a3Sopenharmony_ci vfloat4 color_weight = blk.channel_weight; 1530cc1dc7a3Sopenharmony_ci float ls_weight = hadd_rgb_s(color_weight); 1531cc1dc7a3Sopenharmony_ci 1532cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < texel_count; j++) 1533cc1dc7a3Sopenharmony_ci { 1534cc1dc7a3Sopenharmony_ci vfloat4 rgba = blk.texel(j); 1535cc1dc7a3Sopenharmony_ci 1536cc1dc7a3Sopenharmony_ci float idx0 = undec_weight_plane1_ref[j]; 1537cc1dc7a3Sopenharmony_ci 1538cc1dc7a3Sopenharmony_ci float om_idx0 = 1.0f - idx0; 1539cc1dc7a3Sopenharmony_ci wmin1 = astc::min(idx0, wmin1); 1540cc1dc7a3Sopenharmony_ci wmax1 = astc::max(idx0, wmax1); 1541cc1dc7a3Sopenharmony_ci 1542cc1dc7a3Sopenharmony_ci float scale = dot3_s(scale_dir, rgba); 1543cc1dc7a3Sopenharmony_ci scale_min = astc::min(scale, scale_min); 1544cc1dc7a3Sopenharmony_ci scale_max = astc::max(scale, scale_max); 1545cc1dc7a3Sopenharmony_ci 1546cc1dc7a3Sopenharmony_ci left1_sum_s += om_idx0 * om_idx0; 1547cc1dc7a3Sopenharmony_ci middle1_sum_s += om_idx0 * idx0; 1548cc1dc7a3Sopenharmony_ci right1_sum_s += idx0 * idx0; 1549cc1dc7a3Sopenharmony_ci 1550cc1dc7a3Sopenharmony_ci float idx1 = undec_weight_plane2_ref[j]; 1551cc1dc7a3Sopenharmony_ci 1552cc1dc7a3Sopenharmony_ci float om_idx1 = 1.0f - idx1; 1553cc1dc7a3Sopenharmony_ci wmin2 = astc::min(idx1, wmin2); 1554cc1dc7a3Sopenharmony_ci wmax2 = astc::max(idx1, wmax2); 1555cc1dc7a3Sopenharmony_ci 1556cc1dc7a3Sopenharmony_ci left2_sum_s += om_idx1 * om_idx1; 1557cc1dc7a3Sopenharmony_ci middle2_sum_s += om_idx1 * idx1; 1558cc1dc7a3Sopenharmony_ci right2_sum_s += idx1 * idx1; 1559cc1dc7a3Sopenharmony_ci 1560cc1dc7a3Sopenharmony_ci vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask); 1561cc1dc7a3Sopenharmony_ci 1562cc1dc7a3Sopenharmony_ci vfloat4 cwprod = rgba; 1563cc1dc7a3Sopenharmony_ci vfloat4 cwiprod = cwprod * color_idx; 1564cc1dc7a3Sopenharmony_ci 1565cc1dc7a3Sopenharmony_ci color_vec_y += cwiprod; 1566cc1dc7a3Sopenharmony_ci color_vec_x += cwprod - cwiprod; 1567cc1dc7a3Sopenharmony_ci 1568cc1dc7a3Sopenharmony_ci scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale); 1569cc1dc7a3Sopenharmony_ci weight_weight_sum += color_idx; 1570cc1dc7a3Sopenharmony_ci } 1571cc1dc7a3Sopenharmony_ci 1572cc1dc7a3Sopenharmony_ci vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight; 1573cc1dc7a3Sopenharmony_ci vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight; 1574cc1dc7a3Sopenharmony_ci vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight; 1575cc1dc7a3Sopenharmony_ci vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight; 1576cc1dc7a3Sopenharmony_ci 1577cc1dc7a3Sopenharmony_ci vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight; 1578cc1dc7a3Sopenharmony_ci vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight; 1579cc1dc7a3Sopenharmony_ci vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight; 1580cc1dc7a3Sopenharmony_ci 1581cc1dc7a3Sopenharmony_ci color_vec_x = color_vec_x * color_weight; 1582cc1dc7a3Sopenharmony_ci color_vec_y = color_vec_y * color_weight; 1583cc1dc7a3Sopenharmony_ci 1584cc1dc7a3Sopenharmony_ci // Initialize the luminance and scale vectors with a reasonable default 1585cc1dc7a3Sopenharmony_ci float scalediv = scale_min / astc::max(scale_max, 1e-10f); 1586cc1dc7a3Sopenharmony_ci scalediv = astc::clamp1f(scalediv); 1587cc1dc7a3Sopenharmony_ci 1588cc1dc7a3Sopenharmony_ci vfloat4 sds = scale_dir * scale_max; 1589cc1dc7a3Sopenharmony_ci 1590cc1dc7a3Sopenharmony_ci rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); 1591cc1dc7a3Sopenharmony_ci 1592cc1dc7a3Sopenharmony_ci if (wmin1 >= wmax1 * 0.999f) 1593cc1dc7a3Sopenharmony_ci { 1594cc1dc7a3Sopenharmony_ci // If all weights in the partition were equal, then just take average of all colors in 1595cc1dc7a3Sopenharmony_ci // the partition and use that as both endpoint colors 1596cc1dc7a3Sopenharmony_ci vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; 1597cc1dc7a3Sopenharmony_ci 1598cc1dc7a3Sopenharmony_ci vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); 1599cc1dc7a3Sopenharmony_ci vmask4 notnan_mask = avg == avg; 1600cc1dc7a3Sopenharmony_ci vmask4 full_mask = p1_mask & notnan_mask; 1601cc1dc7a3Sopenharmony_ci 1602cc1dc7a3Sopenharmony_ci ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask); 1603cc1dc7a3Sopenharmony_ci ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask); 1604cc1dc7a3Sopenharmony_ci 1605cc1dc7a3Sopenharmony_ci rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f); 1606cc1dc7a3Sopenharmony_ci } 1607cc1dc7a3Sopenharmony_ci else 1608cc1dc7a3Sopenharmony_ci { 1609cc1dc7a3Sopenharmony_ci // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given 1610cc1dc7a3Sopenharmony_ci // set of texel weights and pixel colors 1611cc1dc7a3Sopenharmony_ci vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum); 1612cc1dc7a3Sopenharmony_ci vfloat4 color_rdet1 = 1.0f / color_det1; 1613cc1dc7a3Sopenharmony_ci 1614cc1dc7a3Sopenharmony_ci float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); 1615cc1dc7a3Sopenharmony_ci float ls_rdet1 = 1.0f / ls_det1; 1616cc1dc7a3Sopenharmony_ci 1617cc1dc7a3Sopenharmony_ci vfloat4 color_mss1 = (left1_sum * left1_sum) 1618cc1dc7a3Sopenharmony_ci + (2.0f * middle1_sum * middle1_sum) 1619cc1dc7a3Sopenharmony_ci + (right1_sum * right1_sum); 1620cc1dc7a3Sopenharmony_ci 1621cc1dc7a3Sopenharmony_ci float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) 1622cc1dc7a3Sopenharmony_ci + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) 1623cc1dc7a3Sopenharmony_ci + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); 1624cc1dc7a3Sopenharmony_ci 1625cc1dc7a3Sopenharmony_ci vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1; 1626cc1dc7a3Sopenharmony_ci vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1; 1627cc1dc7a3Sopenharmony_ci 1628cc1dc7a3Sopenharmony_ci float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; 1629cc1dc7a3Sopenharmony_ci float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; 1630cc1dc7a3Sopenharmony_ci 1631cc1dc7a3Sopenharmony_ci vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); 1632cc1dc7a3Sopenharmony_ci vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f); 1633cc1dc7a3Sopenharmony_ci vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); 1634cc1dc7a3Sopenharmony_ci vmask4 full_mask = p1_mask & det_mask & notnan_mask; 1635cc1dc7a3Sopenharmony_ci 1636cc1dc7a3Sopenharmony_ci ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask); 1637cc1dc7a3Sopenharmony_ci ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); 1638cc1dc7a3Sopenharmony_ci 1639cc1dc7a3Sopenharmony_ci if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) 1640cc1dc7a3Sopenharmony_ci { 1641cc1dc7a3Sopenharmony_ci float scalediv2 = scale_ep0 / scale_ep1; 1642cc1dc7a3Sopenharmony_ci vfloat4 sdsm = scale_dir * scale_ep1; 1643cc1dc7a3Sopenharmony_ci rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); 1644cc1dc7a3Sopenharmony_ci } 1645cc1dc7a3Sopenharmony_ci } 1646cc1dc7a3Sopenharmony_ci 1647cc1dc7a3Sopenharmony_ci if (wmin2 >= wmax2 * 0.999f) 1648cc1dc7a3Sopenharmony_ci { 1649cc1dc7a3Sopenharmony_ci // If all weights in the partition were equal, then just take average of all colors in 1650cc1dc7a3Sopenharmony_ci // the partition and use that as both endpoint colors 1651cc1dc7a3Sopenharmony_ci vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; 1652cc1dc7a3Sopenharmony_ci 1653cc1dc7a3Sopenharmony_ci vmask4 notnan_mask = avg == avg; 1654cc1dc7a3Sopenharmony_ci vmask4 full_mask = p2_mask & notnan_mask; 1655cc1dc7a3Sopenharmony_ci 1656cc1dc7a3Sopenharmony_ci ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask); 1657cc1dc7a3Sopenharmony_ci ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask); 1658cc1dc7a3Sopenharmony_ci } 1659cc1dc7a3Sopenharmony_ci else 1660cc1dc7a3Sopenharmony_ci { 1661cc1dc7a3Sopenharmony_ci // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given 1662cc1dc7a3Sopenharmony_ci // set of texel weights and pixel colors 1663cc1dc7a3Sopenharmony_ci vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum); 1664cc1dc7a3Sopenharmony_ci vfloat4 color_rdet2 = 1.0f / color_det2; 1665cc1dc7a3Sopenharmony_ci 1666cc1dc7a3Sopenharmony_ci vfloat4 color_mss2 = (left2_sum * left2_sum) 1667cc1dc7a3Sopenharmony_ci + (2.0f * middle2_sum * middle2_sum) 1668cc1dc7a3Sopenharmony_ci + (right2_sum * right2_sum); 1669cc1dc7a3Sopenharmony_ci 1670cc1dc7a3Sopenharmony_ci vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2; 1671cc1dc7a3Sopenharmony_ci vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2; 1672cc1dc7a3Sopenharmony_ci 1673cc1dc7a3Sopenharmony_ci vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f); 1674cc1dc7a3Sopenharmony_ci vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); 1675cc1dc7a3Sopenharmony_ci vmask4 full_mask = p2_mask & det_mask & notnan_mask; 1676cc1dc7a3Sopenharmony_ci 1677cc1dc7a3Sopenharmony_ci ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask); 1678cc1dc7a3Sopenharmony_ci ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); 1679cc1dc7a3Sopenharmony_ci } 1680cc1dc7a3Sopenharmony_ci 1681cc1dc7a3Sopenharmony_ci // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR 1682cc1dc7a3Sopenharmony_ci if (blk.rgb_lns[0] || blk.alpha_lns[0]) 1683cc1dc7a3Sopenharmony_ci { 1684cc1dc7a3Sopenharmony_ci weight_weight_sum = weight_weight_sum * color_weight; 1685cc1dc7a3Sopenharmony_ci float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight); 1686cc1dc7a3Sopenharmony_ci 1687cc1dc7a3Sopenharmony_ci vfloat4 rgbq_sum = color_vec_x + color_vec_y; 1688cc1dc7a3Sopenharmony_ci rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); 1689cc1dc7a3Sopenharmony_ci 1690cc1dc7a3Sopenharmony_ci rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); 1691cc1dc7a3Sopenharmony_ci 1692cc1dc7a3Sopenharmony_ci // We can get a failure due to the use of a singular (non-invertible) matrix 1693cc1dc7a3Sopenharmony_ci // If it failed, compute rgbo_vectors[] with a different method ... 1694cc1dc7a3Sopenharmony_ci if (astc::isnan(dot_s(rgbo_vector, rgbo_vector))) 1695cc1dc7a3Sopenharmony_ci { 1696cc1dc7a3Sopenharmony_ci vfloat4 v0 = ep.endpt0[0]; 1697cc1dc7a3Sopenharmony_ci vfloat4 v1 = ep.endpt1[0]; 1698cc1dc7a3Sopenharmony_ci 1699cc1dc7a3Sopenharmony_ci float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); 1700cc1dc7a3Sopenharmony_ci avgdif = astc::max(avgdif, 0.0f); 1701cc1dc7a3Sopenharmony_ci 1702cc1dc7a3Sopenharmony_ci vfloat4 avg = (v0 + v1) * 0.5f; 1703cc1dc7a3Sopenharmony_ci vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; 1704cc1dc7a3Sopenharmony_ci 1705cc1dc7a3Sopenharmony_ci rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); 1706cc1dc7a3Sopenharmony_ci } 1707cc1dc7a3Sopenharmony_ci } 1708cc1dc7a3Sopenharmony_ci} 1709cc1dc7a3Sopenharmony_ci 1710cc1dc7a3Sopenharmony_ci#endif 1711