1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0 2cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 3cc1dc7a3Sopenharmony_ci// Copyright 2011-2024 Arm Limited 4cc1dc7a3Sopenharmony_ci// 5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy 7cc1dc7a3Sopenharmony_ci// of the License at: 8cc1dc7a3Sopenharmony_ci// 9cc1dc7a3Sopenharmony_ci// http://www.apache.org/licenses/LICENSE-2.0 10cc1dc7a3Sopenharmony_ci// 11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software 12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations 15cc1dc7a3Sopenharmony_ci// under the License. 16cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 17cc1dc7a3Sopenharmony_ci 18cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY) 19cc1dc7a3Sopenharmony_ci 20cc1dc7a3Sopenharmony_ci/** 21cc1dc7a3Sopenharmony_ci * @brief Functions to compress a symbolic block. 22cc1dc7a3Sopenharmony_ci */ 23cc1dc7a3Sopenharmony_ci 24cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h" 25cc1dc7a3Sopenharmony_ci#include "astcenc_diagnostic_trace.h" 26cc1dc7a3Sopenharmony_ci 27cc1dc7a3Sopenharmony_ci#include <cassert> 28cc1dc7a3Sopenharmony_ci#ifdef ASTC_CUSTOMIZED_ENABLE 29cc1dc7a3Sopenharmony_ciAstcCustomizedSoManager g_astcCustomizedSoManager; 30cc1dc7a3Sopenharmony_ci#endif 31cc1dc7a3Sopenharmony_ci 32cc1dc7a3Sopenharmony_ci/** 33cc1dc7a3Sopenharmony_ci * @brief Merge two planes of endpoints into a single vector. 34cc1dc7a3Sopenharmony_ci * 35cc1dc7a3Sopenharmony_ci * @param ep_plane1 The endpoints for plane 1. 36cc1dc7a3Sopenharmony_ci * @param ep_plane2 The endpoints for plane 2. 37cc1dc7a3Sopenharmony_ci * @param component_plane2 The color component for plane 2. 38cc1dc7a3Sopenharmony_ci * @param[out] result The merged output. 39cc1dc7a3Sopenharmony_ci */ 40cc1dc7a3Sopenharmony_cistatic void merge_endpoints( 41cc1dc7a3Sopenharmony_ci const endpoints& ep_plane1, 42cc1dc7a3Sopenharmony_ci const endpoints& ep_plane2, 43cc1dc7a3Sopenharmony_ci unsigned int component_plane2, 44cc1dc7a3Sopenharmony_ci endpoints& result 45cc1dc7a3Sopenharmony_ci) { 46cc1dc7a3Sopenharmony_ci unsigned int partition_count = ep_plane1.partition_count; 47cc1dc7a3Sopenharmony_ci assert(partition_count == 1); 48cc1dc7a3Sopenharmony_ci 49cc1dc7a3Sopenharmony_ci vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2); 50cc1dc7a3Sopenharmony_ci 51cc1dc7a3Sopenharmony_ci result.partition_count = partition_count; 52cc1dc7a3Sopenharmony_ci result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask); 53cc1dc7a3Sopenharmony_ci result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask); 54cc1dc7a3Sopenharmony_ci} 55cc1dc7a3Sopenharmony_ci 56cc1dc7a3Sopenharmony_ci/** 57cc1dc7a3Sopenharmony_ci * @brief Attempt to improve weights given a chosen configuration. 58cc1dc7a3Sopenharmony_ci * 59cc1dc7a3Sopenharmony_ci * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per 60cc1dc7a3Sopenharmony_ci * partition and per plane) and attempt to improve image quality by moving each weight up by one or 61cc1dc7a3Sopenharmony_ci * down by one quantization step. 62cc1dc7a3Sopenharmony_ci * 63cc1dc7a3Sopenharmony_ci * This is a specialized function which only supports operating on undecimated weight grids, 64cc1dc7a3Sopenharmony_ci * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation 65cc1dc7a3Sopenharmony_ci * is needed less often. 66cc1dc7a3Sopenharmony_ci * 67cc1dc7a3Sopenharmony_ci * @param decode_mode The decode mode (LDR, HDR). 68cc1dc7a3Sopenharmony_ci * @param bsd The block size information. 69cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 70cc1dc7a3Sopenharmony_ci * @param[out] scb The symbolic compressed block output. 71cc1dc7a3Sopenharmony_ci */ 72cc1dc7a3Sopenharmony_ci#if ASTCENC_NEON != 0 73cc1dc7a3Sopenharmony_cistatic bool realign_weights_undecimated( 74cc1dc7a3Sopenharmony_ci astcenc_profile decode_mode, 75cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 76cc1dc7a3Sopenharmony_ci const image_block& blk, 77cc1dc7a3Sopenharmony_ci symbolic_compressed_block& scb 78cc1dc7a3Sopenharmony_ci) { 79cc1dc7a3Sopenharmony_ci // Get the partition descriptor 80cc1dc7a3Sopenharmony_ci unsigned int partition_count = scb.partition_count; 81cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); 82cc1dc7a3Sopenharmony_ci 83cc1dc7a3Sopenharmony_ci // Get the quantization table 84cc1dc7a3Sopenharmony_ci const block_mode& bm = bsd.get_block_mode(scb.block_mode); 85cc1dc7a3Sopenharmony_ci unsigned int weight_quant_level = bm.quant_mode; 86cc1dc7a3Sopenharmony_ci const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; 87cc1dc7a3Sopenharmony_ci 88cc1dc7a3Sopenharmony_ci unsigned int max_plane = bm.is_dual_plane; 89cc1dc7a3Sopenharmony_ci int plane2_component = scb.plane2_component; 90cc1dc7a3Sopenharmony_ci vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); 91cc1dc7a3Sopenharmony_ci 92cc1dc7a3Sopenharmony_ci // Decode the color endpoints 93cc1dc7a3Sopenharmony_ci bool rgb_hdr; 94cc1dc7a3Sopenharmony_ci bool alpha_hdr; 95cc1dc7a3Sopenharmony_ci vint4 endpnt0[BLOCK_MAX_PARTITIONS]; 96cc1dc7a3Sopenharmony_ci vint4 endpnt1[BLOCK_MAX_PARTITIONS]; 97cc1dc7a3Sopenharmony_ci vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; 98cc1dc7a3Sopenharmony_ci vfloat4 offset[BLOCK_MAX_PARTITIONS]; 99cc1dc7a3Sopenharmony_ci 100cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 101cc1dc7a3Sopenharmony_ci 102cc1dc7a3Sopenharmony_ci for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 103cc1dc7a3Sopenharmony_ci { 104cc1dc7a3Sopenharmony_ci unpack_color_endpoints(decode_mode, 105cc1dc7a3Sopenharmony_ci scb.color_formats[pa_idx], 106cc1dc7a3Sopenharmony_ci scb.color_values[pa_idx], 107cc1dc7a3Sopenharmony_ci rgb_hdr, alpha_hdr, 108cc1dc7a3Sopenharmony_ci endpnt0[pa_idx], 109cc1dc7a3Sopenharmony_ci endpnt1[pa_idx]); 110cc1dc7a3Sopenharmony_ci } 111cc1dc7a3Sopenharmony_ci 112cc1dc7a3Sopenharmony_ci uint8_t* dec_weights_uquant = scb.weights; 113cc1dc7a3Sopenharmony_ci bool adjustments = false; 114cc1dc7a3Sopenharmony_ci 115cc1dc7a3Sopenharmony_ci // For each plane and partition ... 116cc1dc7a3Sopenharmony_ci for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) 117cc1dc7a3Sopenharmony_ci { 118cc1dc7a3Sopenharmony_ci for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 119cc1dc7a3Sopenharmony_ci { 120cc1dc7a3Sopenharmony_ci // Compute the endpoint delta for all components in current plane 121cc1dc7a3Sopenharmony_ci vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; 122cc1dc7a3Sopenharmony_ci epd = select(epd, vint4::zero(), plane_mask); 123cc1dc7a3Sopenharmony_ci 124cc1dc7a3Sopenharmony_ci endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); 125cc1dc7a3Sopenharmony_ci offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); 126cc1dc7a3Sopenharmony_ci } 127cc1dc7a3Sopenharmony_ci 128cc1dc7a3Sopenharmony_ci // For each weight compute previous, current, and next errors 129cc1dc7a3Sopenharmony_ci promise(bsd.texel_count > 0); 130cc1dc7a3Sopenharmony_ci 131cc1dc7a3Sopenharmony_ci unsigned int texel = 0; 132cc1dc7a3Sopenharmony_ci for (; texel + ASTCENC_SIMD_WIDTH <= bsd.texel_count; texel += ASTCENC_SIMD_WIDTH) 133cc1dc7a3Sopenharmony_ci { 134cc1dc7a3Sopenharmony_ci int uqw0 = dec_weights_uquant[texel]; 135cc1dc7a3Sopenharmony_ci int uqw1 = dec_weights_uquant[texel + 1]; 136cc1dc7a3Sopenharmony_ci int uqw2 = dec_weights_uquant[texel + 2]; 137cc1dc7a3Sopenharmony_ci int uqw3 = dec_weights_uquant[texel + 3]; 138cc1dc7a3Sopenharmony_ci 139cc1dc7a3Sopenharmony_ci vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3); 140cc1dc7a3Sopenharmony_ci vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1], 141cc1dc7a3Sopenharmony_ci qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]); 142cc1dc7a3Sopenharmony_ci 143cc1dc7a3Sopenharmony_ci vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF); 144cc1dc7a3Sopenharmony_ci vint4 uqw_down_vec = prev_and_next_vec & mask; 145cc1dc7a3Sopenharmony_ci vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask; 146cc1dc7a3Sopenharmony_ci 147cc1dc7a3Sopenharmony_ci vfloat4 weight_base_vec = int_to_float(uqw_vec); 148cc1dc7a3Sopenharmony_ci vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec; 149cc1dc7a3Sopenharmony_ci vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec; 150cc1dc7a3Sopenharmony_ci 151cc1dc7a3Sopenharmony_ci unsigned int partition0 = pi.partition_of_texel[texel]; 152cc1dc7a3Sopenharmony_ci unsigned int partition1 = pi.partition_of_texel[texel + 1]; 153cc1dc7a3Sopenharmony_ci unsigned int partition2 = pi.partition_of_texel[texel + 2]; 154cc1dc7a3Sopenharmony_ci unsigned int partition3 = pi.partition_of_texel[texel + 3]; 155cc1dc7a3Sopenharmony_ci 156cc1dc7a3Sopenharmony_ci vfloat4 color_offset0 = offset[partition0]; 157cc1dc7a3Sopenharmony_ci vfloat4 color_offset1 = offset[partition1]; 158cc1dc7a3Sopenharmony_ci vfloat4 color_offset2 = offset[partition2]; 159cc1dc7a3Sopenharmony_ci vfloat4 color_offset3 = offset[partition3]; 160cc1dc7a3Sopenharmony_ci 161cc1dc7a3Sopenharmony_ci vfloat4 color_base0 = endpnt0f[partition0]; 162cc1dc7a3Sopenharmony_ci vfloat4 color_base1 = endpnt0f[partition1]; 163cc1dc7a3Sopenharmony_ci vfloat4 color_base2 = endpnt0f[partition2]; 164cc1dc7a3Sopenharmony_ci vfloat4 color_base3 = endpnt0f[partition3]; 165cc1dc7a3Sopenharmony_ci 166cc1dc7a3Sopenharmony_ci vfloat4 color0 = color_base0 + color_offset0 * weight_base_vec.lane<0>(); 167cc1dc7a3Sopenharmony_ci vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>(); 168cc1dc7a3Sopenharmony_ci vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>(); 169cc1dc7a3Sopenharmony_ci vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>(); 170cc1dc7a3Sopenharmony_ci 171cc1dc7a3Sopenharmony_ci vfloat4 orig_color0 = blk.texel(texel); 172cc1dc7a3Sopenharmony_ci vfloat4 orig_color1 = blk.texel(texel + 1); 173cc1dc7a3Sopenharmony_ci vfloat4 orig_color2 = blk.texel(texel + 2); 174cc1dc7a3Sopenharmony_ci vfloat4 orig_color3 = blk.texel(texel + 3); 175cc1dc7a3Sopenharmony_ci 176cc1dc7a3Sopenharmony_ci vfloat4 error_weight = blk.channel_weight; 177cc1dc7a3Sopenharmony_ci 178cc1dc7a3Sopenharmony_ci vfloat4 color_diff0 = color0 - orig_color0; 179cc1dc7a3Sopenharmony_ci vfloat4 color_diff1 = color1 - orig_color1; 180cc1dc7a3Sopenharmony_ci vfloat4 color_diff2 = color2 - orig_color2; 181cc1dc7a3Sopenharmony_ci vfloat4 color_diff3 = color3 - orig_color3; 182cc1dc7a3Sopenharmony_ci 183cc1dc7a3Sopenharmony_ci vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>(); 184cc1dc7a3Sopenharmony_ci vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>(); 185cc1dc7a3Sopenharmony_ci vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>(); 186cc1dc7a3Sopenharmony_ci vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>(); 187cc1dc7a3Sopenharmony_ci 188cc1dc7a3Sopenharmony_ci vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>(); 189cc1dc7a3Sopenharmony_ci vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>(); 190cc1dc7a3Sopenharmony_ci vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>(); 191cc1dc7a3Sopenharmony_ci vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>(); 192cc1dc7a3Sopenharmony_ci 193cc1dc7a3Sopenharmony_ci float error_base0 = dot_s(color_diff0 * color_diff0, error_weight); 194cc1dc7a3Sopenharmony_ci float error_base1 = dot_s(color_diff1 * color_diff1, error_weight); 195cc1dc7a3Sopenharmony_ci float error_base2 = dot_s(color_diff2 * color_diff2, error_weight); 196cc1dc7a3Sopenharmony_ci float error_base3 = dot_s(color_diff3 * color_diff3, error_weight); 197cc1dc7a3Sopenharmony_ci 198cc1dc7a3Sopenharmony_ci float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight); 199cc1dc7a3Sopenharmony_ci float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight); 200cc1dc7a3Sopenharmony_ci float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight); 201cc1dc7a3Sopenharmony_ci float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight); 202cc1dc7a3Sopenharmony_ci 203cc1dc7a3Sopenharmony_ci float error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight); 204cc1dc7a3Sopenharmony_ci float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight); 205cc1dc7a3Sopenharmony_ci float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight); 206cc1dc7a3Sopenharmony_ci float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight); 207cc1dc7a3Sopenharmony_ci 208cc1dc7a3Sopenharmony_ci vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3); 209cc1dc7a3Sopenharmony_ci vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3); 210cc1dc7a3Sopenharmony_ci vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3); 211cc1dc7a3Sopenharmony_ci 212cc1dc7a3Sopenharmony_ci vmask4 check_result_up = (error_up_vec < error_base_vec) & 213cc1dc7a3Sopenharmony_ci (error_up_vec < error_down_vec) & (uqw_vec < vint4(64)); 214cc1dc7a3Sopenharmony_ci 215cc1dc7a3Sopenharmony_ci vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero()); 216cc1dc7a3Sopenharmony_ci check_result_down = check_result_down & (~check_result_up); 217cc1dc7a3Sopenharmony_ci 218cc1dc7a3Sopenharmony_ci if (popcount(check_result_up | check_result_down) != 0) 219cc1dc7a3Sopenharmony_ci { 220cc1dc7a3Sopenharmony_ci uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up); 221cc1dc7a3Sopenharmony_ci uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down); 222cc1dc7a3Sopenharmony_ci 223cc1dc7a3Sopenharmony_ci dec_weights_uquant[texel] = uqw_vec.lane<0>(); 224cc1dc7a3Sopenharmony_ci dec_weights_uquant[texel + 1] = uqw_vec.lane<1>(); 225cc1dc7a3Sopenharmony_ci dec_weights_uquant[texel + 2] = uqw_vec.lane<2>(); // channel 2 226cc1dc7a3Sopenharmony_ci dec_weights_uquant[texel + 3] = uqw_vec.lane<3>(); // channel 3 227cc1dc7a3Sopenharmony_ci adjustments = true; 228cc1dc7a3Sopenharmony_ci } 229cc1dc7a3Sopenharmony_ci }; 230cc1dc7a3Sopenharmony_ci 231cc1dc7a3Sopenharmony_ci for (; texel < bsd.texel_count; texel++) 232cc1dc7a3Sopenharmony_ci { 233cc1dc7a3Sopenharmony_ci int uqw = dec_weights_uquant[texel]; 234cc1dc7a3Sopenharmony_ci 235cc1dc7a3Sopenharmony_ci uint32_t prev_and_next = qat.prev_next_values[uqw]; 236cc1dc7a3Sopenharmony_ci int uqw_down = prev_and_next & 0xFF; 237cc1dc7a3Sopenharmony_ci int uqw_up = (prev_and_next >> 8) & 0xFF; 238cc1dc7a3Sopenharmony_ci 239cc1dc7a3Sopenharmony_ci // Interpolate the colors to create the diffs 240cc1dc7a3Sopenharmony_ci float weight_base = static_cast<float>(uqw); 241cc1dc7a3Sopenharmony_ci float weight_down = static_cast<float>(uqw_down - uqw); 242cc1dc7a3Sopenharmony_ci float weight_up = static_cast<float>(uqw_up - uqw); 243cc1dc7a3Sopenharmony_ci 244cc1dc7a3Sopenharmony_ci unsigned int partition = pi.partition_of_texel[texel]; 245cc1dc7a3Sopenharmony_ci vfloat4 color_offset = offset[partition]; 246cc1dc7a3Sopenharmony_ci vfloat4 color_base = endpnt0f[partition]; 247cc1dc7a3Sopenharmony_ci 248cc1dc7a3Sopenharmony_ci vfloat4 color = color_base + color_offset * weight_base; 249cc1dc7a3Sopenharmony_ci vfloat4 orig_color = blk.texel(texel); 250cc1dc7a3Sopenharmony_ci vfloat4 error_weight = blk.channel_weight; 251cc1dc7a3Sopenharmony_ci 252cc1dc7a3Sopenharmony_ci vfloat4 color_diff = color - orig_color; 253cc1dc7a3Sopenharmony_ci vfloat4 color_diff_down = color_diff + color_offset * weight_down; 254cc1dc7a3Sopenharmony_ci vfloat4 color_diff_up = color_diff + color_offset * weight_up; 255cc1dc7a3Sopenharmony_ci 256cc1dc7a3Sopenharmony_ci float error_base = dot_s(color_diff * color_diff, error_weight); 257cc1dc7a3Sopenharmony_ci float error_down = dot_s(color_diff_down * color_diff_down, error_weight); 258cc1dc7a3Sopenharmony_ci float error_up = dot_s(color_diff_up * color_diff_up, error_weight); 259cc1dc7a3Sopenharmony_ci 260cc1dc7a3Sopenharmony_ci // Check if the prev or next error is better, and if so use it 261cc1dc7a3Sopenharmony_ci if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) 262cc1dc7a3Sopenharmony_ci { 263cc1dc7a3Sopenharmony_ci dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up); 264cc1dc7a3Sopenharmony_ci adjustments = true; 265cc1dc7a3Sopenharmony_ci } 266cc1dc7a3Sopenharmony_ci else if ((error_down < error_base) && (uqw > 0)) 267cc1dc7a3Sopenharmony_ci { 268cc1dc7a3Sopenharmony_ci dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down); 269cc1dc7a3Sopenharmony_ci adjustments = true; 270cc1dc7a3Sopenharmony_ci } 271cc1dc7a3Sopenharmony_ci } 272cc1dc7a3Sopenharmony_ci 273cc1dc7a3Sopenharmony_ci // Prepare iteration for plane 2 274cc1dc7a3Sopenharmony_ci dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; 275cc1dc7a3Sopenharmony_ci plane_mask = ~plane_mask; 276cc1dc7a3Sopenharmony_ci } 277cc1dc7a3Sopenharmony_ci return adjustments; 278cc1dc7a3Sopenharmony_ci} 279cc1dc7a3Sopenharmony_ci#else 280cc1dc7a3Sopenharmony_cistatic bool realign_weights_undecimated( 281cc1dc7a3Sopenharmony_ci astcenc_profile decode_mode, 282cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 283cc1dc7a3Sopenharmony_ci const image_block& blk, 284cc1dc7a3Sopenharmony_ci symbolic_compressed_block& scb 285cc1dc7a3Sopenharmony_ci) { 286cc1dc7a3Sopenharmony_ci // Get the partition descriptor 287cc1dc7a3Sopenharmony_ci unsigned int partition_count = scb.partition_count; 288cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); 289cc1dc7a3Sopenharmony_ci 290cc1dc7a3Sopenharmony_ci // Get the quantization table 291cc1dc7a3Sopenharmony_ci const block_mode& bm = bsd.get_block_mode(scb.block_mode); 292cc1dc7a3Sopenharmony_ci unsigned int weight_quant_level = bm.quant_mode; 293cc1dc7a3Sopenharmony_ci const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; 294cc1dc7a3Sopenharmony_ci 295cc1dc7a3Sopenharmony_ci unsigned int max_plane = bm.is_dual_plane; 296cc1dc7a3Sopenharmony_ci int plane2_component = scb.plane2_component; 297cc1dc7a3Sopenharmony_ci vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); 298cc1dc7a3Sopenharmony_ci 299cc1dc7a3Sopenharmony_ci // Decode the color endpoints 300cc1dc7a3Sopenharmony_ci bool rgb_hdr; 301cc1dc7a3Sopenharmony_ci bool alpha_hdr; 302cc1dc7a3Sopenharmony_ci vint4 endpnt0[BLOCK_MAX_PARTITIONS]; 303cc1dc7a3Sopenharmony_ci vint4 endpnt1[BLOCK_MAX_PARTITIONS]; 304cc1dc7a3Sopenharmony_ci vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; 305cc1dc7a3Sopenharmony_ci vfloat4 offset[BLOCK_MAX_PARTITIONS]; 306cc1dc7a3Sopenharmony_ci 307cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 308cc1dc7a3Sopenharmony_ci 309cc1dc7a3Sopenharmony_ci for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 310cc1dc7a3Sopenharmony_ci { 311cc1dc7a3Sopenharmony_ci unpack_color_endpoints(decode_mode, 312cc1dc7a3Sopenharmony_ci scb.color_formats[pa_idx], 313cc1dc7a3Sopenharmony_ci scb.color_values[pa_idx], 314cc1dc7a3Sopenharmony_ci rgb_hdr, alpha_hdr, 315cc1dc7a3Sopenharmony_ci endpnt0[pa_idx], 316cc1dc7a3Sopenharmony_ci endpnt1[pa_idx]); 317cc1dc7a3Sopenharmony_ci } 318cc1dc7a3Sopenharmony_ci 319cc1dc7a3Sopenharmony_ci uint8_t* dec_weights_uquant = scb.weights; 320cc1dc7a3Sopenharmony_ci bool adjustments = false; 321cc1dc7a3Sopenharmony_ci 322cc1dc7a3Sopenharmony_ci // For each plane and partition ... 323cc1dc7a3Sopenharmony_ci for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) 324cc1dc7a3Sopenharmony_ci { 325cc1dc7a3Sopenharmony_ci for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 326cc1dc7a3Sopenharmony_ci { 327cc1dc7a3Sopenharmony_ci // Compute the endpoint delta for all components in current plane 328cc1dc7a3Sopenharmony_ci vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; 329cc1dc7a3Sopenharmony_ci epd = select(epd, vint4::zero(), plane_mask); 330cc1dc7a3Sopenharmony_ci 331cc1dc7a3Sopenharmony_ci endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); 332cc1dc7a3Sopenharmony_ci offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); 333cc1dc7a3Sopenharmony_ci } 334cc1dc7a3Sopenharmony_ci 335cc1dc7a3Sopenharmony_ci // For each weight compute previous, current, and next errors 336cc1dc7a3Sopenharmony_ci promise(bsd.texel_count > 0); 337cc1dc7a3Sopenharmony_ci for (unsigned int texel = 0; texel < bsd.texel_count; texel++) 338cc1dc7a3Sopenharmony_ci { 339cc1dc7a3Sopenharmony_ci int uqw = dec_weights_uquant[texel]; 340cc1dc7a3Sopenharmony_ci 341cc1dc7a3Sopenharmony_ci uint32_t prev_and_next = qat.prev_next_values[uqw]; 342cc1dc7a3Sopenharmony_ci int uqw_down = prev_and_next & 0xFF; 343cc1dc7a3Sopenharmony_ci int uqw_up = (prev_and_next >> 8) & 0xFF; 344cc1dc7a3Sopenharmony_ci 345cc1dc7a3Sopenharmony_ci // Interpolate the colors to create the diffs 346cc1dc7a3Sopenharmony_ci float weight_base = static_cast<float>(uqw); 347cc1dc7a3Sopenharmony_ci float weight_down = static_cast<float>(uqw_down - uqw); 348cc1dc7a3Sopenharmony_ci float weight_up = static_cast<float>(uqw_up - uqw); 349cc1dc7a3Sopenharmony_ci 350cc1dc7a3Sopenharmony_ci unsigned int partition = pi.partition_of_texel[texel]; 351cc1dc7a3Sopenharmony_ci vfloat4 color_offset = offset[partition]; 352cc1dc7a3Sopenharmony_ci vfloat4 color_base = endpnt0f[partition]; 353cc1dc7a3Sopenharmony_ci 354cc1dc7a3Sopenharmony_ci vfloat4 color = color_base + color_offset * weight_base; 355cc1dc7a3Sopenharmony_ci vfloat4 orig_color = blk.texel(texel); 356cc1dc7a3Sopenharmony_ci vfloat4 error_weight = blk.channel_weight; 357cc1dc7a3Sopenharmony_ci 358cc1dc7a3Sopenharmony_ci vfloat4 color_diff = color - orig_color; 359cc1dc7a3Sopenharmony_ci vfloat4 color_diff_down = color_diff + color_offset * weight_down; 360cc1dc7a3Sopenharmony_ci vfloat4 color_diff_up = color_diff + color_offset * weight_up; 361cc1dc7a3Sopenharmony_ci 362cc1dc7a3Sopenharmony_ci float error_base = dot_s(color_diff * color_diff, error_weight); 363cc1dc7a3Sopenharmony_ci float error_down = dot_s(color_diff_down * color_diff_down, error_weight); 364cc1dc7a3Sopenharmony_ci float error_up = dot_s(color_diff_up * color_diff_up, error_weight); 365cc1dc7a3Sopenharmony_ci 366cc1dc7a3Sopenharmony_ci // Check if the prev or next error is better, and if so use it 367cc1dc7a3Sopenharmony_ci if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) 368cc1dc7a3Sopenharmony_ci { 369cc1dc7a3Sopenharmony_ci dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up); 370cc1dc7a3Sopenharmony_ci adjustments = true; 371cc1dc7a3Sopenharmony_ci } 372cc1dc7a3Sopenharmony_ci else if ((error_down < error_base) && (uqw > 0)) 373cc1dc7a3Sopenharmony_ci { 374cc1dc7a3Sopenharmony_ci dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down); 375cc1dc7a3Sopenharmony_ci adjustments = true; 376cc1dc7a3Sopenharmony_ci } 377cc1dc7a3Sopenharmony_ci } 378cc1dc7a3Sopenharmony_ci 379cc1dc7a3Sopenharmony_ci // Prepare iteration for plane 2 380cc1dc7a3Sopenharmony_ci dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; 381cc1dc7a3Sopenharmony_ci plane_mask = ~plane_mask; 382cc1dc7a3Sopenharmony_ci } 383cc1dc7a3Sopenharmony_ci 384cc1dc7a3Sopenharmony_ci return adjustments; 385cc1dc7a3Sopenharmony_ci} 386cc1dc7a3Sopenharmony_ci#endif 387cc1dc7a3Sopenharmony_ci 388cc1dc7a3Sopenharmony_ci/** 389cc1dc7a3Sopenharmony_ci * @brief Attempt to improve weights given a chosen configuration. 390cc1dc7a3Sopenharmony_ci * 391cc1dc7a3Sopenharmony_ci * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per 392cc1dc7a3Sopenharmony_ci * partition and per plane) and attempt to improve image quality by moving each weight up by one or 393cc1dc7a3Sopenharmony_ci * down by one quantization step. 394cc1dc7a3Sopenharmony_ci * 395cc1dc7a3Sopenharmony_ci * @param decode_mode The decode mode (LDR, HDR). 396cc1dc7a3Sopenharmony_ci * @param bsd The block size information. 397cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 398cc1dc7a3Sopenharmony_ci * @param[out] scb The symbolic compressed block output. 399cc1dc7a3Sopenharmony_ci */ 400cc1dc7a3Sopenharmony_cistatic bool realign_weights_decimated( 401cc1dc7a3Sopenharmony_ci astcenc_profile decode_mode, 402cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 403cc1dc7a3Sopenharmony_ci const image_block& blk, 404cc1dc7a3Sopenharmony_ci symbolic_compressed_block& scb 405cc1dc7a3Sopenharmony_ci) { 406cc1dc7a3Sopenharmony_ci // Get the partition descriptor 407cc1dc7a3Sopenharmony_ci unsigned int partition_count = scb.partition_count; 408cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); 409cc1dc7a3Sopenharmony_ci 410cc1dc7a3Sopenharmony_ci // Get the quantization table 411cc1dc7a3Sopenharmony_ci const block_mode& bm = bsd.get_block_mode(scb.block_mode); 412cc1dc7a3Sopenharmony_ci unsigned int weight_quant_level = bm.quant_mode; 413cc1dc7a3Sopenharmony_ci const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; 414cc1dc7a3Sopenharmony_ci 415cc1dc7a3Sopenharmony_ci // Get the decimation table 416cc1dc7a3Sopenharmony_ci const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); 417cc1dc7a3Sopenharmony_ci unsigned int weight_count = di.weight_count; 418cc1dc7a3Sopenharmony_ci assert(weight_count != bsd.texel_count); 419cc1dc7a3Sopenharmony_ci 420cc1dc7a3Sopenharmony_ci unsigned int max_plane = bm.is_dual_plane; 421cc1dc7a3Sopenharmony_ci int plane2_component = scb.plane2_component; 422cc1dc7a3Sopenharmony_ci vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); 423cc1dc7a3Sopenharmony_ci 424cc1dc7a3Sopenharmony_ci // Decode the color endpoints 425cc1dc7a3Sopenharmony_ci bool rgb_hdr; 426cc1dc7a3Sopenharmony_ci bool alpha_hdr; 427cc1dc7a3Sopenharmony_ci vint4 endpnt0[BLOCK_MAX_PARTITIONS]; 428cc1dc7a3Sopenharmony_ci vint4 endpnt1[BLOCK_MAX_PARTITIONS]; 429cc1dc7a3Sopenharmony_ci vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; 430cc1dc7a3Sopenharmony_ci vfloat4 offset[BLOCK_MAX_PARTITIONS]; 431cc1dc7a3Sopenharmony_ci 432cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 433cc1dc7a3Sopenharmony_ci promise(weight_count > 0); 434cc1dc7a3Sopenharmony_ci 435cc1dc7a3Sopenharmony_ci for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 436cc1dc7a3Sopenharmony_ci { 437cc1dc7a3Sopenharmony_ci unpack_color_endpoints(decode_mode, 438cc1dc7a3Sopenharmony_ci scb.color_formats[pa_idx], 439cc1dc7a3Sopenharmony_ci scb.color_values[pa_idx], 440cc1dc7a3Sopenharmony_ci rgb_hdr, alpha_hdr, 441cc1dc7a3Sopenharmony_ci endpnt0[pa_idx], 442cc1dc7a3Sopenharmony_ci endpnt1[pa_idx]); 443cc1dc7a3Sopenharmony_ci } 444cc1dc7a3Sopenharmony_ci 445cc1dc7a3Sopenharmony_ci uint8_t* dec_weights_uquant = scb.weights; 446cc1dc7a3Sopenharmony_ci bool adjustments = false; 447cc1dc7a3Sopenharmony_ci 448cc1dc7a3Sopenharmony_ci // For each plane and partition ... 449cc1dc7a3Sopenharmony_ci for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) 450cc1dc7a3Sopenharmony_ci { 451cc1dc7a3Sopenharmony_ci for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) 452cc1dc7a3Sopenharmony_ci { 453cc1dc7a3Sopenharmony_ci // Compute the endpoint delta for all components in current plane 454cc1dc7a3Sopenharmony_ci vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; 455cc1dc7a3Sopenharmony_ci epd = select(epd, vint4::zero(), plane_mask); 456cc1dc7a3Sopenharmony_ci 457cc1dc7a3Sopenharmony_ci endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); 458cc1dc7a3Sopenharmony_ci offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); 459cc1dc7a3Sopenharmony_ci } 460cc1dc7a3Sopenharmony_ci 461cc1dc7a3Sopenharmony_ci // Create an unquantized weight grid for this decimation level 462cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS]; 463cc1dc7a3Sopenharmony_ci for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH) 464cc1dc7a3Sopenharmony_ci { 465cc1dc7a3Sopenharmony_ci vint unquant_value(dec_weights_uquant + we_idx); 466cc1dc7a3Sopenharmony_ci vfloat unquant_valuef = int_to_float(unquant_value); 467cc1dc7a3Sopenharmony_ci storea(unquant_valuef, uq_weightsf + we_idx); 468cc1dc7a3Sopenharmony_ci } 469cc1dc7a3Sopenharmony_ci 470cc1dc7a3Sopenharmony_ci // For each weight compute previous, current, and next errors 471cc1dc7a3Sopenharmony_ci for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++) 472cc1dc7a3Sopenharmony_ci { 473cc1dc7a3Sopenharmony_ci int uqw = dec_weights_uquant[we_idx]; 474cc1dc7a3Sopenharmony_ci uint32_t prev_and_next = qat.prev_next_values[uqw]; 475cc1dc7a3Sopenharmony_ci 476cc1dc7a3Sopenharmony_ci float uqw_base = uq_weightsf[we_idx]; 477cc1dc7a3Sopenharmony_ci float uqw_down = static_cast<float>(prev_and_next & 0xFF); 478cc1dc7a3Sopenharmony_ci float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF); 479cc1dc7a3Sopenharmony_ci 480cc1dc7a3Sopenharmony_ci float uqw_diff_down = uqw_down - uqw_base; 481cc1dc7a3Sopenharmony_ci float uqw_diff_up = uqw_up - uqw_base; 482cc1dc7a3Sopenharmony_ci 483cc1dc7a3Sopenharmony_ci vfloat4 error_basev = vfloat4::zero(); 484cc1dc7a3Sopenharmony_ci vfloat4 error_downv = vfloat4::zero(); 485cc1dc7a3Sopenharmony_ci vfloat4 error_upv = vfloat4::zero(); 486cc1dc7a3Sopenharmony_ci 487cc1dc7a3Sopenharmony_ci // Interpolate the colors to create the diffs 488cc1dc7a3Sopenharmony_ci unsigned int texels_to_evaluate = di.weight_texel_count[we_idx]; 489cc1dc7a3Sopenharmony_ci promise(texels_to_evaluate > 0); 490cc1dc7a3Sopenharmony_ci for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++) 491cc1dc7a3Sopenharmony_ci { 492cc1dc7a3Sopenharmony_ci unsigned int texel = di.weight_texels_tr[te_idx][we_idx]; 493cc1dc7a3Sopenharmony_ci 494cc1dc7a3Sopenharmony_ci float tw_base = di.texel_contrib_for_weight[te_idx][we_idx]; 495cc1dc7a3Sopenharmony_ci 496cc1dc7a3Sopenharmony_ci float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel] 497cc1dc7a3Sopenharmony_ci + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel]) 498cc1dc7a3Sopenharmony_ci + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel] 499cc1dc7a3Sopenharmony_ci + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]); 500cc1dc7a3Sopenharmony_ci 501cc1dc7a3Sopenharmony_ci // Ideally this is integer rounded, but IQ gain it isn't worth the overhead 502cc1dc7a3Sopenharmony_ci // float weight = astc::flt_rd(weight_base + 0.5f); 503cc1dc7a3Sopenharmony_ci // float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight; 504cc1dc7a3Sopenharmony_ci // float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight; 505cc1dc7a3Sopenharmony_ci float weight_down = weight_base + uqw_diff_down * tw_base - weight_base; 506cc1dc7a3Sopenharmony_ci float weight_up = weight_base + uqw_diff_up * tw_base - weight_base; 507cc1dc7a3Sopenharmony_ci 508cc1dc7a3Sopenharmony_ci unsigned int partition = pi.partition_of_texel[texel]; 509cc1dc7a3Sopenharmony_ci vfloat4 color_offset = offset[partition]; 510cc1dc7a3Sopenharmony_ci vfloat4 color_base = endpnt0f[partition]; 511cc1dc7a3Sopenharmony_ci 512cc1dc7a3Sopenharmony_ci vfloat4 color = color_base + color_offset * weight_base; 513cc1dc7a3Sopenharmony_ci vfloat4 orig_color = blk.texel(texel); 514cc1dc7a3Sopenharmony_ci 515cc1dc7a3Sopenharmony_ci vfloat4 color_diff = color - orig_color; 516cc1dc7a3Sopenharmony_ci vfloat4 color_down_diff = color_diff + color_offset * weight_down; 517cc1dc7a3Sopenharmony_ci vfloat4 color_up_diff = color_diff + color_offset * weight_up; 518cc1dc7a3Sopenharmony_ci 519cc1dc7a3Sopenharmony_ci error_basev += color_diff * color_diff; 520cc1dc7a3Sopenharmony_ci error_downv += color_down_diff * color_down_diff; 521cc1dc7a3Sopenharmony_ci error_upv += color_up_diff * color_up_diff; 522cc1dc7a3Sopenharmony_ci } 523cc1dc7a3Sopenharmony_ci 524cc1dc7a3Sopenharmony_ci vfloat4 error_weight = blk.channel_weight; 525cc1dc7a3Sopenharmony_ci float error_base = hadd_s(error_basev * error_weight); 526cc1dc7a3Sopenharmony_ci float error_down = hadd_s(error_downv * error_weight); 527cc1dc7a3Sopenharmony_ci float error_up = hadd_s(error_upv * error_weight); 528cc1dc7a3Sopenharmony_ci 529cc1dc7a3Sopenharmony_ci // Check if the prev or next error is better, and if so use it 530cc1dc7a3Sopenharmony_ci if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) 531cc1dc7a3Sopenharmony_ci { 532cc1dc7a3Sopenharmony_ci uq_weightsf[we_idx] = uqw_up; 533cc1dc7a3Sopenharmony_ci dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up); 534cc1dc7a3Sopenharmony_ci adjustments = true; 535cc1dc7a3Sopenharmony_ci } 536cc1dc7a3Sopenharmony_ci else if ((error_down < error_base) && (uqw > 0)) 537cc1dc7a3Sopenharmony_ci { 538cc1dc7a3Sopenharmony_ci uq_weightsf[we_idx] = uqw_down; 539cc1dc7a3Sopenharmony_ci dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down); 540cc1dc7a3Sopenharmony_ci adjustments = true; 541cc1dc7a3Sopenharmony_ci } 542cc1dc7a3Sopenharmony_ci } 543cc1dc7a3Sopenharmony_ci 544cc1dc7a3Sopenharmony_ci // Prepare iteration for plane 2 545cc1dc7a3Sopenharmony_ci dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; 546cc1dc7a3Sopenharmony_ci plane_mask = ~plane_mask; 547cc1dc7a3Sopenharmony_ci } 548cc1dc7a3Sopenharmony_ci 549cc1dc7a3Sopenharmony_ci return adjustments; 550cc1dc7a3Sopenharmony_ci} 551cc1dc7a3Sopenharmony_ci 552cc1dc7a3Sopenharmony_ci/** 553cc1dc7a3Sopenharmony_ci * @brief Compress a block using a chosen partitioning and 1 plane of weights. 554cc1dc7a3Sopenharmony_ci * 555cc1dc7a3Sopenharmony_ci * @param config The compressor configuration. 556cc1dc7a3Sopenharmony_ci * @param bsd The block size information. 557cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 558cc1dc7a3Sopenharmony_ci * @param only_always True if we only use "always" percentile block modes. 559cc1dc7a3Sopenharmony_ci * @param tune_errorval_threshold The error value threshold. 560cc1dc7a3Sopenharmony_ci * @param partition_count The partition count. 561cc1dc7a3Sopenharmony_ci * @param partition_index The partition index if @c partition_count is 2-4. 562cc1dc7a3Sopenharmony_ci * @param[out] scb The symbolic compressed block output. 563cc1dc7a3Sopenharmony_ci * @param[out] tmpbuf The quantized weights for plane 1. 564cc1dc7a3Sopenharmony_ci */ 565cc1dc7a3Sopenharmony_cistatic float compress_symbolic_block_for_partition_1plane( 566cc1dc7a3Sopenharmony_ci QualityProfile privateProfile, 567cc1dc7a3Sopenharmony_ci const astcenc_config& config, 568cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 569cc1dc7a3Sopenharmony_ci const image_block& blk, 570cc1dc7a3Sopenharmony_ci bool only_always, 571cc1dc7a3Sopenharmony_ci float tune_errorval_threshold, 572cc1dc7a3Sopenharmony_ci unsigned int partition_count, 573cc1dc7a3Sopenharmony_ci unsigned int partition_index, 574cc1dc7a3Sopenharmony_ci symbolic_compressed_block& scb, 575cc1dc7a3Sopenharmony_ci compression_working_buffers& tmpbuf, 576cc1dc7a3Sopenharmony_ci int quant_limit 577cc1dc7a3Sopenharmony_ci) { 578cc1dc7a3Sopenharmony_ci promise(partition_count > 0); 579cc1dc7a3Sopenharmony_ci promise(config.tune_candidate_limit > 0); 580cc1dc7a3Sopenharmony_ci promise(config.tune_refinement_limit > 0); 581cc1dc7a3Sopenharmony_ci 582cc1dc7a3Sopenharmony_ci int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); 583cc1dc7a3Sopenharmony_ci 584cc1dc7a3Sopenharmony_ci auto compute_difference = &compute_symbolic_block_difference_1plane; 585cc1dc7a3Sopenharmony_ci if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM)) 586cc1dc7a3Sopenharmony_ci { 587cc1dc7a3Sopenharmony_ci compute_difference = &compute_symbolic_block_difference_1plane_1partition; 588cc1dc7a3Sopenharmony_ci } 589cc1dc7a3Sopenharmony_ci 590cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_partition_info(partition_count, partition_index); 591cc1dc7a3Sopenharmony_ci 592cc1dc7a3Sopenharmony_ci // Compute ideal weights and endpoint colors, with no quantization or decimation 593cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei = tmpbuf.ei1; 594cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_1plane(blk, pi, ei); 595cc1dc7a3Sopenharmony_ci 596cc1dc7a3Sopenharmony_ci // Compute ideal weights and endpoint colors for every decimation 597cc1dc7a3Sopenharmony_ci float* dec_weights_ideal = tmpbuf.dec_weights_ideal; 598cc1dc7a3Sopenharmony_ci uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant; 599cc1dc7a3Sopenharmony_ci 600cc1dc7a3Sopenharmony_ci // For each decimation mode, compute an ideal set of weights with no quantization 601cc1dc7a3Sopenharmony_ci unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always 602cc1dc7a3Sopenharmony_ci : bsd.decimation_mode_count_selected; 603cc1dc7a3Sopenharmony_ci promise(max_decimation_modes > 0); 604cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < max_decimation_modes; i++) 605cc1dc7a3Sopenharmony_ci { 606cc1dc7a3Sopenharmony_ci const auto& dm = bsd.get_decimation_mode(i); 607cc1dc7a3Sopenharmony_ci if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant))) 608cc1dc7a3Sopenharmony_ci { 609cc1dc7a3Sopenharmony_ci continue; 610cc1dc7a3Sopenharmony_ci } 611cc1dc7a3Sopenharmony_ci 612cc1dc7a3Sopenharmony_ci const auto& di = bsd.get_decimation_info(i); 613cc1dc7a3Sopenharmony_ci 614cc1dc7a3Sopenharmony_ci compute_ideal_weights_for_decimation( 615cc1dc7a3Sopenharmony_ci ei, 616cc1dc7a3Sopenharmony_ci di, 617cc1dc7a3Sopenharmony_ci dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); 618cc1dc7a3Sopenharmony_ci } 619cc1dc7a3Sopenharmony_ci 620cc1dc7a3Sopenharmony_ci // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal 621cc1dc7a3Sopenharmony_ci // weight pair, compute the smallest weight that will result in a color value greater than 1 622cc1dc7a3Sopenharmony_ci vfloat4 min_ep(10.0f); 623cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 624cc1dc7a3Sopenharmony_ci { 625cc1dc7a3Sopenharmony_ci vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]); 626cc1dc7a3Sopenharmony_ci 627cc1dc7a3Sopenharmony_ci vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep); 628cc1dc7a3Sopenharmony_ci min_ep = select(min_ep, ep, use_ep); 629cc1dc7a3Sopenharmony_ci } 630cc1dc7a3Sopenharmony_ci 631cc1dc7a3Sopenharmony_ci float min_wt_cutoff = hmin_s(min_ep); 632cc1dc7a3Sopenharmony_ci 633cc1dc7a3Sopenharmony_ci // For each mode, use the angular method to compute a shift 634cc1dc7a3Sopenharmony_ci compute_angular_endpoints_1plane( 635cc1dc7a3Sopenharmony_ci privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); 636cc1dc7a3Sopenharmony_ci 637cc1dc7a3Sopenharmony_ci float* weight_low_value = tmpbuf.weight_low_value1; 638cc1dc7a3Sopenharmony_ci float* weight_high_value = tmpbuf.weight_high_value1; 639cc1dc7a3Sopenharmony_ci int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts; 640cc1dc7a3Sopenharmony_ci float* qwt_errors = tmpbuf.qwt_errors; 641cc1dc7a3Sopenharmony_ci 642cc1dc7a3Sopenharmony_ci // For each mode (which specifies a decimation and a quantization): 643cc1dc7a3Sopenharmony_ci // * Compute number of bits needed for the quantized weights 644cc1dc7a3Sopenharmony_ci // * Generate an optimized set of quantized weights 645cc1dc7a3Sopenharmony_ci // * Compute quantization errors for the mode 646cc1dc7a3Sopenharmony_ci 647cc1dc7a3Sopenharmony_ci 648cc1dc7a3Sopenharmony_ci static const int8_t free_bits_for_partition_count[4] { 649cc1dc7a3Sopenharmony_ci 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS 650cc1dc7a3Sopenharmony_ci }; 651cc1dc7a3Sopenharmony_ci 652cc1dc7a3Sopenharmony_ci unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always 653cc1dc7a3Sopenharmony_ci : bsd.block_mode_count_1plane_selected; 654cc1dc7a3Sopenharmony_ci promise(max_block_modes > 0); 655cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < max_block_modes; i++) 656cc1dc7a3Sopenharmony_ci { 657cc1dc7a3Sopenharmony_ci const block_mode& bm = bsd.block_modes[i]; 658cc1dc7a3Sopenharmony_ci 659cc1dc7a3Sopenharmony_ci if (bm.quant_mode > max_weight_quant) 660cc1dc7a3Sopenharmony_ci { 661cc1dc7a3Sopenharmony_ci qwt_errors[i] = 1e38f; 662cc1dc7a3Sopenharmony_ci continue; 663cc1dc7a3Sopenharmony_ci } 664cc1dc7a3Sopenharmony_ci 665cc1dc7a3Sopenharmony_ci assert(!bm.is_dual_plane); 666cc1dc7a3Sopenharmony_ci int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits; 667cc1dc7a3Sopenharmony_ci if (bitcount <= 0) 668cc1dc7a3Sopenharmony_ci { 669cc1dc7a3Sopenharmony_ci qwt_errors[i] = 1e38f; 670cc1dc7a3Sopenharmony_ci continue; 671cc1dc7a3Sopenharmony_ci } 672cc1dc7a3Sopenharmony_ci 673cc1dc7a3Sopenharmony_ci if (weight_high_value[i] > 1.02f * min_wt_cutoff) 674cc1dc7a3Sopenharmony_ci { 675cc1dc7a3Sopenharmony_ci weight_high_value[i] = 1.0f; 676cc1dc7a3Sopenharmony_ci } 677cc1dc7a3Sopenharmony_ci 678cc1dc7a3Sopenharmony_ci int decimation_mode = bm.decimation_mode; 679cc1dc7a3Sopenharmony_ci const auto& di = bsd.get_decimation_info(decimation_mode); 680cc1dc7a3Sopenharmony_ci 681cc1dc7a3Sopenharmony_ci qwt_bitcounts[i] = static_cast<int8_t>(bitcount); 682cc1dc7a3Sopenharmony_ci 683cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; 684cc1dc7a3Sopenharmony_ci 685cc1dc7a3Sopenharmony_ci // Generate the optimized set of weights for the weight mode 686cc1dc7a3Sopenharmony_ci compute_quantized_weights_for_decimation( 687cc1dc7a3Sopenharmony_ci di, 688cc1dc7a3Sopenharmony_ci weight_low_value[i], weight_high_value[i], 689cc1dc7a3Sopenharmony_ci dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode, 690cc1dc7a3Sopenharmony_ci dec_weights_uquantf, 691cc1dc7a3Sopenharmony_ci dec_weights_uquant + BLOCK_MAX_WEIGHTS * i, 692cc1dc7a3Sopenharmony_ci bm.get_weight_quant_mode()); 693cc1dc7a3Sopenharmony_ci 694cc1dc7a3Sopenharmony_ci // Compute weight quantization errors for the block mode 695cc1dc7a3Sopenharmony_ci qwt_errors[i] = compute_error_of_weight_set_1plane( 696cc1dc7a3Sopenharmony_ci ei, 697cc1dc7a3Sopenharmony_ci di, 698cc1dc7a3Sopenharmony_ci dec_weights_uquantf); 699cc1dc7a3Sopenharmony_ci } 700cc1dc7a3Sopenharmony_ci 701cc1dc7a3Sopenharmony_ci // Decide the optimal combination of color endpoint encodings and weight encodings 702cc1dc7a3Sopenharmony_ci uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS]; 703cc1dc7a3Sopenharmony_ci int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES]; 704cc1dc7a3Sopenharmony_ci 705cc1dc7a3Sopenharmony_ci quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES]; 706cc1dc7a3Sopenharmony_ci quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; 707cc1dc7a3Sopenharmony_ci 708cc1dc7a3Sopenharmony_ci unsigned int candidate_count = compute_ideal_endpoint_formats( 709cc1dc7a3Sopenharmony_ci privateProfile, 710cc1dc7a3Sopenharmony_ci pi, blk, ei.ep, qwt_bitcounts, qwt_errors, 711cc1dc7a3Sopenharmony_ci config.tune_candidate_limit, 0, max_block_modes, 712cc1dc7a3Sopenharmony_ci partition_format_specifiers, block_mode_index, 713cc1dc7a3Sopenharmony_ci color_quant_level, color_quant_level_mod, tmpbuf); 714cc1dc7a3Sopenharmony_ci 715cc1dc7a3Sopenharmony_ci // Iterate over the N believed-to-be-best modes to find out which one is actually best 716cc1dc7a3Sopenharmony_ci float best_errorval_in_mode = ERROR_CALC_DEFAULT; 717cc1dc7a3Sopenharmony_ci float best_errorval_in_scb = scb.errorval; 718cc1dc7a3Sopenharmony_ci 719cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < candidate_count; i++) 720cc1dc7a3Sopenharmony_ci { 721cc1dc7a3Sopenharmony_ci TRACE_NODE(node0, "candidate"); 722cc1dc7a3Sopenharmony_ci 723cc1dc7a3Sopenharmony_ci const int bm_packed_index = block_mode_index[i]; 724cc1dc7a3Sopenharmony_ci assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected)); 725cc1dc7a3Sopenharmony_ci const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; 726cc1dc7a3Sopenharmony_ci 727cc1dc7a3Sopenharmony_ci int decimation_mode = qw_bm.decimation_mode; 728cc1dc7a3Sopenharmony_ci const auto& di = bsd.get_decimation_info(decimation_mode); 729cc1dc7a3Sopenharmony_ci promise(di.weight_count > 0); 730cc1dc7a3Sopenharmony_ci 731cc1dc7a3Sopenharmony_ci trace_add_data("weight_x", di.weight_x); 732cc1dc7a3Sopenharmony_ci trace_add_data("weight_y", di.weight_y); 733cc1dc7a3Sopenharmony_ci trace_add_data("weight_z", di.weight_z); 734cc1dc7a3Sopenharmony_ci trace_add_data("weight_quant", qw_bm.quant_mode); 735cc1dc7a3Sopenharmony_ci 736cc1dc7a3Sopenharmony_ci // Recompute the ideal color endpoints before storing them 737cc1dc7a3Sopenharmony_ci vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS]; 738cc1dc7a3Sopenharmony_ci vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS]; 739cc1dc7a3Sopenharmony_ci 740cc1dc7a3Sopenharmony_ci symbolic_compressed_block workscb; 741cc1dc7a3Sopenharmony_ci endpoints workep = ei.ep; 742cc1dc7a3Sopenharmony_ci 743cc1dc7a3Sopenharmony_ci uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index; 744cc1dc7a3Sopenharmony_ci 745cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < di.weight_count; j++) 746cc1dc7a3Sopenharmony_ci { 747cc1dc7a3Sopenharmony_ci workscb.weights[j] = u8_weight_src[j]; 748cc1dc7a3Sopenharmony_ci } 749cc1dc7a3Sopenharmony_ci 750cc1dc7a3Sopenharmony_ci for (unsigned int l = 0; l < config.tune_refinement_limit; l++) 751cc1dc7a3Sopenharmony_ci { 752cc1dc7a3Sopenharmony_ci recompute_ideal_colors_1plane( 753cc1dc7a3Sopenharmony_ci blk, pi, di, workscb.weights, 754cc1dc7a3Sopenharmony_ci workep, rgbs_colors, rgbo_colors); 755cc1dc7a3Sopenharmony_ci 756cc1dc7a3Sopenharmony_ci // Quantize the chosen color, tracking if worth trying the mod value 757cc1dc7a3Sopenharmony_ci bool all_same = color_quant_level[i] != color_quant_level_mod[i]; 758cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_count; j++) 759cc1dc7a3Sopenharmony_ci { 760cc1dc7a3Sopenharmony_ci workscb.color_formats[j] = pack_color_endpoints( 761cc1dc7a3Sopenharmony_ci privateProfile, 762cc1dc7a3Sopenharmony_ci workep.endpt0[j], 763cc1dc7a3Sopenharmony_ci workep.endpt1[j], 764cc1dc7a3Sopenharmony_ci rgbs_colors[j], 765cc1dc7a3Sopenharmony_ci rgbo_colors[j], 766cc1dc7a3Sopenharmony_ci partition_format_specifiers[i][j], 767cc1dc7a3Sopenharmony_ci workscb.color_values[j], 768cc1dc7a3Sopenharmony_ci color_quant_level[i]); 769cc1dc7a3Sopenharmony_ci 770cc1dc7a3Sopenharmony_ci all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0]; 771cc1dc7a3Sopenharmony_ci } 772cc1dc7a3Sopenharmony_ci 773cc1dc7a3Sopenharmony_ci // If all the color endpoint modes are the same, we get a few more bits to store colors; 774cc1dc7a3Sopenharmony_ci // let's see if we can take advantage of this: requantize all the colors and see if the 775cc1dc7a3Sopenharmony_ci // endpoint modes remain the same. 776cc1dc7a3Sopenharmony_ci workscb.color_formats_matched = 0; 777cc1dc7a3Sopenharmony_ci if (partition_count >= 2 && all_same) 778cc1dc7a3Sopenharmony_ci { 779cc1dc7a3Sopenharmony_ci uint8_t colorvals[BLOCK_MAX_PARTITIONS][8]; 780cc1dc7a3Sopenharmony_ci uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 }; 781cc1dc7a3Sopenharmony_ci bool all_same_mod = true; 782cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < partition_count; j++) 783cc1dc7a3Sopenharmony_ci { 784cc1dc7a3Sopenharmony_ci color_formats_mod[j] = pack_color_endpoints( 785cc1dc7a3Sopenharmony_ci privateProfile, 786cc1dc7a3Sopenharmony_ci workep.endpt0[j], 787cc1dc7a3Sopenharmony_ci workep.endpt1[j], 788cc1dc7a3Sopenharmony_ci rgbs_colors[j], 789cc1dc7a3Sopenharmony_ci rgbo_colors[j], 790cc1dc7a3Sopenharmony_ci partition_format_specifiers[i][j], 791cc1dc7a3Sopenharmony_ci colorvals[j], 792cc1dc7a3Sopenharmony_ci color_quant_level_mod[i]); 793cc1dc7a3Sopenharmony_ci 794cc1dc7a3Sopenharmony_ci // Early out as soon as it's no longer possible to use mod 795cc1dc7a3Sopenharmony_ci if (color_formats_mod[j] != color_formats_mod[0]) 796cc1dc7a3Sopenharmony_ci { 797cc1dc7a3Sopenharmony_ci all_same_mod = false; 798cc1dc7a3Sopenharmony_ci break; 799cc1dc7a3Sopenharmony_ci } 800cc1dc7a3Sopenharmony_ci } 801cc1dc7a3Sopenharmony_ci 802cc1dc7a3Sopenharmony_ci if (all_same_mod) 803cc1dc7a3Sopenharmony_ci { 804cc1dc7a3Sopenharmony_ci workscb.color_formats_matched = 1; 805cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++) 806cc1dc7a3Sopenharmony_ci { 807cc1dc7a3Sopenharmony_ci for (unsigned int k = 0; k < 8; k++) 808cc1dc7a3Sopenharmony_ci { 809cc1dc7a3Sopenharmony_ci workscb.color_values[j][k] = colorvals[j][k]; 810cc1dc7a3Sopenharmony_ci } 811cc1dc7a3Sopenharmony_ci 812cc1dc7a3Sopenharmony_ci workscb.color_formats[j] = color_formats_mod[j]; 813cc1dc7a3Sopenharmony_ci } 814cc1dc7a3Sopenharmony_ci } 815cc1dc7a3Sopenharmony_ci } 816cc1dc7a3Sopenharmony_ci 817cc1dc7a3Sopenharmony_ci // Store header fields 818cc1dc7a3Sopenharmony_ci workscb.partition_count = static_cast<uint8_t>(partition_count); 819cc1dc7a3Sopenharmony_ci workscb.partition_index = static_cast<uint16_t>(partition_index); 820cc1dc7a3Sopenharmony_ci workscb.plane2_component = -1; 821cc1dc7a3Sopenharmony_ci workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i]; 822cc1dc7a3Sopenharmony_ci workscb.block_mode = qw_bm.mode_index; 823cc1dc7a3Sopenharmony_ci workscb.block_type = SYM_BTYPE_NONCONST; 824cc1dc7a3Sopenharmony_ci if (privateProfile == HIGH_SPEED_PROFILE) 825cc1dc7a3Sopenharmony_ci { 826cc1dc7a3Sopenharmony_ci workscb.errorval = 0; 827cc1dc7a3Sopenharmony_ci scb = workscb; 828cc1dc7a3Sopenharmony_ci break; 829cc1dc7a3Sopenharmony_ci } 830cc1dc7a3Sopenharmony_ci // Pre-realign test 831cc1dc7a3Sopenharmony_ci if (l == 0) 832cc1dc7a3Sopenharmony_ci { 833cc1dc7a3Sopenharmony_ci float errorval = compute_difference(config, bsd, workscb, blk); 834cc1dc7a3Sopenharmony_ci if (errorval == -ERROR_CALC_DEFAULT) 835cc1dc7a3Sopenharmony_ci { 836cc1dc7a3Sopenharmony_ci errorval = -errorval; 837cc1dc7a3Sopenharmony_ci workscb.block_type = SYM_BTYPE_ERROR; 838cc1dc7a3Sopenharmony_ci } 839cc1dc7a3Sopenharmony_ci 840cc1dc7a3Sopenharmony_ci trace_add_data("error_prerealign", errorval); 841cc1dc7a3Sopenharmony_ci best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode); 842cc1dc7a3Sopenharmony_ci 843cc1dc7a3Sopenharmony_ci // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first 844cc1dc7a3Sopenharmony_ci // iteration can help more so we give it a extra 8% leeway. Use this knowledge to 845cc1dc7a3Sopenharmony_ci // drive a heuristic to skip blocks that are unlikely to catch up with the best 846cc1dc7a3Sopenharmony_ci // block we have already. 847cc1dc7a3Sopenharmony_ci unsigned int iters_remaining = config.tune_refinement_limit - l; 848cc1dc7a3Sopenharmony_ci float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f; 849cc1dc7a3Sopenharmony_ci if (errorval > (threshold * best_errorval_in_scb)) 850cc1dc7a3Sopenharmony_ci { 851cc1dc7a3Sopenharmony_ci break; 852cc1dc7a3Sopenharmony_ci } 853cc1dc7a3Sopenharmony_ci 854cc1dc7a3Sopenharmony_ci if (errorval < best_errorval_in_scb) 855cc1dc7a3Sopenharmony_ci { 856cc1dc7a3Sopenharmony_ci best_errorval_in_scb = errorval; 857cc1dc7a3Sopenharmony_ci workscb.errorval = errorval; 858cc1dc7a3Sopenharmony_ci scb = workscb; 859cc1dc7a3Sopenharmony_ci 860cc1dc7a3Sopenharmony_ci if (errorval < tune_errorval_threshold) 861cc1dc7a3Sopenharmony_ci { 862cc1dc7a3Sopenharmony_ci // Skip remaining candidates - this is "good enough" 863cc1dc7a3Sopenharmony_ci i = candidate_count; 864cc1dc7a3Sopenharmony_ci break; 865cc1dc7a3Sopenharmony_ci } 866cc1dc7a3Sopenharmony_ci } 867cc1dc7a3Sopenharmony_ci } 868cc1dc7a3Sopenharmony_ci 869cc1dc7a3Sopenharmony_ci bool adjustments; 870cc1dc7a3Sopenharmony_ci if (di.weight_count != bsd.texel_count) 871cc1dc7a3Sopenharmony_ci { 872cc1dc7a3Sopenharmony_ci adjustments = realign_weights_decimated( 873cc1dc7a3Sopenharmony_ci config.profile, bsd, blk, workscb); 874cc1dc7a3Sopenharmony_ci } 875cc1dc7a3Sopenharmony_ci else 876cc1dc7a3Sopenharmony_ci { 877cc1dc7a3Sopenharmony_ci adjustments = realign_weights_undecimated( 878cc1dc7a3Sopenharmony_ci config.profile, bsd, blk, workscb); 879cc1dc7a3Sopenharmony_ci } 880cc1dc7a3Sopenharmony_ci 881cc1dc7a3Sopenharmony_ci // Post-realign test 882cc1dc7a3Sopenharmony_ci float errorval = compute_difference(config, bsd, workscb, blk); 883cc1dc7a3Sopenharmony_ci if (errorval == -ERROR_CALC_DEFAULT) 884cc1dc7a3Sopenharmony_ci { 885cc1dc7a3Sopenharmony_ci errorval = -errorval; 886cc1dc7a3Sopenharmony_ci workscb.block_type = SYM_BTYPE_ERROR; 887cc1dc7a3Sopenharmony_ci } 888cc1dc7a3Sopenharmony_ci 889cc1dc7a3Sopenharmony_ci trace_add_data("error_postrealign", errorval); 890cc1dc7a3Sopenharmony_ci best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode); 891cc1dc7a3Sopenharmony_ci 892cc1dc7a3Sopenharmony_ci // Average refinement improvement is 3.5% per iteration, so skip blocks that are 893cc1dc7a3Sopenharmony_ci // unlikely to catch up with the best block we have already. Assume a 4.5% per step to 894cc1dc7a3Sopenharmony_ci // give benefit of the doubt ... 895cc1dc7a3Sopenharmony_ci unsigned int iters_remaining = config.tune_refinement_limit - 1 - l; 896cc1dc7a3Sopenharmony_ci float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f; 897cc1dc7a3Sopenharmony_ci if (errorval > (threshold * best_errorval_in_scb)) 898cc1dc7a3Sopenharmony_ci { 899cc1dc7a3Sopenharmony_ci break; 900cc1dc7a3Sopenharmony_ci } 901cc1dc7a3Sopenharmony_ci 902cc1dc7a3Sopenharmony_ci if (errorval < best_errorval_in_scb) 903cc1dc7a3Sopenharmony_ci { 904cc1dc7a3Sopenharmony_ci best_errorval_in_scb = errorval; 905cc1dc7a3Sopenharmony_ci workscb.errorval = errorval; 906cc1dc7a3Sopenharmony_ci scb = workscb; 907cc1dc7a3Sopenharmony_ci 908cc1dc7a3Sopenharmony_ci if (errorval < tune_errorval_threshold) 909cc1dc7a3Sopenharmony_ci { 910cc1dc7a3Sopenharmony_ci // Skip remaining candidates - this is "good enough" 911cc1dc7a3Sopenharmony_ci i = candidate_count; 912cc1dc7a3Sopenharmony_ci break; 913cc1dc7a3Sopenharmony_ci } 914cc1dc7a3Sopenharmony_ci } 915cc1dc7a3Sopenharmony_ci 916cc1dc7a3Sopenharmony_ci if (!adjustments) 917cc1dc7a3Sopenharmony_ci { 918cc1dc7a3Sopenharmony_ci break; 919cc1dc7a3Sopenharmony_ci } 920cc1dc7a3Sopenharmony_ci } 921cc1dc7a3Sopenharmony_ci } 922cc1dc7a3Sopenharmony_ci 923cc1dc7a3Sopenharmony_ci return best_errorval_in_mode; 924cc1dc7a3Sopenharmony_ci} 925cc1dc7a3Sopenharmony_ci 926cc1dc7a3Sopenharmony_ci/** 927cc1dc7a3Sopenharmony_ci * @brief Compress a block using a chosen partitioning and 2 planes of weights. 928cc1dc7a3Sopenharmony_ci * 929cc1dc7a3Sopenharmony_ci * @param config The compressor configuration. 930cc1dc7a3Sopenharmony_ci * @param bsd The block size information. 931cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 932cc1dc7a3Sopenharmony_ci * @param tune_errorval_threshold The error value threshold. 933cc1dc7a3Sopenharmony_ci * @param plane2_component The component index for the second plane of weights. 934cc1dc7a3Sopenharmony_ci * @param[out] scb The symbolic compressed block output. 935cc1dc7a3Sopenharmony_ci * @param[out] tmpbuf The quantized weights for plane 1. 936cc1dc7a3Sopenharmony_ci */ 937cc1dc7a3Sopenharmony_cistatic float compress_symbolic_block_for_partition_2planes( 938cc1dc7a3Sopenharmony_ci QualityProfile privateProfile, 939cc1dc7a3Sopenharmony_ci const astcenc_config& config, 940cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 941cc1dc7a3Sopenharmony_ci const image_block& blk, 942cc1dc7a3Sopenharmony_ci float tune_errorval_threshold, 943cc1dc7a3Sopenharmony_ci unsigned int plane2_component, 944cc1dc7a3Sopenharmony_ci symbolic_compressed_block& scb, 945cc1dc7a3Sopenharmony_ci compression_working_buffers& tmpbuf, 946cc1dc7a3Sopenharmony_ci int quant_limit 947cc1dc7a3Sopenharmony_ci) { 948cc1dc7a3Sopenharmony_ci promise(config.tune_candidate_limit > 0); 949cc1dc7a3Sopenharmony_ci promise(config.tune_refinement_limit > 0); 950cc1dc7a3Sopenharmony_ci promise(bsd.decimation_mode_count_selected > 0); 951cc1dc7a3Sopenharmony_ci 952cc1dc7a3Sopenharmony_ci int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); 953cc1dc7a3Sopenharmony_ci 954cc1dc7a3Sopenharmony_ci // Compute ideal weights and endpoint colors, with no quantization or decimation 955cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei1 = tmpbuf.ei1; 956cc1dc7a3Sopenharmony_ci endpoints_and_weights& ei2 = tmpbuf.ei2; 957cc1dc7a3Sopenharmony_ci 958cc1dc7a3Sopenharmony_ci compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2); 959cc1dc7a3Sopenharmony_ci 960cc1dc7a3Sopenharmony_ci // Compute ideal weights and endpoint colors for every decimation 961cc1dc7a3Sopenharmony_ci float* dec_weights_ideal = tmpbuf.dec_weights_ideal; 962cc1dc7a3Sopenharmony_ci uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant; 963cc1dc7a3Sopenharmony_ci 964cc1dc7a3Sopenharmony_ci // For each decimation mode, compute an ideal set of weights with no quantization 965cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++) 966cc1dc7a3Sopenharmony_ci { 967cc1dc7a3Sopenharmony_ci const auto& dm = bsd.get_decimation_mode(i); 968cc1dc7a3Sopenharmony_ci if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant))) 969cc1dc7a3Sopenharmony_ci { 970cc1dc7a3Sopenharmony_ci continue; 971cc1dc7a3Sopenharmony_ci } 972cc1dc7a3Sopenharmony_ci 973cc1dc7a3Sopenharmony_ci const auto& di = bsd.get_decimation_info(i); 974cc1dc7a3Sopenharmony_ci 975cc1dc7a3Sopenharmony_ci compute_ideal_weights_for_decimation( 976cc1dc7a3Sopenharmony_ci ei1, 977cc1dc7a3Sopenharmony_ci di, 978cc1dc7a3Sopenharmony_ci dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); 979cc1dc7a3Sopenharmony_ci 980cc1dc7a3Sopenharmony_ci compute_ideal_weights_for_decimation( 981cc1dc7a3Sopenharmony_ci ei2, 982cc1dc7a3Sopenharmony_ci di, 983cc1dc7a3Sopenharmony_ci dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET); 984cc1dc7a3Sopenharmony_ci } 985cc1dc7a3Sopenharmony_ci 986cc1dc7a3Sopenharmony_ci // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal 987cc1dc7a3Sopenharmony_ci // weight pair, compute the smallest weight that will result in a color value greater than 1 988cc1dc7a3Sopenharmony_ci vfloat4 min_ep1(10.0f); 989cc1dc7a3Sopenharmony_ci vfloat4 min_ep2(10.0f); 990cc1dc7a3Sopenharmony_ci 991cc1dc7a3Sopenharmony_ci vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]); 992cc1dc7a3Sopenharmony_ci vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1); 993cc1dc7a3Sopenharmony_ci min_ep1 = select(min_ep1, ep1, use_ep1); 994cc1dc7a3Sopenharmony_ci 995cc1dc7a3Sopenharmony_ci vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]); 996cc1dc7a3Sopenharmony_ci vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2); 997cc1dc7a3Sopenharmony_ci min_ep2 = select(min_ep2, ep2, use_ep2); 998cc1dc7a3Sopenharmony_ci 999cc1dc7a3Sopenharmony_ci vfloat4 err_max(ERROR_CALC_DEFAULT); 1000cc1dc7a3Sopenharmony_ci vmask4 err_mask = vint4::lane_id() == vint4(plane2_component); 1001cc1dc7a3Sopenharmony_ci 1002cc1dc7a3Sopenharmony_ci // Set the plane2 component to max error in ep1 1003cc1dc7a3Sopenharmony_ci min_ep1 = select(min_ep1, err_max, err_mask); 1004cc1dc7a3Sopenharmony_ci 1005cc1dc7a3Sopenharmony_ci float min_wt_cutoff1 = hmin_s(min_ep1); 1006cc1dc7a3Sopenharmony_ci 1007cc1dc7a3Sopenharmony_ci // Set the minwt2 to the plane2 component min in ep2 1008cc1dc7a3Sopenharmony_ci float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask)); 1009cc1dc7a3Sopenharmony_ci 1010cc1dc7a3Sopenharmony_ci compute_angular_endpoints_2planes( 1011cc1dc7a3Sopenharmony_ci privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); 1012cc1dc7a3Sopenharmony_ci 1013cc1dc7a3Sopenharmony_ci // For each mode (which specifies a decimation and a quantization): 1014cc1dc7a3Sopenharmony_ci // * Compute number of bits needed for the quantized weights 1015cc1dc7a3Sopenharmony_ci // * Generate an optimized set of quantized weights 1016cc1dc7a3Sopenharmony_ci // * Compute quantization errors for the mode 1017cc1dc7a3Sopenharmony_ci 1018cc1dc7a3Sopenharmony_ci float* weight_low_value1 = tmpbuf.weight_low_value1; 1019cc1dc7a3Sopenharmony_ci float* weight_high_value1 = tmpbuf.weight_high_value1; 1020cc1dc7a3Sopenharmony_ci float* weight_low_value2 = tmpbuf.weight_low_value2; 1021cc1dc7a3Sopenharmony_ci float* weight_high_value2 = tmpbuf.weight_high_value2; 1022cc1dc7a3Sopenharmony_ci 1023cc1dc7a3Sopenharmony_ci int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts; 1024cc1dc7a3Sopenharmony_ci float* qwt_errors = tmpbuf.qwt_errors; 1025cc1dc7a3Sopenharmony_ci 1026cc1dc7a3Sopenharmony_ci unsigned int start_2plane = bsd.block_mode_count_1plane_selected; 1027cc1dc7a3Sopenharmony_ci unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected; 1028cc1dc7a3Sopenharmony_ci 1029cc1dc7a3Sopenharmony_ci for (unsigned int i = start_2plane; i < end_2plane; i++) 1030cc1dc7a3Sopenharmony_ci { 1031cc1dc7a3Sopenharmony_ci const block_mode& bm = bsd.block_modes[i]; 1032cc1dc7a3Sopenharmony_ci assert(bm.is_dual_plane); 1033cc1dc7a3Sopenharmony_ci 1034cc1dc7a3Sopenharmony_ci if (bm.quant_mode > max_weight_quant) 1035cc1dc7a3Sopenharmony_ci { 1036cc1dc7a3Sopenharmony_ci qwt_errors[i] = 1e38f; 1037cc1dc7a3Sopenharmony_ci continue; 1038cc1dc7a3Sopenharmony_ci } 1039cc1dc7a3Sopenharmony_ci 1040cc1dc7a3Sopenharmony_ci qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits); 1041cc1dc7a3Sopenharmony_ci 1042cc1dc7a3Sopenharmony_ci if (weight_high_value1[i] > 1.02f * min_wt_cutoff1) 1043cc1dc7a3Sopenharmony_ci { 1044cc1dc7a3Sopenharmony_ci weight_high_value1[i] = 1.0f; 1045cc1dc7a3Sopenharmony_ci } 1046cc1dc7a3Sopenharmony_ci 1047cc1dc7a3Sopenharmony_ci if (weight_high_value2[i] > 1.02f * min_wt_cutoff2) 1048cc1dc7a3Sopenharmony_ci { 1049cc1dc7a3Sopenharmony_ci weight_high_value2[i] = 1.0f; 1050cc1dc7a3Sopenharmony_ci } 1051cc1dc7a3Sopenharmony_ci 1052cc1dc7a3Sopenharmony_ci unsigned int decimation_mode = bm.decimation_mode; 1053cc1dc7a3Sopenharmony_ci const auto& di = bsd.get_decimation_info(decimation_mode); 1054cc1dc7a3Sopenharmony_ci 1055cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; 1056cc1dc7a3Sopenharmony_ci 1057cc1dc7a3Sopenharmony_ci // Generate the optimized set of weights for the mode 1058cc1dc7a3Sopenharmony_ci compute_quantized_weights_for_decimation( 1059cc1dc7a3Sopenharmony_ci di, 1060cc1dc7a3Sopenharmony_ci weight_low_value1[i], 1061cc1dc7a3Sopenharmony_ci weight_high_value1[i], 1062cc1dc7a3Sopenharmony_ci dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode, 1063cc1dc7a3Sopenharmony_ci dec_weights_uquantf, 1064cc1dc7a3Sopenharmony_ci dec_weights_uquant + BLOCK_MAX_WEIGHTS * i, 1065cc1dc7a3Sopenharmony_ci bm.get_weight_quant_mode()); 1066cc1dc7a3Sopenharmony_ci 1067cc1dc7a3Sopenharmony_ci compute_quantized_weights_for_decimation( 1068cc1dc7a3Sopenharmony_ci di, 1069cc1dc7a3Sopenharmony_ci weight_low_value2[i], 1070cc1dc7a3Sopenharmony_ci weight_high_value2[i], 1071cc1dc7a3Sopenharmony_ci dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET, 1072cc1dc7a3Sopenharmony_ci dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET, 1073cc1dc7a3Sopenharmony_ci dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET, 1074cc1dc7a3Sopenharmony_ci bm.get_weight_quant_mode()); 1075cc1dc7a3Sopenharmony_ci 1076cc1dc7a3Sopenharmony_ci // Compute weight quantization errors for the block mode 1077cc1dc7a3Sopenharmony_ci qwt_errors[i] = compute_error_of_weight_set_2planes( 1078cc1dc7a3Sopenharmony_ci ei1, 1079cc1dc7a3Sopenharmony_ci ei2, 1080cc1dc7a3Sopenharmony_ci di, 1081cc1dc7a3Sopenharmony_ci dec_weights_uquantf, 1082cc1dc7a3Sopenharmony_ci dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET); 1083cc1dc7a3Sopenharmony_ci } 1084cc1dc7a3Sopenharmony_ci 1085cc1dc7a3Sopenharmony_ci // Decide the optimal combination of color endpoint encodings and weight encodings 1086cc1dc7a3Sopenharmony_ci uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS]; 1087cc1dc7a3Sopenharmony_ci int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES]; 1088cc1dc7a3Sopenharmony_ci 1089cc1dc7a3Sopenharmony_ci quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES]; 1090cc1dc7a3Sopenharmony_ci quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; 1091cc1dc7a3Sopenharmony_ci 1092cc1dc7a3Sopenharmony_ci endpoints epm; 1093cc1dc7a3Sopenharmony_ci merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm); 1094cc1dc7a3Sopenharmony_ci 1095cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_partition_info(1, 0); 1096cc1dc7a3Sopenharmony_ci unsigned int candidate_count = compute_ideal_endpoint_formats( 1097cc1dc7a3Sopenharmony_ci config.privateProfile, 1098cc1dc7a3Sopenharmony_ci pi, blk, epm, qwt_bitcounts, qwt_errors, 1099cc1dc7a3Sopenharmony_ci config.tune_candidate_limit, 1100cc1dc7a3Sopenharmony_ci bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected, 1101cc1dc7a3Sopenharmony_ci partition_format_specifiers, block_mode_index, 1102cc1dc7a3Sopenharmony_ci color_quant_level, color_quant_level_mod, tmpbuf); 1103cc1dc7a3Sopenharmony_ci 1104cc1dc7a3Sopenharmony_ci // Iterate over the N believed-to-be-best modes to find out which one is actually best 1105cc1dc7a3Sopenharmony_ci float best_errorval_in_mode = ERROR_CALC_DEFAULT; 1106cc1dc7a3Sopenharmony_ci float best_errorval_in_scb = scb.errorval; 1107cc1dc7a3Sopenharmony_ci 1108cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < candidate_count; i++) 1109cc1dc7a3Sopenharmony_ci { 1110cc1dc7a3Sopenharmony_ci TRACE_NODE(node0, "candidate"); 1111cc1dc7a3Sopenharmony_ci 1112cc1dc7a3Sopenharmony_ci const int bm_packed_index = block_mode_index[i]; 1113cc1dc7a3Sopenharmony_ci assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) && 1114cc1dc7a3Sopenharmony_ci bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected)); 1115cc1dc7a3Sopenharmony_ci const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; 1116cc1dc7a3Sopenharmony_ci 1117cc1dc7a3Sopenharmony_ci int decimation_mode = qw_bm.decimation_mode; 1118cc1dc7a3Sopenharmony_ci const auto& di = bsd.get_decimation_info(decimation_mode); 1119cc1dc7a3Sopenharmony_ci promise(di.weight_count > 0); 1120cc1dc7a3Sopenharmony_ci 1121cc1dc7a3Sopenharmony_ci trace_add_data("weight_x", di.weight_x); 1122cc1dc7a3Sopenharmony_ci trace_add_data("weight_y", di.weight_y); 1123cc1dc7a3Sopenharmony_ci trace_add_data("weight_z", di.weight_z); 1124cc1dc7a3Sopenharmony_ci trace_add_data("weight_quant", qw_bm.quant_mode); 1125cc1dc7a3Sopenharmony_ci 1126cc1dc7a3Sopenharmony_ci vfloat4 rgbs_color; 1127cc1dc7a3Sopenharmony_ci vfloat4 rgbo_color; 1128cc1dc7a3Sopenharmony_ci 1129cc1dc7a3Sopenharmony_ci symbolic_compressed_block workscb; 1130cc1dc7a3Sopenharmony_ci endpoints workep = epm; 1131cc1dc7a3Sopenharmony_ci 1132cc1dc7a3Sopenharmony_ci uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index; 1133cc1dc7a3Sopenharmony_ci uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET; 1134cc1dc7a3Sopenharmony_ci 1135cc1dc7a3Sopenharmony_ci for (int j = 0; j < di.weight_count; j++) 1136cc1dc7a3Sopenharmony_ci { 1137cc1dc7a3Sopenharmony_ci workscb.weights[j] = u8_weight1_src[j]; 1138cc1dc7a3Sopenharmony_ci workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j]; 1139cc1dc7a3Sopenharmony_ci } 1140cc1dc7a3Sopenharmony_ci 1141cc1dc7a3Sopenharmony_ci for (unsigned int l = 0; l < config.tune_refinement_limit; l++) 1142cc1dc7a3Sopenharmony_ci { 1143cc1dc7a3Sopenharmony_ci recompute_ideal_colors_2planes( 1144cc1dc7a3Sopenharmony_ci blk, bsd, di, 1145cc1dc7a3Sopenharmony_ci workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET, 1146cc1dc7a3Sopenharmony_ci workep, rgbs_color, rgbo_color, plane2_component); 1147cc1dc7a3Sopenharmony_ci 1148cc1dc7a3Sopenharmony_ci // Quantize the chosen color 1149cc1dc7a3Sopenharmony_ci workscb.color_formats[0] = pack_color_endpoints( 1150cc1dc7a3Sopenharmony_ci privateProfile, 1151cc1dc7a3Sopenharmony_ci workep.endpt0[0], 1152cc1dc7a3Sopenharmony_ci workep.endpt1[0], 1153cc1dc7a3Sopenharmony_ci rgbs_color, rgbo_color, 1154cc1dc7a3Sopenharmony_ci partition_format_specifiers[i][0], 1155cc1dc7a3Sopenharmony_ci workscb.color_values[0], 1156cc1dc7a3Sopenharmony_ci color_quant_level[i]); 1157cc1dc7a3Sopenharmony_ci 1158cc1dc7a3Sopenharmony_ci // Store header fields 1159cc1dc7a3Sopenharmony_ci workscb.partition_count = 1; 1160cc1dc7a3Sopenharmony_ci workscb.partition_index = 0; 1161cc1dc7a3Sopenharmony_ci workscb.quant_mode = color_quant_level[i]; 1162cc1dc7a3Sopenharmony_ci workscb.color_formats_matched = 0; 1163cc1dc7a3Sopenharmony_ci workscb.block_mode = qw_bm.mode_index; 1164cc1dc7a3Sopenharmony_ci workscb.plane2_component = static_cast<int8_t>(plane2_component); 1165cc1dc7a3Sopenharmony_ci workscb.block_type = SYM_BTYPE_NONCONST; 1166cc1dc7a3Sopenharmony_ci 1167cc1dc7a3Sopenharmony_ci // Pre-realign test 1168cc1dc7a3Sopenharmony_ci if (l == 0) 1169cc1dc7a3Sopenharmony_ci { 1170cc1dc7a3Sopenharmony_ci float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk); 1171cc1dc7a3Sopenharmony_ci if (errorval == -ERROR_CALC_DEFAULT) 1172cc1dc7a3Sopenharmony_ci { 1173cc1dc7a3Sopenharmony_ci errorval = -errorval; 1174cc1dc7a3Sopenharmony_ci workscb.block_type = SYM_BTYPE_ERROR; 1175cc1dc7a3Sopenharmony_ci } 1176cc1dc7a3Sopenharmony_ci 1177cc1dc7a3Sopenharmony_ci trace_add_data("error_prerealign", errorval); 1178cc1dc7a3Sopenharmony_ci best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode); 1179cc1dc7a3Sopenharmony_ci 1180cc1dc7a3Sopenharmony_ci // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first 1181cc1dc7a3Sopenharmony_ci // iteration can help more so we give it a extra 8% leeway. Use this knowledge to 1182cc1dc7a3Sopenharmony_ci // drive a heuristic to skip blocks that are unlikely to catch up with the best 1183cc1dc7a3Sopenharmony_ci // block we have already. 1184cc1dc7a3Sopenharmony_ci unsigned int iters_remaining = config.tune_refinement_limit - l; 1185cc1dc7a3Sopenharmony_ci float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f; 1186cc1dc7a3Sopenharmony_ci if (errorval > (threshold * best_errorval_in_scb)) 1187cc1dc7a3Sopenharmony_ci { 1188cc1dc7a3Sopenharmony_ci break; 1189cc1dc7a3Sopenharmony_ci } 1190cc1dc7a3Sopenharmony_ci 1191cc1dc7a3Sopenharmony_ci if (errorval < best_errorval_in_scb) 1192cc1dc7a3Sopenharmony_ci { 1193cc1dc7a3Sopenharmony_ci best_errorval_in_scb = errorval; 1194cc1dc7a3Sopenharmony_ci workscb.errorval = errorval; 1195cc1dc7a3Sopenharmony_ci scb = workscb; 1196cc1dc7a3Sopenharmony_ci 1197cc1dc7a3Sopenharmony_ci if (errorval < tune_errorval_threshold) 1198cc1dc7a3Sopenharmony_ci { 1199cc1dc7a3Sopenharmony_ci // Skip remaining candidates - this is "good enough" 1200cc1dc7a3Sopenharmony_ci i = candidate_count; 1201cc1dc7a3Sopenharmony_ci break; 1202cc1dc7a3Sopenharmony_ci } 1203cc1dc7a3Sopenharmony_ci } 1204cc1dc7a3Sopenharmony_ci } 1205cc1dc7a3Sopenharmony_ci 1206cc1dc7a3Sopenharmony_ci // Perform a final pass over the weights to try to improve them. 1207cc1dc7a3Sopenharmony_ci bool adjustments; 1208cc1dc7a3Sopenharmony_ci if (di.weight_count != bsd.texel_count) 1209cc1dc7a3Sopenharmony_ci { 1210cc1dc7a3Sopenharmony_ci adjustments = realign_weights_decimated( 1211cc1dc7a3Sopenharmony_ci config.profile, bsd, blk, workscb); 1212cc1dc7a3Sopenharmony_ci } 1213cc1dc7a3Sopenharmony_ci else 1214cc1dc7a3Sopenharmony_ci { 1215cc1dc7a3Sopenharmony_ci adjustments = realign_weights_undecimated( 1216cc1dc7a3Sopenharmony_ci config.profile, bsd, blk, workscb); 1217cc1dc7a3Sopenharmony_ci } 1218cc1dc7a3Sopenharmony_ci 1219cc1dc7a3Sopenharmony_ci // Post-realign test 1220cc1dc7a3Sopenharmony_ci float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk); 1221cc1dc7a3Sopenharmony_ci if (errorval == -ERROR_CALC_DEFAULT) 1222cc1dc7a3Sopenharmony_ci { 1223cc1dc7a3Sopenharmony_ci errorval = -errorval; 1224cc1dc7a3Sopenharmony_ci workscb.block_type = SYM_BTYPE_ERROR; 1225cc1dc7a3Sopenharmony_ci } 1226cc1dc7a3Sopenharmony_ci 1227cc1dc7a3Sopenharmony_ci trace_add_data("error_postrealign", errorval); 1228cc1dc7a3Sopenharmony_ci best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode); 1229cc1dc7a3Sopenharmony_ci 1230cc1dc7a3Sopenharmony_ci // Average refinement improvement is 3.5% per iteration, so skip blocks that are 1231cc1dc7a3Sopenharmony_ci // unlikely to catch up with the best block we have already. Assume a 4.5% per step to 1232cc1dc7a3Sopenharmony_ci // give benefit of the doubt ... 1233cc1dc7a3Sopenharmony_ci unsigned int iters_remaining = config.tune_refinement_limit - 1 - l; 1234cc1dc7a3Sopenharmony_ci float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f; 1235cc1dc7a3Sopenharmony_ci if (errorval > (threshold * best_errorval_in_scb)) 1236cc1dc7a3Sopenharmony_ci { 1237cc1dc7a3Sopenharmony_ci break; 1238cc1dc7a3Sopenharmony_ci } 1239cc1dc7a3Sopenharmony_ci 1240cc1dc7a3Sopenharmony_ci if (errorval < best_errorval_in_scb) 1241cc1dc7a3Sopenharmony_ci { 1242cc1dc7a3Sopenharmony_ci best_errorval_in_scb = errorval; 1243cc1dc7a3Sopenharmony_ci workscb.errorval = errorval; 1244cc1dc7a3Sopenharmony_ci scb = workscb; 1245cc1dc7a3Sopenharmony_ci 1246cc1dc7a3Sopenharmony_ci if (errorval < tune_errorval_threshold) 1247cc1dc7a3Sopenharmony_ci { 1248cc1dc7a3Sopenharmony_ci // Skip remaining candidates - this is "good enough" 1249cc1dc7a3Sopenharmony_ci i = candidate_count; 1250cc1dc7a3Sopenharmony_ci break; 1251cc1dc7a3Sopenharmony_ci } 1252cc1dc7a3Sopenharmony_ci } 1253cc1dc7a3Sopenharmony_ci 1254cc1dc7a3Sopenharmony_ci if (!adjustments) 1255cc1dc7a3Sopenharmony_ci { 1256cc1dc7a3Sopenharmony_ci break; 1257cc1dc7a3Sopenharmony_ci } 1258cc1dc7a3Sopenharmony_ci } 1259cc1dc7a3Sopenharmony_ci } 1260cc1dc7a3Sopenharmony_ci 1261cc1dc7a3Sopenharmony_ci return best_errorval_in_mode; 1262cc1dc7a3Sopenharmony_ci} 1263cc1dc7a3Sopenharmony_ci 1264cc1dc7a3Sopenharmony_ci/** 1265cc1dc7a3Sopenharmony_ci * @brief Determine the lowest cross-channel correlation factor. 1266cc1dc7a3Sopenharmony_ci * 1267cc1dc7a3Sopenharmony_ci * @param texels_per_block The number of texels in a block. 1268cc1dc7a3Sopenharmony_ci * @param blk The image block color data to compress. 1269cc1dc7a3Sopenharmony_ci * 1270cc1dc7a3Sopenharmony_ci * @return Return the lowest correlation factor. 1271cc1dc7a3Sopenharmony_ci */ 1272cc1dc7a3Sopenharmony_cistatic float prepare_block_statistics( 1273cc1dc7a3Sopenharmony_ci int texels_per_block, 1274cc1dc7a3Sopenharmony_ci const image_block& blk 1275cc1dc7a3Sopenharmony_ci) { 1276cc1dc7a3Sopenharmony_ci // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row 1277cc1dc7a3Sopenharmony_ci // of the matrix. The matrix is symmetric, so this is all we need for this use case. 1278cc1dc7a3Sopenharmony_ci float rs = 0.0f; 1279cc1dc7a3Sopenharmony_ci float gs = 0.0f; 1280cc1dc7a3Sopenharmony_ci float bs = 0.0f; 1281cc1dc7a3Sopenharmony_ci float as = 0.0f; 1282cc1dc7a3Sopenharmony_ci float rr_var = 0.0f; 1283cc1dc7a3Sopenharmony_ci float gg_var = 0.0f; 1284cc1dc7a3Sopenharmony_ci float bb_var = 0.0f; 1285cc1dc7a3Sopenharmony_ci float aa_var = 0.0f; 1286cc1dc7a3Sopenharmony_ci float rg_cov = 0.0f; 1287cc1dc7a3Sopenharmony_ci float rb_cov = 0.0f; 1288cc1dc7a3Sopenharmony_ci float ra_cov = 0.0f; 1289cc1dc7a3Sopenharmony_ci float gb_cov = 0.0f; 1290cc1dc7a3Sopenharmony_ci float ga_cov = 0.0f; 1291cc1dc7a3Sopenharmony_ci float ba_cov = 0.0f; 1292cc1dc7a3Sopenharmony_ci 1293cc1dc7a3Sopenharmony_ci float weight_sum = 0.0f; 1294cc1dc7a3Sopenharmony_ci 1295cc1dc7a3Sopenharmony_ci promise(texels_per_block > 0); 1296cc1dc7a3Sopenharmony_ci for (int i = 0; i < texels_per_block; i++) 1297cc1dc7a3Sopenharmony_ci { 1298cc1dc7a3Sopenharmony_ci float weight = hadd_s(blk.channel_weight) / 4.0f; 1299cc1dc7a3Sopenharmony_ci assert(weight >= 0.0f); 1300cc1dc7a3Sopenharmony_ci weight_sum += weight; 1301cc1dc7a3Sopenharmony_ci 1302cc1dc7a3Sopenharmony_ci float r = blk.data_r[i]; 1303cc1dc7a3Sopenharmony_ci float g = blk.data_g[i]; 1304cc1dc7a3Sopenharmony_ci float b = blk.data_b[i]; 1305cc1dc7a3Sopenharmony_ci float a = blk.data_a[i]; 1306cc1dc7a3Sopenharmony_ci 1307cc1dc7a3Sopenharmony_ci float rw = r * weight; 1308cc1dc7a3Sopenharmony_ci rs += rw; 1309cc1dc7a3Sopenharmony_ci rr_var += r * rw; 1310cc1dc7a3Sopenharmony_ci rg_cov += g * rw; 1311cc1dc7a3Sopenharmony_ci rb_cov += b * rw; 1312cc1dc7a3Sopenharmony_ci ra_cov += a * rw; 1313cc1dc7a3Sopenharmony_ci 1314cc1dc7a3Sopenharmony_ci float gw = g * weight; 1315cc1dc7a3Sopenharmony_ci gs += gw; 1316cc1dc7a3Sopenharmony_ci gg_var += g * gw; 1317cc1dc7a3Sopenharmony_ci gb_cov += b * gw; 1318cc1dc7a3Sopenharmony_ci ga_cov += a * gw; 1319cc1dc7a3Sopenharmony_ci 1320cc1dc7a3Sopenharmony_ci float bw = b * weight; 1321cc1dc7a3Sopenharmony_ci bs += bw; 1322cc1dc7a3Sopenharmony_ci bb_var += b * bw; 1323cc1dc7a3Sopenharmony_ci ba_cov += a * bw; 1324cc1dc7a3Sopenharmony_ci 1325cc1dc7a3Sopenharmony_ci float aw = a * weight; 1326cc1dc7a3Sopenharmony_ci as += aw; 1327cc1dc7a3Sopenharmony_ci aa_var += a * aw; 1328cc1dc7a3Sopenharmony_ci } 1329cc1dc7a3Sopenharmony_ci 1330cc1dc7a3Sopenharmony_ci float rpt = 1.0f / astc::max(weight_sum, 1e-7f); 1331cc1dc7a3Sopenharmony_ci 1332cc1dc7a3Sopenharmony_ci rr_var -= rs * (rs * rpt); 1333cc1dc7a3Sopenharmony_ci rg_cov -= gs * (rs * rpt); 1334cc1dc7a3Sopenharmony_ci rb_cov -= bs * (rs * rpt); 1335cc1dc7a3Sopenharmony_ci ra_cov -= as * (rs * rpt); 1336cc1dc7a3Sopenharmony_ci 1337cc1dc7a3Sopenharmony_ci gg_var -= gs * (gs * rpt); 1338cc1dc7a3Sopenharmony_ci gb_cov -= bs * (gs * rpt); 1339cc1dc7a3Sopenharmony_ci ga_cov -= as * (gs * rpt); 1340cc1dc7a3Sopenharmony_ci 1341cc1dc7a3Sopenharmony_ci bb_var -= bs * (bs * rpt); 1342cc1dc7a3Sopenharmony_ci ba_cov -= as * (bs * rpt); 1343cc1dc7a3Sopenharmony_ci 1344cc1dc7a3Sopenharmony_ci aa_var -= as * (as * rpt); 1345cc1dc7a3Sopenharmony_ci 1346cc1dc7a3Sopenharmony_ci // These will give a NaN if a channel is constant - these are fixed up in the next step 1347cc1dc7a3Sopenharmony_ci rg_cov *= astc::rsqrt(rr_var * gg_var); 1348cc1dc7a3Sopenharmony_ci rb_cov *= astc::rsqrt(rr_var * bb_var); 1349cc1dc7a3Sopenharmony_ci ra_cov *= astc::rsqrt(rr_var * aa_var); 1350cc1dc7a3Sopenharmony_ci gb_cov *= astc::rsqrt(gg_var * bb_var); 1351cc1dc7a3Sopenharmony_ci ga_cov *= astc::rsqrt(gg_var * aa_var); 1352cc1dc7a3Sopenharmony_ci ba_cov *= astc::rsqrt(bb_var * aa_var); 1353cc1dc7a3Sopenharmony_ci 1354cc1dc7a3Sopenharmony_ci if (astc::isnan(rg_cov)) rg_cov = 1.0f; 1355cc1dc7a3Sopenharmony_ci if (astc::isnan(rb_cov)) rb_cov = 1.0f; 1356cc1dc7a3Sopenharmony_ci if (astc::isnan(ra_cov)) ra_cov = 1.0f; 1357cc1dc7a3Sopenharmony_ci if (astc::isnan(gb_cov)) gb_cov = 1.0f; 1358cc1dc7a3Sopenharmony_ci if (astc::isnan(ga_cov)) ga_cov = 1.0f; 1359cc1dc7a3Sopenharmony_ci if (astc::isnan(ba_cov)) ba_cov = 1.0f; 1360cc1dc7a3Sopenharmony_ci 1361cc1dc7a3Sopenharmony_ci float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov)); 1362cc1dc7a3Sopenharmony_ci lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov)); 1363cc1dc7a3Sopenharmony_ci lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov)); 1364cc1dc7a3Sopenharmony_ci lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov)); 1365cc1dc7a3Sopenharmony_ci lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov)); 1366cc1dc7a3Sopenharmony_ci 1367cc1dc7a3Sopenharmony_ci // Diagnostic trace points 1368cc1dc7a3Sopenharmony_ci trace_add_data("min_r", blk.data_min.lane<0>()); 1369cc1dc7a3Sopenharmony_ci trace_add_data("max_r", blk.data_max.lane<0>()); 1370cc1dc7a3Sopenharmony_ci trace_add_data("min_g", blk.data_min.lane<1>()); 1371cc1dc7a3Sopenharmony_ci trace_add_data("max_g", blk.data_max.lane<1>()); 1372cc1dc7a3Sopenharmony_ci trace_add_data("min_b", blk.data_min.lane<2>()); 1373cc1dc7a3Sopenharmony_ci trace_add_data("max_b", blk.data_max.lane<2>()); 1374cc1dc7a3Sopenharmony_ci trace_add_data("min_a", blk.data_min.lane<3>()); 1375cc1dc7a3Sopenharmony_ci trace_add_data("max_a", blk.data_max.lane<3>()); 1376cc1dc7a3Sopenharmony_ci trace_add_data("cov_rg", fabsf(rg_cov)); 1377cc1dc7a3Sopenharmony_ci trace_add_data("cov_rb", fabsf(rb_cov)); 1378cc1dc7a3Sopenharmony_ci trace_add_data("cov_ra", fabsf(ra_cov)); 1379cc1dc7a3Sopenharmony_ci trace_add_data("cov_gb", fabsf(gb_cov)); 1380cc1dc7a3Sopenharmony_ci trace_add_data("cov_ga", fabsf(ga_cov)); 1381cc1dc7a3Sopenharmony_ci trace_add_data("cov_ba", fabsf(ba_cov)); 1382cc1dc7a3Sopenharmony_ci 1383cc1dc7a3Sopenharmony_ci return lowest_correlation; 1384cc1dc7a3Sopenharmony_ci} 1385cc1dc7a3Sopenharmony_ci 1386cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 1387cc1dc7a3Sopenharmony_civoid compress_block( 1388cc1dc7a3Sopenharmony_ci const astcenc_contexti& ctx, 1389cc1dc7a3Sopenharmony_ci const image_block& blk, 1390cc1dc7a3Sopenharmony_ci uint8_t pcb[16], 1391cc1dc7a3Sopenharmony_ci#if QUALITY_CONTROL 1392cc1dc7a3Sopenharmony_ci compression_working_buffers& tmpbuf, 1393cc1dc7a3Sopenharmony_ci bool calQualityEnable, 1394cc1dc7a3Sopenharmony_ci int32_t *mseBlock[RGBA_COM] 1395cc1dc7a3Sopenharmony_ci#else 1396cc1dc7a3Sopenharmony_ci compression_working_buffers& tmpbuf 1397cc1dc7a3Sopenharmony_ci#endif 1398cc1dc7a3Sopenharmony_ci ) 1399cc1dc7a3Sopenharmony_ci{ 1400cc1dc7a3Sopenharmony_ci astcenc_profile decode_mode = ctx.config.profile; 1401cc1dc7a3Sopenharmony_ci symbolic_compressed_block scb; 1402cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd = *ctx.bsd; 1403cc1dc7a3Sopenharmony_ci float lowest_correl; 1404cc1dc7a3Sopenharmony_ci 1405cc1dc7a3Sopenharmony_ci TRACE_NODE(node0, "block"); 1406cc1dc7a3Sopenharmony_ci trace_add_data("pos_x", blk.xpos); 1407cc1dc7a3Sopenharmony_ci trace_add_data("pos_y", blk.ypos); 1408cc1dc7a3Sopenharmony_ci trace_add_data("pos_z", blk.zpos); 1409cc1dc7a3Sopenharmony_ci 1410cc1dc7a3Sopenharmony_ci // Set stricter block targets for luminance data as we have more bits to play with 1411cc1dc7a3Sopenharmony_ci bool block_is_l = blk.is_luminance(); 1412cc1dc7a3Sopenharmony_ci float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f; 1413cc1dc7a3Sopenharmony_ci 1414cc1dc7a3Sopenharmony_ci // Set slightly stricter block targets for lumalpha data as we have more bits to play with 1415cc1dc7a3Sopenharmony_ci bool block_is_la = blk.is_luminancealpha(); 1416cc1dc7a3Sopenharmony_ci float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f; 1417cc1dc7a3Sopenharmony_ci 1418cc1dc7a3Sopenharmony_ci bool block_skip_two_plane = false; 1419cc1dc7a3Sopenharmony_ci int max_partitions; 1420cc1dc7a3Sopenharmony_ci if (ctx.config.privateProfile == HIGH_SPEED_PROFILE) 1421cc1dc7a3Sopenharmony_ci { 1422cc1dc7a3Sopenharmony_ci max_partitions = 1; 1423cc1dc7a3Sopenharmony_ci } 1424cc1dc7a3Sopenharmony_ci#ifdef ASTC_CUSTOMIZED_ENABLE 1425cc1dc7a3Sopenharmony_ci else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE) 1426cc1dc7a3Sopenharmony_ci { 1427cc1dc7a3Sopenharmony_ci if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() || 1428cc1dc7a3Sopenharmony_ci g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr) 1429cc1dc7a3Sopenharmony_ci { 1430cc1dc7a3Sopenharmony_ci printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n"); 1431cc1dc7a3Sopenharmony_ci return; 1432cc1dc7a3Sopenharmony_ci } 1433cc1dc7a3Sopenharmony_ci max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_(); 1434cc1dc7a3Sopenharmony_ci } 1435cc1dc7a3Sopenharmony_ci#endif 1436cc1dc7a3Sopenharmony_ci else 1437cc1dc7a3Sopenharmony_ci { 1438cc1dc7a3Sopenharmony_ci max_partitions = ctx.config.tune_partition_count_limit; 1439cc1dc7a3Sopenharmony_ci } 1440cc1dc7a3Sopenharmony_ci 1441cc1dc7a3Sopenharmony_ci unsigned int requested_partition_indices[3] { 1442cc1dc7a3Sopenharmony_ci ctx.config.tune_2partition_index_limit, 1443cc1dc7a3Sopenharmony_ci ctx.config.tune_3partition_index_limit, 1444cc1dc7a3Sopenharmony_ci ctx.config.tune_4partition_index_limit 1445cc1dc7a3Sopenharmony_ci }; 1446cc1dc7a3Sopenharmony_ci 1447cc1dc7a3Sopenharmony_ci unsigned int requested_partition_trials[3] { 1448cc1dc7a3Sopenharmony_ci ctx.config.tune_2partitioning_candidate_limit, 1449cc1dc7a3Sopenharmony_ci ctx.config.tune_3partitioning_candidate_limit, 1450cc1dc7a3Sopenharmony_ci ctx.config.tune_4partitioning_candidate_limit 1451cc1dc7a3Sopenharmony_ci }; 1452cc1dc7a3Sopenharmony_ci 1453cc1dc7a3Sopenharmony_ci#if defined(ASTCENC_DIAGNOSTICS) 1454cc1dc7a3Sopenharmony_ci // Do this early in diagnostic builds so we can dump uniform metrics 1455cc1dc7a3Sopenharmony_ci // for every block. Do it later in release builds to avoid redundant work! 1456cc1dc7a3Sopenharmony_ci float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count; 1457cc1dc7a3Sopenharmony_ci float error_threshold = ctx.config.tune_db_limit 1458cc1dc7a3Sopenharmony_ci * error_weight_sum 1459cc1dc7a3Sopenharmony_ci * block_is_l_scale 1460cc1dc7a3Sopenharmony_ci * block_is_la_scale; 1461cc1dc7a3Sopenharmony_ci 1462cc1dc7a3Sopenharmony_ci lowest_correl = prepare_block_statistics(bsd.texel_count, blk); 1463cc1dc7a3Sopenharmony_ci trace_add_data("lowest_correl", lowest_correl); 1464cc1dc7a3Sopenharmony_ci trace_add_data("tune_error_threshold", error_threshold); 1465cc1dc7a3Sopenharmony_ci#endif 1466cc1dc7a3Sopenharmony_ci 1467cc1dc7a3Sopenharmony_ci // Detected a constant-color block 1468cc1dc7a3Sopenharmony_ci if (all(blk.data_min == blk.data_max)) 1469cc1dc7a3Sopenharmony_ci { 1470cc1dc7a3Sopenharmony_ci TRACE_NODE(node1, "pass"); 1471cc1dc7a3Sopenharmony_ci trace_add_data("partition_count", 0); 1472cc1dc7a3Sopenharmony_ci trace_add_data("plane_count", 1); 1473cc1dc7a3Sopenharmony_ci 1474cc1dc7a3Sopenharmony_ci scb.partition_count = 0; 1475cc1dc7a3Sopenharmony_ci 1476cc1dc7a3Sopenharmony_ci // Encode as FP16 if using HDR 1477cc1dc7a3Sopenharmony_ci if ((decode_mode == ASTCENC_PRF_HDR) || 1478cc1dc7a3Sopenharmony_ci (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A)) 1479cc1dc7a3Sopenharmony_ci { 1480cc1dc7a3Sopenharmony_ci scb.block_type = SYM_BTYPE_CONST_F16; 1481cc1dc7a3Sopenharmony_ci vint4 color_f16 = float_to_float16(blk.origin_texel); 1482cc1dc7a3Sopenharmony_ci store(color_f16, scb.constant_color); 1483cc1dc7a3Sopenharmony_ci } 1484cc1dc7a3Sopenharmony_ci // Encode as UNORM16 if NOT using HDR 1485cc1dc7a3Sopenharmony_ci else 1486cc1dc7a3Sopenharmony_ci { 1487cc1dc7a3Sopenharmony_ci scb.block_type = SYM_BTYPE_CONST_U16; 1488cc1dc7a3Sopenharmony_ci vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f; 1489cc1dc7a3Sopenharmony_ci vint4 color_u16 = float_to_int_rtn(color_f32); 1490cc1dc7a3Sopenharmony_ci store(color_u16, scb.constant_color); 1491cc1dc7a3Sopenharmony_ci } 1492cc1dc7a3Sopenharmony_ci 1493cc1dc7a3Sopenharmony_ci trace_add_data("exit", "quality hit"); 1494cc1dc7a3Sopenharmony_ci if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE) 1495cc1dc7a3Sopenharmony_ci { 1496cc1dc7a3Sopenharmony_ci scb.block_type = SYM_BTYPE_NONCONST; 1497cc1dc7a3Sopenharmony_ci scb.partition_count = 1; 1498cc1dc7a3Sopenharmony_ci scb.color_formats_matched = 0; 1499cc1dc7a3Sopenharmony_ci scb.plane2_component = -1; 1500cc1dc7a3Sopenharmony_ci if (ctx.config.privateProfile == HIGH_SPEED_PROFILE) 1501cc1dc7a3Sopenharmony_ci { 1502cc1dc7a3Sopenharmony_ci scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE; 1503cc1dc7a3Sopenharmony_ci } 1504cc1dc7a3Sopenharmony_ci#ifdef ASTC_CUSTOMIZED_ENABLE 1505cc1dc7a3Sopenharmony_ci else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE) 1506cc1dc7a3Sopenharmony_ci { 1507cc1dc7a3Sopenharmony_ci if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() || 1508cc1dc7a3Sopenharmony_ci g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr) 1509cc1dc7a3Sopenharmony_ci { 1510cc1dc7a3Sopenharmony_ci printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n"); 1511cc1dc7a3Sopenharmony_ci return; 1512cc1dc7a3Sopenharmony_ci } 1513cc1dc7a3Sopenharmony_ci scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_(); 1514cc1dc7a3Sopenharmony_ci } 1515cc1dc7a3Sopenharmony_ci#endif 1516cc1dc7a3Sopenharmony_ci scb.partition_index = 0; 1517cc1dc7a3Sopenharmony_ci scb.quant_mode = QUANT_256; 1518cc1dc7a3Sopenharmony_ci scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE 1519cc1dc7a3Sopenharmony_ci for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE 1520cc1dc7a3Sopenharmony_ci scb.weights[w] = 0; 1521cc1dc7a3Sopenharmony_ci } 1522cc1dc7a3Sopenharmony_ci for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit 1523cc1dc7a3Sopenharmony_ci scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte 1524cc1dc7a3Sopenharmony_ci scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte 1525cc1dc7a3Sopenharmony_ci } 1526cc1dc7a3Sopenharmony_ci } 1527cc1dc7a3Sopenharmony_ci scb.privateProfile = ctx.config.privateProfile; 1528cc1dc7a3Sopenharmony_ci symbolic_to_physical(bsd, scb, pcb); 1529cc1dc7a3Sopenharmony_ci#if QUALITY_CONTROL 1530cc1dc7a3Sopenharmony_ci if (calQualityEnable) { 1531cc1dc7a3Sopenharmony_ci *mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0; 1532cc1dc7a3Sopenharmony_ci } 1533cc1dc7a3Sopenharmony_ci#endif 1534cc1dc7a3Sopenharmony_ci return; 1535cc1dc7a3Sopenharmony_ci } 1536cc1dc7a3Sopenharmony_ci 1537cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DIAGNOSTICS) 1538cc1dc7a3Sopenharmony_ci float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count; 1539cc1dc7a3Sopenharmony_ci float error_threshold = ctx.config.tune_db_limit 1540cc1dc7a3Sopenharmony_ci * error_weight_sum 1541cc1dc7a3Sopenharmony_ci * block_is_l_scale 1542cc1dc7a3Sopenharmony_ci * block_is_la_scale; 1543cc1dc7a3Sopenharmony_ci#endif 1544cc1dc7a3Sopenharmony_ci 1545cc1dc7a3Sopenharmony_ci // Set SCB and mode errors to a very high error value 1546cc1dc7a3Sopenharmony_ci scb.errorval = ERROR_CALC_DEFAULT; 1547cc1dc7a3Sopenharmony_ci scb.block_type = SYM_BTYPE_ERROR; 1548cc1dc7a3Sopenharmony_ci 1549cc1dc7a3Sopenharmony_ci float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] { 1550cc1dc7a3Sopenharmony_ci ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT 1551cc1dc7a3Sopenharmony_ci }; 1552cc1dc7a3Sopenharmony_ci 1553cc1dc7a3Sopenharmony_ci float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] { 1554cc1dc7a3Sopenharmony_ci 0.0f, 1555cc1dc7a3Sopenharmony_ci ctx.config.tune_2partition_early_out_limit_factor, 1556cc1dc7a3Sopenharmony_ci ctx.config.tune_3partition_early_out_limit_factor, 1557cc1dc7a3Sopenharmony_ci 0.0f 1558cc1dc7a3Sopenharmony_ci }; 1559cc1dc7a3Sopenharmony_ci 1560cc1dc7a3Sopenharmony_ci // Trial using 1 plane of weights and 1 partition. 1561cc1dc7a3Sopenharmony_ci 1562cc1dc7a3Sopenharmony_ci // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified 1563cc1dc7a3Sopenharmony_ci // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this 1564cc1dc7a3Sopenharmony_ci // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the 1565cc1dc7a3Sopenharmony_ci // compression and slightly reduces image quality. 1566cc1dc7a3Sopenharmony_ci 1567cc1dc7a3Sopenharmony_ci float errorval_mult[2] { 1568cc1dc7a3Sopenharmony_ci 1.0f / ctx.config.tune_mse_overshoot, 1569cc1dc7a3Sopenharmony_ci 1.0f 1570cc1dc7a3Sopenharmony_ci }; 1571cc1dc7a3Sopenharmony_ci 1572cc1dc7a3Sopenharmony_ci static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot; 1573cc1dc7a3Sopenharmony_ci 1574cc1dc7a3Sopenharmony_ci // Only enable MODE0 fast path if enabled 1575cc1dc7a3Sopenharmony_ci // Never enable for 3D blocks as no "always" block modes are available 1576cc1dc7a3Sopenharmony_ci int start_trial = 1; 1577cc1dc7a3Sopenharmony_ci if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1)) 1578cc1dc7a3Sopenharmony_ci { 1579cc1dc7a3Sopenharmony_ci start_trial = 0; 1580cc1dc7a3Sopenharmony_ci } 1581cc1dc7a3Sopenharmony_ci 1582cc1dc7a3Sopenharmony_ci int quant_limit = QUANT_32; 1583cc1dc7a3Sopenharmony_ci for (int i = start_trial; i < 2; i++) 1584cc1dc7a3Sopenharmony_ci { 1585cc1dc7a3Sopenharmony_ci TRACE_NODE(node1, "pass"); 1586cc1dc7a3Sopenharmony_ci trace_add_data("partition_count", 1); 1587cc1dc7a3Sopenharmony_ci trace_add_data("plane_count", 1); 1588cc1dc7a3Sopenharmony_ci trace_add_data("search_mode", i); 1589cc1dc7a3Sopenharmony_ci 1590cc1dc7a3Sopenharmony_ci float errorval = compress_symbolic_block_for_partition_1plane( 1591cc1dc7a3Sopenharmony_ci ctx.config.privateProfile, 1592cc1dc7a3Sopenharmony_ci ctx.config, bsd, blk, i == 0, 1593cc1dc7a3Sopenharmony_ci error_threshold * errorval_mult[i] * errorval_overshoot, 1594cc1dc7a3Sopenharmony_ci 1, 0, scb, tmpbuf, QUANT_32); 1595cc1dc7a3Sopenharmony_ci 1596cc1dc7a3Sopenharmony_ci // Record the quant level so we can use the filter later searches 1597cc1dc7a3Sopenharmony_ci const auto& bm = bsd.get_block_mode(scb.block_mode); 1598cc1dc7a3Sopenharmony_ci quant_limit = bm.get_weight_quant_mode(); 1599cc1dc7a3Sopenharmony_ci 1600cc1dc7a3Sopenharmony_ci best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval); 1601cc1dc7a3Sopenharmony_ci if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i]))) 1602cc1dc7a3Sopenharmony_ci { 1603cc1dc7a3Sopenharmony_ci trace_add_data("exit", "quality hit"); 1604cc1dc7a3Sopenharmony_ci goto END_OF_TESTS; 1605cc1dc7a3Sopenharmony_ci } 1606cc1dc7a3Sopenharmony_ci } 1607cc1dc7a3Sopenharmony_ci 1608cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DIAGNOSTICS) 1609cc1dc7a3Sopenharmony_ci lowest_correl = prepare_block_statistics(bsd.texel_count, blk); 1610cc1dc7a3Sopenharmony_ci#endif 1611cc1dc7a3Sopenharmony_ci 1612cc1dc7a3Sopenharmony_ci block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation; 1613cc1dc7a3Sopenharmony_ci 1614cc1dc7a3Sopenharmony_ci // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as 1615cc1dc7a3Sopenharmony_ci // alpha is the most likely to be non-correlated if it is present in the data. 1616cc1dc7a3Sopenharmony_ci for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--) 1617cc1dc7a3Sopenharmony_ci { 1618cc1dc7a3Sopenharmony_ci if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE) 1619cc1dc7a3Sopenharmony_ci { 1620cc1dc7a3Sopenharmony_ci break; 1621cc1dc7a3Sopenharmony_ci } 1622cc1dc7a3Sopenharmony_ci TRACE_NODE(node1, "pass"); 1623cc1dc7a3Sopenharmony_ci trace_add_data("partition_count", 1); 1624cc1dc7a3Sopenharmony_ci trace_add_data("plane_count", 2); 1625cc1dc7a3Sopenharmony_ci trace_add_data("plane_component", i); 1626cc1dc7a3Sopenharmony_ci 1627cc1dc7a3Sopenharmony_ci if (block_skip_two_plane) 1628cc1dc7a3Sopenharmony_ci { 1629cc1dc7a3Sopenharmony_ci trace_add_data("skip", "tune_2plane_early_out_limit_correlation"); 1630cc1dc7a3Sopenharmony_ci continue; 1631cc1dc7a3Sopenharmony_ci } 1632cc1dc7a3Sopenharmony_ci 1633cc1dc7a3Sopenharmony_ci if (blk.grayscale && i != 3) 1634cc1dc7a3Sopenharmony_ci { 1635cc1dc7a3Sopenharmony_ci trace_add_data("skip", "grayscale block"); 1636cc1dc7a3Sopenharmony_ci continue; 1637cc1dc7a3Sopenharmony_ci } 1638cc1dc7a3Sopenharmony_ci 1639cc1dc7a3Sopenharmony_ci if (blk.is_constant_channel(i)) 1640cc1dc7a3Sopenharmony_ci { 1641cc1dc7a3Sopenharmony_ci trace_add_data("skip", "constant component"); 1642cc1dc7a3Sopenharmony_ci continue; 1643cc1dc7a3Sopenharmony_ci } 1644cc1dc7a3Sopenharmony_ci 1645cc1dc7a3Sopenharmony_ci float errorval = compress_symbolic_block_for_partition_2planes( 1646cc1dc7a3Sopenharmony_ci ctx.config.privateProfile, 1647cc1dc7a3Sopenharmony_ci ctx.config, bsd, blk, error_threshold * errorval_overshoot, 1648cc1dc7a3Sopenharmony_ci i, scb, tmpbuf, quant_limit); 1649cc1dc7a3Sopenharmony_ci 1650cc1dc7a3Sopenharmony_ci // If attempting two planes is much worse than the best one plane result 1651cc1dc7a3Sopenharmony_ci // then further two plane searches are unlikely to help so move on ... 1652cc1dc7a3Sopenharmony_ci if (errorval > (best_errorvals_for_pcount[0] * 1.85f)) 1653cc1dc7a3Sopenharmony_ci { 1654cc1dc7a3Sopenharmony_ci break; 1655cc1dc7a3Sopenharmony_ci } 1656cc1dc7a3Sopenharmony_ci 1657cc1dc7a3Sopenharmony_ci if (errorval < error_threshold) 1658cc1dc7a3Sopenharmony_ci { 1659cc1dc7a3Sopenharmony_ci trace_add_data("exit", "quality hit"); 1660cc1dc7a3Sopenharmony_ci goto END_OF_TESTS; 1661cc1dc7a3Sopenharmony_ci } 1662cc1dc7a3Sopenharmony_ci } 1663cc1dc7a3Sopenharmony_ci 1664cc1dc7a3Sopenharmony_ci // Find best blocks for 2, 3 and 4 partitions 1665cc1dc7a3Sopenharmony_ci for (int partition_count = 2; partition_count <= max_partitions; partition_count++) 1666cc1dc7a3Sopenharmony_ci { 1667cc1dc7a3Sopenharmony_ci unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES]; 1668cc1dc7a3Sopenharmony_ci 1669cc1dc7a3Sopenharmony_ci unsigned int requested_indices = requested_partition_indices[partition_count - 2]; 1670cc1dc7a3Sopenharmony_ci 1671cc1dc7a3Sopenharmony_ci unsigned int requested_trials = requested_partition_trials[partition_count - 2]; 1672cc1dc7a3Sopenharmony_ci requested_trials = astc::min(requested_trials, requested_indices); 1673cc1dc7a3Sopenharmony_ci 1674cc1dc7a3Sopenharmony_ci unsigned int actual_trials = find_best_partition_candidates( 1675cc1dc7a3Sopenharmony_ci bsd, blk, partition_count, requested_indices, partition_indices, requested_trials); 1676cc1dc7a3Sopenharmony_ci 1677cc1dc7a3Sopenharmony_ci float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2]; 1678cc1dc7a3Sopenharmony_ci 1679cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < actual_trials; i++) 1680cc1dc7a3Sopenharmony_ci { 1681cc1dc7a3Sopenharmony_ci TRACE_NODE(node1, "pass"); 1682cc1dc7a3Sopenharmony_ci trace_add_data("partition_count", partition_count); 1683cc1dc7a3Sopenharmony_ci trace_add_data("partition_index", partition_indices[i]); 1684cc1dc7a3Sopenharmony_ci trace_add_data("plane_count", 1); 1685cc1dc7a3Sopenharmony_ci trace_add_data("search_mode", i); 1686cc1dc7a3Sopenharmony_ci 1687cc1dc7a3Sopenharmony_ci float errorval = compress_symbolic_block_for_partition_1plane( 1688cc1dc7a3Sopenharmony_ci ctx.config.privateProfile, 1689cc1dc7a3Sopenharmony_ci ctx.config, bsd, blk, false, 1690cc1dc7a3Sopenharmony_ci error_threshold * errorval_overshoot, 1691cc1dc7a3Sopenharmony_ci partition_count, partition_indices[i], 1692cc1dc7a3Sopenharmony_ci scb, tmpbuf, quant_limit); 1693cc1dc7a3Sopenharmony_ci 1694cc1dc7a3Sopenharmony_ci best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval); 1695cc1dc7a3Sopenharmony_ci 1696cc1dc7a3Sopenharmony_ci // If using N partitions doesn't improve much over using N-1 partitions then skip trying 1697cc1dc7a3Sopenharmony_ci // N+1. Error can dramatically improve if the data is correlated or non-correlated and 1698cc1dc7a3Sopenharmony_ci // aligns with a partitioning that suits that encoding, so for this inner loop check add 1699cc1dc7a3Sopenharmony_ci // a large error scale because the "other" trial could be a lot better. 1700cc1dc7a3Sopenharmony_ci float best_error = best_errorvals_for_pcount[partition_count - 1]; 1701cc1dc7a3Sopenharmony_ci float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f; 1702cc1dc7a3Sopenharmony_ci if (best_error > (best_error_in_prev * best_error_scale)) 1703cc1dc7a3Sopenharmony_ci { 1704cc1dc7a3Sopenharmony_ci trace_add_data("skip", "tune_partition_early_out_limit_factor"); 1705cc1dc7a3Sopenharmony_ci goto END_OF_TESTS; 1706cc1dc7a3Sopenharmony_ci } 1707cc1dc7a3Sopenharmony_ci 1708cc1dc7a3Sopenharmony_ci if (errorval < error_threshold) 1709cc1dc7a3Sopenharmony_ci { 1710cc1dc7a3Sopenharmony_ci trace_add_data("exit", "quality hit"); 1711cc1dc7a3Sopenharmony_ci goto END_OF_TESTS; 1712cc1dc7a3Sopenharmony_ci } 1713cc1dc7a3Sopenharmony_ci } 1714cc1dc7a3Sopenharmony_ci 1715cc1dc7a3Sopenharmony_ci // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1 1716cc1dc7a3Sopenharmony_ci float best_error = best_errorvals_for_pcount[partition_count - 1]; 1717cc1dc7a3Sopenharmony_ci float best_error_scale = exit_thresholds_for_pcount[partition_count - 1]; 1718cc1dc7a3Sopenharmony_ci if (best_error > (best_error_in_prev * best_error_scale)) 1719cc1dc7a3Sopenharmony_ci { 1720cc1dc7a3Sopenharmony_ci trace_add_data("skip", "tune_partition_early_out_limit_factor"); 1721cc1dc7a3Sopenharmony_ci goto END_OF_TESTS; 1722cc1dc7a3Sopenharmony_ci } 1723cc1dc7a3Sopenharmony_ci } 1724cc1dc7a3Sopenharmony_ci 1725cc1dc7a3Sopenharmony_ci trace_add_data("exit", "quality not hit"); 1726cc1dc7a3Sopenharmony_ci 1727cc1dc7a3Sopenharmony_ciEND_OF_TESTS: 1728cc1dc7a3Sopenharmony_ci // If we still have an error block then convert to something we can encode 1729cc1dc7a3Sopenharmony_ci // TODO: Do something more sensible here, such as average color block 1730cc1dc7a3Sopenharmony_ci if (scb.block_type == SYM_BTYPE_ERROR) 1731cc1dc7a3Sopenharmony_ci { 1732cc1dc7a3Sopenharmony_ci#if defined(ASTCENC_DIAGNOSTICS) 1733cc1dc7a3Sopenharmony_ci static bool printed_once = false; 1734cc1dc7a3Sopenharmony_ci if (!printed_once) 1735cc1dc7a3Sopenharmony_ci { 1736cc1dc7a3Sopenharmony_ci printed_once = true; 1737cc1dc7a3Sopenharmony_ci printf("WARN: At least one block failed to find a valid encoding.\n" 1738cc1dc7a3Sopenharmony_ci " Try increasing compression quality settings.\n\n"); 1739cc1dc7a3Sopenharmony_ci } 1740cc1dc7a3Sopenharmony_ci#endif 1741cc1dc7a3Sopenharmony_ci 1742cc1dc7a3Sopenharmony_ci scb.block_type = SYM_BTYPE_CONST_U16; 1743cc1dc7a3Sopenharmony_ci vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f; 1744cc1dc7a3Sopenharmony_ci vint4 color_u16 = float_to_int_rtn(color_f32); 1745cc1dc7a3Sopenharmony_ci store(color_u16, scb.constant_color); 1746cc1dc7a3Sopenharmony_ci } 1747cc1dc7a3Sopenharmony_ci 1748cc1dc7a3Sopenharmony_ci // Compress to a physical block 1749cc1dc7a3Sopenharmony_ci scb.privateProfile = ctx.config.privateProfile; 1750cc1dc7a3Sopenharmony_ci symbolic_to_physical(bsd, scb, pcb); 1751cc1dc7a3Sopenharmony_ci#if QUALITY_CONTROL 1752cc1dc7a3Sopenharmony_ci if (calQualityEnable) { 1753cc1dc7a3Sopenharmony_ci image_block decBlk = blk; 1754cc1dc7a3Sopenharmony_ci decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk); 1755cc1dc7a3Sopenharmony_ci vint4 colorSumDiff = vint4::zero(); 1756cc1dc7a3Sopenharmony_ci for (size_t ii = 0; ii < bsd.texel_count; ii++) { 1757cc1dc7a3Sopenharmony_ci vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f); 1758cc1dc7a3Sopenharmony_ci vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f); 1759cc1dc7a3Sopenharmony_ci vint4 colorDiff = colorRef - colorTest; 1760cc1dc7a3Sopenharmony_ci colorSumDiff += colorDiff * colorDiff; 1761cc1dc7a3Sopenharmony_ci } 1762cc1dc7a3Sopenharmony_ci *mseBlock[R_COM] = colorSumDiff.lane<0>(); 1763cc1dc7a3Sopenharmony_ci *mseBlock[G_COM] = colorSumDiff.lane<1>(); 1764cc1dc7a3Sopenharmony_ci *mseBlock[B_COM] = colorSumDiff.lane<2>(); 1765cc1dc7a3Sopenharmony_ci *mseBlock[A_COM] = colorSumDiff.lane<3>(); 1766cc1dc7a3Sopenharmony_ci } 1767cc1dc7a3Sopenharmony_ci#endif 1768cc1dc7a3Sopenharmony_ci} 1769cc1dc7a3Sopenharmony_ci 1770cc1dc7a3Sopenharmony_ci#endif 1771