1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0
2cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
3cc1dc7a3Sopenharmony_ci// Copyright 2011-2024 Arm Limited
4cc1dc7a3Sopenharmony_ci//
5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy
7cc1dc7a3Sopenharmony_ci// of the License at:
8cc1dc7a3Sopenharmony_ci//
9cc1dc7a3Sopenharmony_ci//     http://www.apache.org/licenses/LICENSE-2.0
10cc1dc7a3Sopenharmony_ci//
11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software
12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations
15cc1dc7a3Sopenharmony_ci// under the License.
16cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
17cc1dc7a3Sopenharmony_ci
18cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY)
19cc1dc7a3Sopenharmony_ci
20cc1dc7a3Sopenharmony_ci/**
21cc1dc7a3Sopenharmony_ci * @brief Functions to compress a symbolic block.
22cc1dc7a3Sopenharmony_ci */
23cc1dc7a3Sopenharmony_ci
24cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h"
25cc1dc7a3Sopenharmony_ci#include "astcenc_diagnostic_trace.h"
26cc1dc7a3Sopenharmony_ci
27cc1dc7a3Sopenharmony_ci#include <cassert>
#ifdef ASTC_CUSTOMIZED_ENABLE
// Global manager instance, defined only when ASTC_CUSTOMIZED_ENABLE is set at build time.
// NOTE(review): AstcCustomizedSoManager is declared outside this file; presumably it manages a
// customized shared library - confirm against its declaration.
AstcCustomizedSoManager g_astcCustomizedSoManager;
#endif
31cc1dc7a3Sopenharmony_ci
32cc1dc7a3Sopenharmony_ci/**
33cc1dc7a3Sopenharmony_ci * @brief Merge two planes of endpoints into a single vector.
34cc1dc7a3Sopenharmony_ci *
35cc1dc7a3Sopenharmony_ci * @param      ep_plane1          The endpoints for plane 1.
36cc1dc7a3Sopenharmony_ci * @param      ep_plane2          The endpoints for plane 2.
37cc1dc7a3Sopenharmony_ci * @param      component_plane2   The color component for plane 2.
38cc1dc7a3Sopenharmony_ci * @param[out] result             The merged output.
39cc1dc7a3Sopenharmony_ci */
40cc1dc7a3Sopenharmony_cistatic void merge_endpoints(
41cc1dc7a3Sopenharmony_ci	const endpoints& ep_plane1,
42cc1dc7a3Sopenharmony_ci	const endpoints& ep_plane2,
43cc1dc7a3Sopenharmony_ci	unsigned int component_plane2,
44cc1dc7a3Sopenharmony_ci	endpoints& result
45cc1dc7a3Sopenharmony_ci) {
46cc1dc7a3Sopenharmony_ci	unsigned int partition_count = ep_plane1.partition_count;
47cc1dc7a3Sopenharmony_ci	assert(partition_count == 1);
48cc1dc7a3Sopenharmony_ci
49cc1dc7a3Sopenharmony_ci	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
50cc1dc7a3Sopenharmony_ci
51cc1dc7a3Sopenharmony_ci	result.partition_count = partition_count;
52cc1dc7a3Sopenharmony_ci	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
53cc1dc7a3Sopenharmony_ci	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
54cc1dc7a3Sopenharmony_ci}
55cc1dc7a3Sopenharmony_ci
56cc1dc7a3Sopenharmony_ci/**
57cc1dc7a3Sopenharmony_ci * @brief Attempt to improve weights given a chosen configuration.
58cc1dc7a3Sopenharmony_ci *
59cc1dc7a3Sopenharmony_ci * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
60cc1dc7a3Sopenharmony_ci * partition and per plane) and attempt to improve image quality by moving each weight up by one or
61cc1dc7a3Sopenharmony_ci * down by one quantization step.
62cc1dc7a3Sopenharmony_ci *
63cc1dc7a3Sopenharmony_ci * This is a specialized function which only supports operating on undecimated weight grids,
64cc1dc7a3Sopenharmony_ci * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
65cc1dc7a3Sopenharmony_ci * is needed less often.
66cc1dc7a3Sopenharmony_ci *
67cc1dc7a3Sopenharmony_ci * @param      decode_mode   The decode mode (LDR, HDR).
68cc1dc7a3Sopenharmony_ci * @param      bsd           The block size information.
69cc1dc7a3Sopenharmony_ci * @param      blk           The image block color data to compress.
70cc1dc7a3Sopenharmony_ci * @param[out] scb           The symbolic compressed block output.
71cc1dc7a3Sopenharmony_ci */
72cc1dc7a3Sopenharmony_ci#if ASTCENC_NEON != 0
73cc1dc7a3Sopenharmony_cistatic bool realign_weights_undecimated(
74cc1dc7a3Sopenharmony_ci	astcenc_profile decode_mode,
75cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
76cc1dc7a3Sopenharmony_ci	const image_block& blk,
77cc1dc7a3Sopenharmony_ci	symbolic_compressed_block& scb
78cc1dc7a3Sopenharmony_ci) {
79cc1dc7a3Sopenharmony_ci	// Get the partition descriptor
80cc1dc7a3Sopenharmony_ci	unsigned int partition_count = scb.partition_count;
81cc1dc7a3Sopenharmony_ci	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
82cc1dc7a3Sopenharmony_ci
83cc1dc7a3Sopenharmony_ci	// Get the quantization table
84cc1dc7a3Sopenharmony_ci	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
85cc1dc7a3Sopenharmony_ci	unsigned int weight_quant_level = bm.quant_mode;
86cc1dc7a3Sopenharmony_ci	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
87cc1dc7a3Sopenharmony_ci
88cc1dc7a3Sopenharmony_ci	unsigned int max_plane = bm.is_dual_plane;
89cc1dc7a3Sopenharmony_ci	int plane2_component = scb.plane2_component;
90cc1dc7a3Sopenharmony_ci	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
91cc1dc7a3Sopenharmony_ci
92cc1dc7a3Sopenharmony_ci	// Decode the color endpoints
93cc1dc7a3Sopenharmony_ci	bool rgb_hdr;
94cc1dc7a3Sopenharmony_ci	bool alpha_hdr;
95cc1dc7a3Sopenharmony_ci	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
96cc1dc7a3Sopenharmony_ci	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
97cc1dc7a3Sopenharmony_ci	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
98cc1dc7a3Sopenharmony_ci	vfloat4 offset[BLOCK_MAX_PARTITIONS];
99cc1dc7a3Sopenharmony_ci
100cc1dc7a3Sopenharmony_ci	promise(partition_count > 0);
101cc1dc7a3Sopenharmony_ci
102cc1dc7a3Sopenharmony_ci	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
103cc1dc7a3Sopenharmony_ci	{
104cc1dc7a3Sopenharmony_ci		unpack_color_endpoints(decode_mode,
105cc1dc7a3Sopenharmony_ci		                       scb.color_formats[pa_idx],
106cc1dc7a3Sopenharmony_ci		                       scb.color_values[pa_idx],
107cc1dc7a3Sopenharmony_ci		                       rgb_hdr, alpha_hdr,
108cc1dc7a3Sopenharmony_ci		                       endpnt0[pa_idx],
109cc1dc7a3Sopenharmony_ci		                       endpnt1[pa_idx]);
110cc1dc7a3Sopenharmony_ci	}
111cc1dc7a3Sopenharmony_ci
112cc1dc7a3Sopenharmony_ci	uint8_t* dec_weights_uquant = scb.weights;
113cc1dc7a3Sopenharmony_ci	bool adjustments = false;
114cc1dc7a3Sopenharmony_ci
115cc1dc7a3Sopenharmony_ci	// For each plane and partition ...
116cc1dc7a3Sopenharmony_ci	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
117cc1dc7a3Sopenharmony_ci	{
118cc1dc7a3Sopenharmony_ci		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
119cc1dc7a3Sopenharmony_ci		{
120cc1dc7a3Sopenharmony_ci			// Compute the endpoint delta for all components in current plane
121cc1dc7a3Sopenharmony_ci			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
122cc1dc7a3Sopenharmony_ci			epd = select(epd, vint4::zero(), plane_mask);
123cc1dc7a3Sopenharmony_ci
124cc1dc7a3Sopenharmony_ci			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
125cc1dc7a3Sopenharmony_ci			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
126cc1dc7a3Sopenharmony_ci		}
127cc1dc7a3Sopenharmony_ci
128cc1dc7a3Sopenharmony_ci		// For each weight compute previous, current, and next errors
129cc1dc7a3Sopenharmony_ci		promise(bsd.texel_count > 0);
130cc1dc7a3Sopenharmony_ci
131cc1dc7a3Sopenharmony_ci		unsigned int texel = 0;
132cc1dc7a3Sopenharmony_ci		for (; texel + ASTCENC_SIMD_WIDTH <= bsd.texel_count; texel += ASTCENC_SIMD_WIDTH)
133cc1dc7a3Sopenharmony_ci		{
134cc1dc7a3Sopenharmony_ci			int uqw0 = dec_weights_uquant[texel];
135cc1dc7a3Sopenharmony_ci			int uqw1 = dec_weights_uquant[texel + 1];
136cc1dc7a3Sopenharmony_ci			int uqw2 = dec_weights_uquant[texel + 2];
137cc1dc7a3Sopenharmony_ci			int uqw3 = dec_weights_uquant[texel + 3];
138cc1dc7a3Sopenharmony_ci
139cc1dc7a3Sopenharmony_ci			vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3);
140cc1dc7a3Sopenharmony_ci			vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1],
141cc1dc7a3Sopenharmony_ci							qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]);
142cc1dc7a3Sopenharmony_ci
143cc1dc7a3Sopenharmony_ci			vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF);
144cc1dc7a3Sopenharmony_ci			vint4 uqw_down_vec = prev_and_next_vec & mask;
145cc1dc7a3Sopenharmony_ci			vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask;
146cc1dc7a3Sopenharmony_ci
147cc1dc7a3Sopenharmony_ci			vfloat4 weight_base_vec = int_to_float(uqw_vec);
148cc1dc7a3Sopenharmony_ci			vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec;
149cc1dc7a3Sopenharmony_ci			vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec;
150cc1dc7a3Sopenharmony_ci
151cc1dc7a3Sopenharmony_ci			unsigned int partition0 = pi.partition_of_texel[texel];
152cc1dc7a3Sopenharmony_ci			unsigned int partition1 = pi.partition_of_texel[texel + 1];
153cc1dc7a3Sopenharmony_ci			unsigned int partition2 = pi.partition_of_texel[texel + 2];
154cc1dc7a3Sopenharmony_ci			unsigned int partition3 = pi.partition_of_texel[texel + 3];
155cc1dc7a3Sopenharmony_ci
156cc1dc7a3Sopenharmony_ci			vfloat4 color_offset0 = offset[partition0];
157cc1dc7a3Sopenharmony_ci			vfloat4 color_offset1 = offset[partition1];
158cc1dc7a3Sopenharmony_ci			vfloat4 color_offset2 = offset[partition2];
159cc1dc7a3Sopenharmony_ci			vfloat4 color_offset3 = offset[partition3];
160cc1dc7a3Sopenharmony_ci
161cc1dc7a3Sopenharmony_ci			vfloat4 color_base0 = endpnt0f[partition0];
162cc1dc7a3Sopenharmony_ci			vfloat4 color_base1 = endpnt0f[partition1];
163cc1dc7a3Sopenharmony_ci			vfloat4 color_base2 = endpnt0f[partition2];
164cc1dc7a3Sopenharmony_ci			vfloat4 color_base3 = endpnt0f[partition3];
165cc1dc7a3Sopenharmony_ci
166cc1dc7a3Sopenharmony_ci			vfloat4 color0 = color_base0 + color_offset0 * weight_base_vec.lane<0>();
167cc1dc7a3Sopenharmony_ci			vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>();
168cc1dc7a3Sopenharmony_ci			vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>();
169cc1dc7a3Sopenharmony_ci			vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>();
170cc1dc7a3Sopenharmony_ci
171cc1dc7a3Sopenharmony_ci			vfloat4 orig_color0 = blk.texel(texel);
172cc1dc7a3Sopenharmony_ci			vfloat4 orig_color1 = blk.texel(texel + 1);
173cc1dc7a3Sopenharmony_ci			vfloat4 orig_color2 = blk.texel(texel + 2);
174cc1dc7a3Sopenharmony_ci			vfloat4 orig_color3 = blk.texel(texel + 3);
175cc1dc7a3Sopenharmony_ci
176cc1dc7a3Sopenharmony_ci			vfloat4 error_weight = blk.channel_weight;
177cc1dc7a3Sopenharmony_ci
178cc1dc7a3Sopenharmony_ci			vfloat4 color_diff0 = color0 - orig_color0;
179cc1dc7a3Sopenharmony_ci			vfloat4 color_diff1 = color1 - orig_color1;
180cc1dc7a3Sopenharmony_ci			vfloat4 color_diff2 = color2 - orig_color2;
181cc1dc7a3Sopenharmony_ci			vfloat4 color_diff3 = color3 - orig_color3;
182cc1dc7a3Sopenharmony_ci
183cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>();
184cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>();
185cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>();
186cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>();
187cc1dc7a3Sopenharmony_ci
188cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>();
189cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>();
190cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>();
191cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>();
192cc1dc7a3Sopenharmony_ci
193cc1dc7a3Sopenharmony_ci			float error_base0 = dot_s(color_diff0 * color_diff0, error_weight);
194cc1dc7a3Sopenharmony_ci			float error_base1 = dot_s(color_diff1 * color_diff1, error_weight);
195cc1dc7a3Sopenharmony_ci			float error_base2 = dot_s(color_diff2 * color_diff2, error_weight);
196cc1dc7a3Sopenharmony_ci			float error_base3 = dot_s(color_diff3 * color_diff3, error_weight);
197cc1dc7a3Sopenharmony_ci
198cc1dc7a3Sopenharmony_ci			float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight);
199cc1dc7a3Sopenharmony_ci			float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight);
200cc1dc7a3Sopenharmony_ci			float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight);
201cc1dc7a3Sopenharmony_ci			float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight);
202cc1dc7a3Sopenharmony_ci
203cc1dc7a3Sopenharmony_ci			float error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight);
204cc1dc7a3Sopenharmony_ci			float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight);
205cc1dc7a3Sopenharmony_ci			float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight);
206cc1dc7a3Sopenharmony_ci			float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight);
207cc1dc7a3Sopenharmony_ci
208cc1dc7a3Sopenharmony_ci			vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3);
209cc1dc7a3Sopenharmony_ci			vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3);
210cc1dc7a3Sopenharmony_ci			vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3);
211cc1dc7a3Sopenharmony_ci
212cc1dc7a3Sopenharmony_ci			vmask4 check_result_up = (error_up_vec < error_base_vec) &
213cc1dc7a3Sopenharmony_ci			        (error_up_vec < error_down_vec) & (uqw_vec < vint4(64));
214cc1dc7a3Sopenharmony_ci
215cc1dc7a3Sopenharmony_ci			vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero());
216cc1dc7a3Sopenharmony_ci			check_result_down = check_result_down & (~check_result_up);
217cc1dc7a3Sopenharmony_ci
218cc1dc7a3Sopenharmony_ci			if (popcount(check_result_up | check_result_down) != 0)
219cc1dc7a3Sopenharmony_ci			{
220cc1dc7a3Sopenharmony_ci				uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up);
221cc1dc7a3Sopenharmony_ci				uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down);
222cc1dc7a3Sopenharmony_ci
223cc1dc7a3Sopenharmony_ci				dec_weights_uquant[texel] = uqw_vec.lane<0>();
224cc1dc7a3Sopenharmony_ci				dec_weights_uquant[texel + 1] = uqw_vec.lane<1>();
225cc1dc7a3Sopenharmony_ci				dec_weights_uquant[texel + 2] = uqw_vec.lane<2>();    // channel 2
226cc1dc7a3Sopenharmony_ci				dec_weights_uquant[texel + 3] = uqw_vec.lane<3>();    // channel 3
227cc1dc7a3Sopenharmony_ci				adjustments = true;
228cc1dc7a3Sopenharmony_ci			}
229cc1dc7a3Sopenharmony_ci		};
230cc1dc7a3Sopenharmony_ci
231cc1dc7a3Sopenharmony_ci		for (; texel < bsd.texel_count; texel++)
232cc1dc7a3Sopenharmony_ci		{
233cc1dc7a3Sopenharmony_ci			int uqw = dec_weights_uquant[texel];
234cc1dc7a3Sopenharmony_ci
235cc1dc7a3Sopenharmony_ci			uint32_t prev_and_next = qat.prev_next_values[uqw];
236cc1dc7a3Sopenharmony_ci			int uqw_down = prev_and_next & 0xFF;
237cc1dc7a3Sopenharmony_ci			int uqw_up = (prev_and_next >> 8) & 0xFF;
238cc1dc7a3Sopenharmony_ci
239cc1dc7a3Sopenharmony_ci			// Interpolate the colors to create the diffs
240cc1dc7a3Sopenharmony_ci			float weight_base = static_cast<float>(uqw);
241cc1dc7a3Sopenharmony_ci			float weight_down = static_cast<float>(uqw_down - uqw);
242cc1dc7a3Sopenharmony_ci			float weight_up = static_cast<float>(uqw_up - uqw);
243cc1dc7a3Sopenharmony_ci
244cc1dc7a3Sopenharmony_ci			unsigned int partition = pi.partition_of_texel[texel];
245cc1dc7a3Sopenharmony_ci			vfloat4 color_offset = offset[partition];
246cc1dc7a3Sopenharmony_ci			vfloat4 color_base   = endpnt0f[partition];
247cc1dc7a3Sopenharmony_ci
248cc1dc7a3Sopenharmony_ci			vfloat4 color = color_base + color_offset * weight_base;
249cc1dc7a3Sopenharmony_ci			vfloat4 orig_color   = blk.texel(texel);
250cc1dc7a3Sopenharmony_ci			vfloat4 error_weight = blk.channel_weight;
251cc1dc7a3Sopenharmony_ci
252cc1dc7a3Sopenharmony_ci			vfloat4 color_diff      = color - orig_color;
253cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
254cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
255cc1dc7a3Sopenharmony_ci
256cc1dc7a3Sopenharmony_ci			float error_base = dot_s(color_diff      * color_diff,      error_weight);
257cc1dc7a3Sopenharmony_ci			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
258cc1dc7a3Sopenharmony_ci			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
259cc1dc7a3Sopenharmony_ci
260cc1dc7a3Sopenharmony_ci			// Check if the prev or next error is better, and if so use it
261cc1dc7a3Sopenharmony_ci			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
262cc1dc7a3Sopenharmony_ci			{
263cc1dc7a3Sopenharmony_ci				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
264cc1dc7a3Sopenharmony_ci				adjustments = true;
265cc1dc7a3Sopenharmony_ci			}
266cc1dc7a3Sopenharmony_ci			else if ((error_down < error_base) && (uqw > 0))
267cc1dc7a3Sopenharmony_ci			{
268cc1dc7a3Sopenharmony_ci				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
269cc1dc7a3Sopenharmony_ci				adjustments = true;
270cc1dc7a3Sopenharmony_ci			}
271cc1dc7a3Sopenharmony_ci		}
272cc1dc7a3Sopenharmony_ci
273cc1dc7a3Sopenharmony_ci		// Prepare iteration for plane 2
274cc1dc7a3Sopenharmony_ci		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
275cc1dc7a3Sopenharmony_ci		plane_mask = ~plane_mask;
276cc1dc7a3Sopenharmony_ci	}
277cc1dc7a3Sopenharmony_ci	return adjustments;
278cc1dc7a3Sopenharmony_ci}
279cc1dc7a3Sopenharmony_ci#else
280cc1dc7a3Sopenharmony_cistatic bool realign_weights_undecimated(
281cc1dc7a3Sopenharmony_ci	astcenc_profile decode_mode,
282cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
283cc1dc7a3Sopenharmony_ci	const image_block& blk,
284cc1dc7a3Sopenharmony_ci	symbolic_compressed_block& scb
285cc1dc7a3Sopenharmony_ci) {
286cc1dc7a3Sopenharmony_ci	// Get the partition descriptor
287cc1dc7a3Sopenharmony_ci	unsigned int partition_count = scb.partition_count;
288cc1dc7a3Sopenharmony_ci	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
289cc1dc7a3Sopenharmony_ci
290cc1dc7a3Sopenharmony_ci	// Get the quantization table
291cc1dc7a3Sopenharmony_ci	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
292cc1dc7a3Sopenharmony_ci	unsigned int weight_quant_level = bm.quant_mode;
293cc1dc7a3Sopenharmony_ci	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
294cc1dc7a3Sopenharmony_ci
295cc1dc7a3Sopenharmony_ci	unsigned int max_plane = bm.is_dual_plane;
296cc1dc7a3Sopenharmony_ci	int plane2_component = scb.plane2_component;
297cc1dc7a3Sopenharmony_ci	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
298cc1dc7a3Sopenharmony_ci
299cc1dc7a3Sopenharmony_ci	// Decode the color endpoints
300cc1dc7a3Sopenharmony_ci	bool rgb_hdr;
301cc1dc7a3Sopenharmony_ci	bool alpha_hdr;
302cc1dc7a3Sopenharmony_ci	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
303cc1dc7a3Sopenharmony_ci	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
304cc1dc7a3Sopenharmony_ci	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
305cc1dc7a3Sopenharmony_ci	vfloat4 offset[BLOCK_MAX_PARTITIONS];
306cc1dc7a3Sopenharmony_ci
307cc1dc7a3Sopenharmony_ci	promise(partition_count > 0);
308cc1dc7a3Sopenharmony_ci
309cc1dc7a3Sopenharmony_ci	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
310cc1dc7a3Sopenharmony_ci	{
311cc1dc7a3Sopenharmony_ci		unpack_color_endpoints(decode_mode,
312cc1dc7a3Sopenharmony_ci		                       scb.color_formats[pa_idx],
313cc1dc7a3Sopenharmony_ci		                       scb.color_values[pa_idx],
314cc1dc7a3Sopenharmony_ci		                       rgb_hdr, alpha_hdr,
315cc1dc7a3Sopenharmony_ci		                       endpnt0[pa_idx],
316cc1dc7a3Sopenharmony_ci		                       endpnt1[pa_idx]);
317cc1dc7a3Sopenharmony_ci	}
318cc1dc7a3Sopenharmony_ci
319cc1dc7a3Sopenharmony_ci	uint8_t* dec_weights_uquant = scb.weights;
320cc1dc7a3Sopenharmony_ci	bool adjustments = false;
321cc1dc7a3Sopenharmony_ci
322cc1dc7a3Sopenharmony_ci	// For each plane and partition ...
323cc1dc7a3Sopenharmony_ci	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
324cc1dc7a3Sopenharmony_ci	{
325cc1dc7a3Sopenharmony_ci		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
326cc1dc7a3Sopenharmony_ci		{
327cc1dc7a3Sopenharmony_ci			// Compute the endpoint delta for all components in current plane
328cc1dc7a3Sopenharmony_ci			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
329cc1dc7a3Sopenharmony_ci			epd = select(epd, vint4::zero(), plane_mask);
330cc1dc7a3Sopenharmony_ci
331cc1dc7a3Sopenharmony_ci			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
332cc1dc7a3Sopenharmony_ci			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
333cc1dc7a3Sopenharmony_ci		}
334cc1dc7a3Sopenharmony_ci
335cc1dc7a3Sopenharmony_ci		// For each weight compute previous, current, and next errors
336cc1dc7a3Sopenharmony_ci		promise(bsd.texel_count > 0);
337cc1dc7a3Sopenharmony_ci		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
338cc1dc7a3Sopenharmony_ci		{
339cc1dc7a3Sopenharmony_ci			int uqw = dec_weights_uquant[texel];
340cc1dc7a3Sopenharmony_ci
341cc1dc7a3Sopenharmony_ci			uint32_t prev_and_next = qat.prev_next_values[uqw];
342cc1dc7a3Sopenharmony_ci			int uqw_down = prev_and_next & 0xFF;
343cc1dc7a3Sopenharmony_ci			int uqw_up = (prev_and_next >> 8) & 0xFF;
344cc1dc7a3Sopenharmony_ci
345cc1dc7a3Sopenharmony_ci			// Interpolate the colors to create the diffs
346cc1dc7a3Sopenharmony_ci			float weight_base = static_cast<float>(uqw);
347cc1dc7a3Sopenharmony_ci			float weight_down = static_cast<float>(uqw_down - uqw);
348cc1dc7a3Sopenharmony_ci			float weight_up = static_cast<float>(uqw_up - uqw);
349cc1dc7a3Sopenharmony_ci
350cc1dc7a3Sopenharmony_ci			unsigned int partition = pi.partition_of_texel[texel];
351cc1dc7a3Sopenharmony_ci			vfloat4 color_offset = offset[partition];
352cc1dc7a3Sopenharmony_ci			vfloat4 color_base   = endpnt0f[partition];
353cc1dc7a3Sopenharmony_ci
354cc1dc7a3Sopenharmony_ci			vfloat4 color = color_base + color_offset * weight_base;
355cc1dc7a3Sopenharmony_ci			vfloat4 orig_color   = blk.texel(texel);
356cc1dc7a3Sopenharmony_ci			vfloat4 error_weight = blk.channel_weight;
357cc1dc7a3Sopenharmony_ci
358cc1dc7a3Sopenharmony_ci			vfloat4 color_diff      = color - orig_color;
359cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
360cc1dc7a3Sopenharmony_ci			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
361cc1dc7a3Sopenharmony_ci
362cc1dc7a3Sopenharmony_ci			float error_base = dot_s(color_diff      * color_diff,      error_weight);
363cc1dc7a3Sopenharmony_ci			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
364cc1dc7a3Sopenharmony_ci			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
365cc1dc7a3Sopenharmony_ci
366cc1dc7a3Sopenharmony_ci			// Check if the prev or next error is better, and if so use it
367cc1dc7a3Sopenharmony_ci			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
368cc1dc7a3Sopenharmony_ci			{
369cc1dc7a3Sopenharmony_ci				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
370cc1dc7a3Sopenharmony_ci				adjustments = true;
371cc1dc7a3Sopenharmony_ci			}
372cc1dc7a3Sopenharmony_ci			else if ((error_down < error_base) && (uqw > 0))
373cc1dc7a3Sopenharmony_ci			{
374cc1dc7a3Sopenharmony_ci				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
375cc1dc7a3Sopenharmony_ci				adjustments = true;
376cc1dc7a3Sopenharmony_ci			}
377cc1dc7a3Sopenharmony_ci		}
378cc1dc7a3Sopenharmony_ci
379cc1dc7a3Sopenharmony_ci		// Prepare iteration for plane 2
380cc1dc7a3Sopenharmony_ci		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
381cc1dc7a3Sopenharmony_ci		plane_mask = ~plane_mask;
382cc1dc7a3Sopenharmony_ci	}
383cc1dc7a3Sopenharmony_ci
384cc1dc7a3Sopenharmony_ci	return adjustments;
385cc1dc7a3Sopenharmony_ci}
386cc1dc7a3Sopenharmony_ci#endif
387cc1dc7a3Sopenharmony_ci
388cc1dc7a3Sopenharmony_ci/**
389cc1dc7a3Sopenharmony_ci * @brief Attempt to improve weights given a chosen configuration.
390cc1dc7a3Sopenharmony_ci *
391cc1dc7a3Sopenharmony_ci * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
392cc1dc7a3Sopenharmony_ci * partition and per plane) and attempt to improve image quality by moving each weight up by one or
393cc1dc7a3Sopenharmony_ci * down by one quantization step.
394cc1dc7a3Sopenharmony_ci *
395cc1dc7a3Sopenharmony_ci * @param      decode_mode   The decode mode (LDR, HDR).
396cc1dc7a3Sopenharmony_ci * @param      bsd           The block size information.
397cc1dc7a3Sopenharmony_ci * @param      blk           The image block color data to compress.
398cc1dc7a3Sopenharmony_ci * @param[out] scb           The symbolic compressed block output.
399cc1dc7a3Sopenharmony_ci */
400cc1dc7a3Sopenharmony_cistatic bool realign_weights_decimated(
401cc1dc7a3Sopenharmony_ci	astcenc_profile decode_mode,
402cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
403cc1dc7a3Sopenharmony_ci	const image_block& blk,
404cc1dc7a3Sopenharmony_ci	symbolic_compressed_block& scb
405cc1dc7a3Sopenharmony_ci) {
406cc1dc7a3Sopenharmony_ci	// Get the partition descriptor
407cc1dc7a3Sopenharmony_ci	unsigned int partition_count = scb.partition_count;
408cc1dc7a3Sopenharmony_ci	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
409cc1dc7a3Sopenharmony_ci
410cc1dc7a3Sopenharmony_ci	// Get the quantization table
411cc1dc7a3Sopenharmony_ci	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
412cc1dc7a3Sopenharmony_ci	unsigned int weight_quant_level = bm.quant_mode;
413cc1dc7a3Sopenharmony_ci	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
414cc1dc7a3Sopenharmony_ci
415cc1dc7a3Sopenharmony_ci	// Get the decimation table
416cc1dc7a3Sopenharmony_ci	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
417cc1dc7a3Sopenharmony_ci	unsigned int weight_count = di.weight_count;
418cc1dc7a3Sopenharmony_ci	assert(weight_count != bsd.texel_count);
419cc1dc7a3Sopenharmony_ci
420cc1dc7a3Sopenharmony_ci	unsigned int max_plane = bm.is_dual_plane;
421cc1dc7a3Sopenharmony_ci	int plane2_component = scb.plane2_component;
422cc1dc7a3Sopenharmony_ci	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
423cc1dc7a3Sopenharmony_ci
424cc1dc7a3Sopenharmony_ci	// Decode the color endpoints
425cc1dc7a3Sopenharmony_ci	bool rgb_hdr;
426cc1dc7a3Sopenharmony_ci	bool alpha_hdr;
427cc1dc7a3Sopenharmony_ci	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
428cc1dc7a3Sopenharmony_ci	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
429cc1dc7a3Sopenharmony_ci	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
430cc1dc7a3Sopenharmony_ci	vfloat4 offset[BLOCK_MAX_PARTITIONS];
431cc1dc7a3Sopenharmony_ci
432cc1dc7a3Sopenharmony_ci	promise(partition_count > 0);
433cc1dc7a3Sopenharmony_ci	promise(weight_count > 0);
434cc1dc7a3Sopenharmony_ci
435cc1dc7a3Sopenharmony_ci	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
436cc1dc7a3Sopenharmony_ci	{
437cc1dc7a3Sopenharmony_ci		unpack_color_endpoints(decode_mode,
438cc1dc7a3Sopenharmony_ci		                       scb.color_formats[pa_idx],
439cc1dc7a3Sopenharmony_ci		                       scb.color_values[pa_idx],
440cc1dc7a3Sopenharmony_ci		                       rgb_hdr, alpha_hdr,
441cc1dc7a3Sopenharmony_ci		                       endpnt0[pa_idx],
442cc1dc7a3Sopenharmony_ci		                       endpnt1[pa_idx]);
443cc1dc7a3Sopenharmony_ci	}
444cc1dc7a3Sopenharmony_ci
445cc1dc7a3Sopenharmony_ci	uint8_t* dec_weights_uquant = scb.weights;
446cc1dc7a3Sopenharmony_ci	bool adjustments = false;
447cc1dc7a3Sopenharmony_ci
448cc1dc7a3Sopenharmony_ci	// For each plane and partition ...
449cc1dc7a3Sopenharmony_ci	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
450cc1dc7a3Sopenharmony_ci	{
451cc1dc7a3Sopenharmony_ci		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
452cc1dc7a3Sopenharmony_ci		{
453cc1dc7a3Sopenharmony_ci			// Compute the endpoint delta for all components in current plane
454cc1dc7a3Sopenharmony_ci			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
455cc1dc7a3Sopenharmony_ci			epd = select(epd, vint4::zero(), plane_mask);
456cc1dc7a3Sopenharmony_ci
457cc1dc7a3Sopenharmony_ci			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
458cc1dc7a3Sopenharmony_ci			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
459cc1dc7a3Sopenharmony_ci		}
460cc1dc7a3Sopenharmony_ci
461cc1dc7a3Sopenharmony_ci		// Create an unquantized weight grid for this decimation level
462cc1dc7a3Sopenharmony_ci		ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
463cc1dc7a3Sopenharmony_ci		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
464cc1dc7a3Sopenharmony_ci		{
465cc1dc7a3Sopenharmony_ci			vint unquant_value(dec_weights_uquant + we_idx);
466cc1dc7a3Sopenharmony_ci			vfloat unquant_valuef = int_to_float(unquant_value);
467cc1dc7a3Sopenharmony_ci			storea(unquant_valuef, uq_weightsf + we_idx);
468cc1dc7a3Sopenharmony_ci		}
469cc1dc7a3Sopenharmony_ci
470cc1dc7a3Sopenharmony_ci		// For each weight compute previous, current, and next errors
471cc1dc7a3Sopenharmony_ci		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
472cc1dc7a3Sopenharmony_ci		{
473cc1dc7a3Sopenharmony_ci			int uqw = dec_weights_uquant[we_idx];
474cc1dc7a3Sopenharmony_ci			uint32_t prev_and_next = qat.prev_next_values[uqw];
475cc1dc7a3Sopenharmony_ci
476cc1dc7a3Sopenharmony_ci			float uqw_base = uq_weightsf[we_idx];
477cc1dc7a3Sopenharmony_ci			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
478cc1dc7a3Sopenharmony_ci			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
479cc1dc7a3Sopenharmony_ci
480cc1dc7a3Sopenharmony_ci			float uqw_diff_down = uqw_down - uqw_base;
481cc1dc7a3Sopenharmony_ci			float uqw_diff_up = uqw_up - uqw_base;
482cc1dc7a3Sopenharmony_ci
483cc1dc7a3Sopenharmony_ci			vfloat4 error_basev = vfloat4::zero();
484cc1dc7a3Sopenharmony_ci			vfloat4 error_downv = vfloat4::zero();
485cc1dc7a3Sopenharmony_ci			vfloat4 error_upv = vfloat4::zero();
486cc1dc7a3Sopenharmony_ci
487cc1dc7a3Sopenharmony_ci			// Interpolate the colors to create the diffs
488cc1dc7a3Sopenharmony_ci			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
489cc1dc7a3Sopenharmony_ci			promise(texels_to_evaluate > 0);
490cc1dc7a3Sopenharmony_ci			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
491cc1dc7a3Sopenharmony_ci			{
492cc1dc7a3Sopenharmony_ci				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
493cc1dc7a3Sopenharmony_ci
494cc1dc7a3Sopenharmony_ci				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
495cc1dc7a3Sopenharmony_ci
496cc1dc7a3Sopenharmony_ci				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
497cc1dc7a3Sopenharmony_ci				                   + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
498cc1dc7a3Sopenharmony_ci					              + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
499cc1dc7a3Sopenharmony_ci				                   + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
500cc1dc7a3Sopenharmony_ci
501cc1dc7a3Sopenharmony_ci				// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
502cc1dc7a3Sopenharmony_ci				// float weight = astc::flt_rd(weight_base + 0.5f);
503cc1dc7a3Sopenharmony_ci				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
504cc1dc7a3Sopenharmony_ci				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
505cc1dc7a3Sopenharmony_ci				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
506cc1dc7a3Sopenharmony_ci				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
507cc1dc7a3Sopenharmony_ci
508cc1dc7a3Sopenharmony_ci				unsigned int partition = pi.partition_of_texel[texel];
509cc1dc7a3Sopenharmony_ci				vfloat4 color_offset = offset[partition];
510cc1dc7a3Sopenharmony_ci				vfloat4 color_base   = endpnt0f[partition];
511cc1dc7a3Sopenharmony_ci
512cc1dc7a3Sopenharmony_ci				vfloat4 color = color_base + color_offset * weight_base;
513cc1dc7a3Sopenharmony_ci				vfloat4 orig_color = blk.texel(texel);
514cc1dc7a3Sopenharmony_ci
515cc1dc7a3Sopenharmony_ci				vfloat4 color_diff      = color - orig_color;
516cc1dc7a3Sopenharmony_ci				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
517cc1dc7a3Sopenharmony_ci				vfloat4 color_up_diff   = color_diff + color_offset * weight_up;
518cc1dc7a3Sopenharmony_ci
519cc1dc7a3Sopenharmony_ci				error_basev += color_diff * color_diff;
520cc1dc7a3Sopenharmony_ci				error_downv += color_down_diff * color_down_diff;
521cc1dc7a3Sopenharmony_ci				error_upv   += color_up_diff * color_up_diff;
522cc1dc7a3Sopenharmony_ci			}
523cc1dc7a3Sopenharmony_ci
524cc1dc7a3Sopenharmony_ci			vfloat4 error_weight = blk.channel_weight;
525cc1dc7a3Sopenharmony_ci			float error_base = hadd_s(error_basev * error_weight);
526cc1dc7a3Sopenharmony_ci			float error_down = hadd_s(error_downv * error_weight);
527cc1dc7a3Sopenharmony_ci			float error_up   = hadd_s(error_upv   * error_weight);
528cc1dc7a3Sopenharmony_ci
529cc1dc7a3Sopenharmony_ci			// Check if the prev or next error is better, and if so use it
530cc1dc7a3Sopenharmony_ci			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
531cc1dc7a3Sopenharmony_ci			{
532cc1dc7a3Sopenharmony_ci				uq_weightsf[we_idx] = uqw_up;
533cc1dc7a3Sopenharmony_ci				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
534cc1dc7a3Sopenharmony_ci				adjustments = true;
535cc1dc7a3Sopenharmony_ci			}
536cc1dc7a3Sopenharmony_ci			else if ((error_down < error_base) && (uqw > 0))
537cc1dc7a3Sopenharmony_ci			{
538cc1dc7a3Sopenharmony_ci				uq_weightsf[we_idx] = uqw_down;
539cc1dc7a3Sopenharmony_ci				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
540cc1dc7a3Sopenharmony_ci				adjustments = true;
541cc1dc7a3Sopenharmony_ci			}
542cc1dc7a3Sopenharmony_ci		}
543cc1dc7a3Sopenharmony_ci
544cc1dc7a3Sopenharmony_ci		// Prepare iteration for plane 2
545cc1dc7a3Sopenharmony_ci		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
546cc1dc7a3Sopenharmony_ci		plane_mask = ~plane_mask;
547cc1dc7a3Sopenharmony_ci	}
548cc1dc7a3Sopenharmony_ci
549cc1dc7a3Sopenharmony_ci	return adjustments;
550cc1dc7a3Sopenharmony_ci}
551cc1dc7a3Sopenharmony_ci
552cc1dc7a3Sopenharmony_ci/**
553cc1dc7a3Sopenharmony_ci * @brief Compress a block using a chosen partitioning and 1 plane of weights.
554cc1dc7a3Sopenharmony_ci *
 * Every 1-plane block mode in range is scored by its weight quantization error, a shortlist of
 * (block mode, endpoint format) candidates is built from those scores, and each candidate is then
 * packed and iteratively refined. A candidate replaces @c scb only when its measured error is
 * lower than the error already held in @c scb (or unconditionally for @c HIGH_SPEED_PROFILE).
 *
 * @param      privateProfile            The quality profile; forwarded to the angular endpoint,
 *                                       endpoint format, and color packing helpers, and compared
 *                                       against @c HIGH_SPEED_PROFILE to skip error evaluation.
555cc1dc7a3Sopenharmony_ci * @param      config                    The compressor configuration.
556cc1dc7a3Sopenharmony_ci * @param      bsd                       The block size information.
557cc1dc7a3Sopenharmony_ci * @param      blk                       The image block color data to compress.
558cc1dc7a3Sopenharmony_ci * @param      only_always               True if we only use "always" percentile block modes.
559cc1dc7a3Sopenharmony_ci * @param      tune_errorval_threshold   The error value threshold.
560cc1dc7a3Sopenharmony_ci * @param      partition_count           The partition count.
561cc1dc7a3Sopenharmony_ci * @param      partition_index           The partition index if @c partition_count is 2-4.
562cc1dc7a3Sopenharmony_ci * @param[in,out] scb                    The symbolic compressed block; @c scb.errorval is read on entry as the error to beat, and the block is overwritten by any better candidate.
563cc1dc7a3Sopenharmony_ci * @param[out] tmpbuf                    Scratch buffers used for ideal and quantized weights, weight ranges, and per-mode bit counts and errors.
 * @param      quant_limit               Upper bound on the weight quantization level; the function
 *                                       additionally never goes above @c QUANT_32.
 *
 * @return The lowest error value measured for any candidate in this call (before or after weight
 *         realignment), or @c ERROR_CALC_DEFAULT if no error was measured.
564cc1dc7a3Sopenharmony_ci */
565cc1dc7a3Sopenharmony_cistatic float compress_symbolic_block_for_partition_1plane(
566cc1dc7a3Sopenharmony_ci	QualityProfile privateProfile,
567cc1dc7a3Sopenharmony_ci	const astcenc_config& config,
568cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
569cc1dc7a3Sopenharmony_ci	const image_block& blk,
570cc1dc7a3Sopenharmony_ci	bool only_always,
571cc1dc7a3Sopenharmony_ci	float tune_errorval_threshold,
572cc1dc7a3Sopenharmony_ci	unsigned int partition_count,
573cc1dc7a3Sopenharmony_ci	unsigned int partition_index,
574cc1dc7a3Sopenharmony_ci	symbolic_compressed_block& scb,
575cc1dc7a3Sopenharmony_ci	compression_working_buffers& tmpbuf,
576cc1dc7a3Sopenharmony_ci	int quant_limit
577cc1dc7a3Sopenharmony_ci) {
578cc1dc7a3Sopenharmony_ci	promise(partition_count > 0);
579cc1dc7a3Sopenharmony_ci	promise(config.tune_candidate_limit > 0);
580cc1dc7a3Sopenharmony_ci	promise(config.tune_refinement_limit > 0);
581cc1dc7a3Sopenharmony_ci
	// Weight quantization is capped at QUANT_32, and may be capped lower by the caller
582cc1dc7a3Sopenharmony_ci	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
583cc1dc7a3Sopenharmony_ci
	// Select the error function: the 1-partition variant is only used for single-partition blocks
	// when ASTCENC_FLG_MAP_RGBM is not set
584cc1dc7a3Sopenharmony_ci	auto compute_difference = &compute_symbolic_block_difference_1plane;
585cc1dc7a3Sopenharmony_ci	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
586cc1dc7a3Sopenharmony_ci	{
587cc1dc7a3Sopenharmony_ci		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
588cc1dc7a3Sopenharmony_ci	}
589cc1dc7a3Sopenharmony_ci
590cc1dc7a3Sopenharmony_ci	const auto& pi = bsd.get_partition_info(partition_count, partition_index);
591cc1dc7a3Sopenharmony_ci
592cc1dc7a3Sopenharmony_ci	// Compute ideal weights and endpoint colors, with no quantization or decimation
593cc1dc7a3Sopenharmony_ci	endpoints_and_weights& ei = tmpbuf.ei1;
594cc1dc7a3Sopenharmony_ci	compute_ideal_colors_and_weights_1plane(blk, pi, ei);
595cc1dc7a3Sopenharmony_ci
596cc1dc7a3Sopenharmony_ci	// Compute ideal weights and endpoint colors for every decimation
597cc1dc7a3Sopenharmony_ci	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
598cc1dc7a3Sopenharmony_ci	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
599cc1dc7a3Sopenharmony_ci
600cc1dc7a3Sopenharmony_ci	// For each decimation mode, compute an ideal set of weights with no quantization
	// Results are stored with a stride of BLOCK_MAX_WEIGHTS per decimation mode index
601cc1dc7a3Sopenharmony_ci	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
602cc1dc7a3Sopenharmony_ci	                                                : bsd.decimation_mode_count_selected;
603cc1dc7a3Sopenharmony_ci	promise(max_decimation_modes > 0);
604cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < max_decimation_modes; i++)
605cc1dc7a3Sopenharmony_ci	{
		// Skip decimation modes that is_ref_1plane() rejects for this quant limit; their slot in
		// dec_weights_ideal is left unwritten
606cc1dc7a3Sopenharmony_ci		const auto& dm = bsd.get_decimation_mode(i);
607cc1dc7a3Sopenharmony_ci		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
608cc1dc7a3Sopenharmony_ci		{
609cc1dc7a3Sopenharmony_ci			continue;
610cc1dc7a3Sopenharmony_ci		}
611cc1dc7a3Sopenharmony_ci
612cc1dc7a3Sopenharmony_ci		const auto& di = bsd.get_decimation_info(i);
613cc1dc7a3Sopenharmony_ci
614cc1dc7a3Sopenharmony_ci		compute_ideal_weights_for_decimation(
615cc1dc7a3Sopenharmony_ci		    ei,
616cc1dc7a3Sopenharmony_ci		    di,
617cc1dc7a3Sopenharmony_ci		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
618cc1dc7a3Sopenharmony_ci	}
619cc1dc7a3Sopenharmony_ci
620cc1dc7a3Sopenharmony_ci	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
621cc1dc7a3Sopenharmony_ci	// weight pair, compute the smallest weight that will result in a color value greater than 1
	// Per channel, ep is the weight w that solves endpt0 + w * (endpt1 - endpt0) == 1. Only values
	// above 0.5 are considered, and the initial 10.0 is what remains if no channel qualifies.
622cc1dc7a3Sopenharmony_ci	vfloat4 min_ep(10.0f);
623cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < partition_count; i++)
624cc1dc7a3Sopenharmony_ci	{
625cc1dc7a3Sopenharmony_ci		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
626cc1dc7a3Sopenharmony_ci
627cc1dc7a3Sopenharmony_ci		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
628cc1dc7a3Sopenharmony_ci		min_ep = select(min_ep, ep, use_ep);
629cc1dc7a3Sopenharmony_ci	}
630cc1dc7a3Sopenharmony_ci
	// Reduce across channels to a single scalar cutoff
631cc1dc7a3Sopenharmony_ci	float min_wt_cutoff = hmin_s(min_ep);
632cc1dc7a3Sopenharmony_ci
633cc1dc7a3Sopenharmony_ci	// For each mode, use the angular method to compute a shift
634cc1dc7a3Sopenharmony_ci	compute_angular_endpoints_1plane(
635cc1dc7a3Sopenharmony_ci	    privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
636cc1dc7a3Sopenharmony_ci
	// Per-block-mode scratch arrays, all indexed by the packed block mode index
637cc1dc7a3Sopenharmony_ci	float* weight_low_value = tmpbuf.weight_low_value1;
638cc1dc7a3Sopenharmony_ci	float* weight_high_value = tmpbuf.weight_high_value1;
639cc1dc7a3Sopenharmony_ci	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
640cc1dc7a3Sopenharmony_ci	float* qwt_errors = tmpbuf.qwt_errors;
641cc1dc7a3Sopenharmony_ci
642cc1dc7a3Sopenharmony_ci	// For each mode (which specifies a decimation and a quantization):
643cc1dc7a3Sopenharmony_ci	//     * Compute number of bits needed for the quantized weights
644cc1dc7a3Sopenharmony_ci	//     * Generate an optimized set of quantized weights
645cc1dc7a3Sopenharmony_ci	//     * Compute quantization errors for the mode
646cc1dc7a3Sopenharmony_ci
647cc1dc7a3Sopenharmony_ci
	// Bit budget indexed by (partition_count - 1); a mode's weight bits are subtracted from this
	// to give the value stored in qwt_bitcounts (presumably the bits left for color endpoints)
648cc1dc7a3Sopenharmony_ci	static const int8_t free_bits_for_partition_count[4] {
649cc1dc7a3Sopenharmony_ci		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
650cc1dc7a3Sopenharmony_ci	};
651cc1dc7a3Sopenharmony_ci
652cc1dc7a3Sopenharmony_ci	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
653cc1dc7a3Sopenharmony_ci	                                           : bsd.block_mode_count_1plane_selected;
654cc1dc7a3Sopenharmony_ci	promise(max_block_modes > 0);
655cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < max_block_modes; i++)
656cc1dc7a3Sopenharmony_ci	{
657cc1dc7a3Sopenharmony_ci		const block_mode& bm = bsd.block_modes[i];
658cc1dc7a3Sopenharmony_ci
		// Modes above the quant limit get a sentinel error of 1e38f (presumably so that candidate
		// selection never picks them); qwt_bitcounts[i] is left untouched for skipped modes
659cc1dc7a3Sopenharmony_ci		if (bm.quant_mode > max_weight_quant)
660cc1dc7a3Sopenharmony_ci		{
661cc1dc7a3Sopenharmony_ci			qwt_errors[i] = 1e38f;
662cc1dc7a3Sopenharmony_ci			continue;
663cc1dc7a3Sopenharmony_ci		}
664cc1dc7a3Sopenharmony_ci
665cc1dc7a3Sopenharmony_ci		assert(!bm.is_dual_plane);
		// Modes whose weights consume the whole bit budget get the same sentinel error
666cc1dc7a3Sopenharmony_ci		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
667cc1dc7a3Sopenharmony_ci		if (bitcount <= 0)
668cc1dc7a3Sopenharmony_ci		{
669cc1dc7a3Sopenharmony_ci			qwt_errors[i] = 1e38f;
670cc1dc7a3Sopenharmony_ci			continue;
671cc1dc7a3Sopenharmony_ci		}
672cc1dc7a3Sopenharmony_ci
		// Clamp the high end of the weight range to 1.0 if it exceeds the cutoff by more than 2%
673cc1dc7a3Sopenharmony_ci		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
674cc1dc7a3Sopenharmony_ci		{
675cc1dc7a3Sopenharmony_ci			weight_high_value[i] = 1.0f;
676cc1dc7a3Sopenharmony_ci		}
677cc1dc7a3Sopenharmony_ci
678cc1dc7a3Sopenharmony_ci		int decimation_mode = bm.decimation_mode;
679cc1dc7a3Sopenharmony_ci		const auto& di = bsd.get_decimation_info(decimation_mode);
680cc1dc7a3Sopenharmony_ci
681cc1dc7a3Sopenharmony_ci		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
682cc1dc7a3Sopenharmony_ci
		// Local float weight buffer, filled by the quantization call below and then consumed by
		// the error computation; it is not kept beyond this loop iteration
683cc1dc7a3Sopenharmony_ci		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
684cc1dc7a3Sopenharmony_ci
685cc1dc7a3Sopenharmony_ci		// Generate the optimized set of weights for the weight mode
		// Ideal weights are looked up by decimation mode; uint8 results are stored by block mode
686cc1dc7a3Sopenharmony_ci		compute_quantized_weights_for_decimation(
687cc1dc7a3Sopenharmony_ci		    di,
688cc1dc7a3Sopenharmony_ci		    weight_low_value[i], weight_high_value[i],
689cc1dc7a3Sopenharmony_ci		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
690cc1dc7a3Sopenharmony_ci		    dec_weights_uquantf,
691cc1dc7a3Sopenharmony_ci		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
692cc1dc7a3Sopenharmony_ci		    bm.get_weight_quant_mode());
693cc1dc7a3Sopenharmony_ci
694cc1dc7a3Sopenharmony_ci		// Compute weight quantization errors for the block mode
695cc1dc7a3Sopenharmony_ci		qwt_errors[i] = compute_error_of_weight_set_1plane(
696cc1dc7a3Sopenharmony_ci		    ei,
697cc1dc7a3Sopenharmony_ci		    di,
698cc1dc7a3Sopenharmony_ci		    dec_weights_uquantf);
699cc1dc7a3Sopenharmony_ci	}
700cc1dc7a3Sopenharmony_ci
701cc1dc7a3Sopenharmony_ci	// Decide the optimal combination of color endpoint encodings and weight encodings
	// The arrays below are filled per candidate by compute_ideal_endpoint_formats()
702cc1dc7a3Sopenharmony_ci	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
703cc1dc7a3Sopenharmony_ci	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
704cc1dc7a3Sopenharmony_ci
705cc1dc7a3Sopenharmony_ci	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
706cc1dc7a3Sopenharmony_ci	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
707cc1dc7a3Sopenharmony_ci
	// NOTE(review): the "0, max_block_modes" arguments look like the block mode index range to
	// consider - confirm against the callee's declaration
708cc1dc7a3Sopenharmony_ci	unsigned int candidate_count = compute_ideal_endpoint_formats(
709cc1dc7a3Sopenharmony_ci	    privateProfile,
710cc1dc7a3Sopenharmony_ci	    pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
711cc1dc7a3Sopenharmony_ci	    config.tune_candidate_limit, 0, max_block_modes,
712cc1dc7a3Sopenharmony_ci	    partition_format_specifiers, block_mode_index,
713cc1dc7a3Sopenharmony_ci	    color_quant_level, color_quant_level_mod, tmpbuf);
714cc1dc7a3Sopenharmony_ci
715cc1dc7a3Sopenharmony_ci	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	// best_errorval_in_mode is the best error measured in this call (the return value), whereas
	// best_errorval_in_scb is the error of the block currently held in scb, seeded by the caller
716cc1dc7a3Sopenharmony_ci	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
717cc1dc7a3Sopenharmony_ci	float best_errorval_in_scb = scb.errorval;
718cc1dc7a3Sopenharmony_ci
719cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < candidate_count; i++)
720cc1dc7a3Sopenharmony_ci	{
721cc1dc7a3Sopenharmony_ci		TRACE_NODE(node0, "candidate");
722cc1dc7a3Sopenharmony_ci
723cc1dc7a3Sopenharmony_ci		const int bm_packed_index = block_mode_index[i];
724cc1dc7a3Sopenharmony_ci		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
725cc1dc7a3Sopenharmony_ci		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
726cc1dc7a3Sopenharmony_ci
727cc1dc7a3Sopenharmony_ci		int decimation_mode = qw_bm.decimation_mode;
728cc1dc7a3Sopenharmony_ci		const auto& di = bsd.get_decimation_info(decimation_mode);
729cc1dc7a3Sopenharmony_ci		promise(di.weight_count > 0);
730cc1dc7a3Sopenharmony_ci
731cc1dc7a3Sopenharmony_ci		trace_add_data("weight_x", di.weight_x);
732cc1dc7a3Sopenharmony_ci		trace_add_data("weight_y", di.weight_y);
733cc1dc7a3Sopenharmony_ci		trace_add_data("weight_z", di.weight_z);
734cc1dc7a3Sopenharmony_ci		trace_add_data("weight_quant", qw_bm.quant_mode);
735cc1dc7a3Sopenharmony_ci
736cc1dc7a3Sopenharmony_ci		// Recompute the ideal color endpoints before storing them
737cc1dc7a3Sopenharmony_ci		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
738cc1dc7a3Sopenharmony_ci		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
739cc1dc7a3Sopenharmony_ci
		// Working copies for this candidate; workscb is declared without an initializer and its
		// fields are assigned individually below, workep starts from the ideal endpoints
740cc1dc7a3Sopenharmony_ci		symbolic_compressed_block workscb;
741cc1dc7a3Sopenharmony_ci		endpoints workep = ei.ep;
742cc1dc7a3Sopenharmony_ci
		// Start from the quantized weights generated for this block mode above
743cc1dc7a3Sopenharmony_ci		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
744cc1dc7a3Sopenharmony_ci
745cc1dc7a3Sopenharmony_ci		for (unsigned int j = 0; j < di.weight_count; j++)
746cc1dc7a3Sopenharmony_ci		{
747cc1dc7a3Sopenharmony_ci			workscb.weights[j] = u8_weight_src[j];
748cc1dc7a3Sopenharmony_ci		}
749cc1dc7a3Sopenharmony_ci
		// Refinement loop: recompute endpoints for the current weights, pack, measure, realign
750cc1dc7a3Sopenharmony_ci		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
751cc1dc7a3Sopenharmony_ci		{
752cc1dc7a3Sopenharmony_ci			recompute_ideal_colors_1plane(
753cc1dc7a3Sopenharmony_ci			    blk, pi, di, workscb.weights,
754cc1dc7a3Sopenharmony_ci			    workep, rgbs_colors, rgbo_colors);
755cc1dc7a3Sopenharmony_ci
756cc1dc7a3Sopenharmony_ci			// Quantize the chosen color, tracking if worth trying the mod value
			// The mod value can only help if it differs from the base quant level, and only if
			// every partition ends up with the same color format as partition 0
757cc1dc7a3Sopenharmony_ci			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
758cc1dc7a3Sopenharmony_ci			for (unsigned int j = 0; j < partition_count; j++)
759cc1dc7a3Sopenharmony_ci			{
760cc1dc7a3Sopenharmony_ci				workscb.color_formats[j] = pack_color_endpoints(
761cc1dc7a3Sopenharmony_ci				    privateProfile,
762cc1dc7a3Sopenharmony_ci				    workep.endpt0[j],
763cc1dc7a3Sopenharmony_ci				    workep.endpt1[j],
764cc1dc7a3Sopenharmony_ci				    rgbs_colors[j],
765cc1dc7a3Sopenharmony_ci				    rgbo_colors[j],
766cc1dc7a3Sopenharmony_ci				    partition_format_specifiers[i][j],
767cc1dc7a3Sopenharmony_ci				    workscb.color_values[j],
768cc1dc7a3Sopenharmony_ci				    color_quant_level[i]);
769cc1dc7a3Sopenharmony_ci
770cc1dc7a3Sopenharmony_ci				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
771cc1dc7a3Sopenharmony_ci			}
772cc1dc7a3Sopenharmony_ci
773cc1dc7a3Sopenharmony_ci			// If all the color endpoint modes are the same, we get a few more bits to store colors;
774cc1dc7a3Sopenharmony_ci			// let's see if we can take advantage of this: requantize all the colors and see if the
775cc1dc7a3Sopenharmony_ci			// endpoint modes remain the same.
776cc1dc7a3Sopenharmony_ci			workscb.color_formats_matched = 0;
777cc1dc7a3Sopenharmony_ci			if (partition_count >= 2 && all_same)
778cc1dc7a3Sopenharmony_ci			{
				// Repack into temporaries so the base packing survives if the mod attempt fails
779cc1dc7a3Sopenharmony_ci				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
780cc1dc7a3Sopenharmony_ci				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
781cc1dc7a3Sopenharmony_ci				bool all_same_mod = true;
782cc1dc7a3Sopenharmony_ci				for (unsigned int j = 0; j < partition_count; j++)
783cc1dc7a3Sopenharmony_ci				{
784cc1dc7a3Sopenharmony_ci					color_formats_mod[j] = pack_color_endpoints(
785cc1dc7a3Sopenharmony_ci					    privateProfile,
786cc1dc7a3Sopenharmony_ci					    workep.endpt0[j],
787cc1dc7a3Sopenharmony_ci					    workep.endpt1[j],
788cc1dc7a3Sopenharmony_ci					    rgbs_colors[j],
789cc1dc7a3Sopenharmony_ci					    rgbo_colors[j],
790cc1dc7a3Sopenharmony_ci					    partition_format_specifiers[i][j],
791cc1dc7a3Sopenharmony_ci					    colorvals[j],
792cc1dc7a3Sopenharmony_ci					    color_quant_level_mod[i]);
793cc1dc7a3Sopenharmony_ci
794cc1dc7a3Sopenharmony_ci					// Early out as soon as it's no longer possible to use mod
795cc1dc7a3Sopenharmony_ci					if (color_formats_mod[j] != color_formats_mod[0])
796cc1dc7a3Sopenharmony_ci					{
797cc1dc7a3Sopenharmony_ci						all_same_mod = false;
798cc1dc7a3Sopenharmony_ci						break;
799cc1dc7a3Sopenharmony_ci					}
800cc1dc7a3Sopenharmony_ci				}
801cc1dc7a3Sopenharmony_ci
802cc1dc7a3Sopenharmony_ci				if (all_same_mod)
803cc1dc7a3Sopenharmony_ci				{
804cc1dc7a3Sopenharmony_ci					workscb.color_formats_matched = 1;
					// NOTE(review): this copies all BLOCK_MAX_PARTITIONS rows, but colorvals[] is only
					// written for j < partition_count above; rows beyond that carry indeterminate
					// values (color_formats_mod[] is zero-initialized) - confirm they are never read.
805cc1dc7a3Sopenharmony_ci					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
806cc1dc7a3Sopenharmony_ci					{
807cc1dc7a3Sopenharmony_ci						for (unsigned int k = 0; k < 8; k++)
808cc1dc7a3Sopenharmony_ci						{
809cc1dc7a3Sopenharmony_ci							workscb.color_values[j][k] = colorvals[j][k];
810cc1dc7a3Sopenharmony_ci						}
811cc1dc7a3Sopenharmony_ci
812cc1dc7a3Sopenharmony_ci						workscb.color_formats[j] = color_formats_mod[j];
813cc1dc7a3Sopenharmony_ci					}
814cc1dc7a3Sopenharmony_ci				}
815cc1dc7a3Sopenharmony_ci			}
816cc1dc7a3Sopenharmony_ci
817cc1dc7a3Sopenharmony_ci			// Store header fields
			// plane2_component of -1 is used here for this single-plane encoding; the quant mode
			// follows whichever color packing (base or mod) was kept above
818cc1dc7a3Sopenharmony_ci			workscb.partition_count = static_cast<uint8_t>(partition_count);
819cc1dc7a3Sopenharmony_ci			workscb.partition_index = static_cast<uint16_t>(partition_index);
820cc1dc7a3Sopenharmony_ci			workscb.plane2_component = -1;
821cc1dc7a3Sopenharmony_ci			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
822cc1dc7a3Sopenharmony_ci			workscb.block_mode = qw_bm.mode_index;
823cc1dc7a3Sopenharmony_ci			workscb.block_type = SYM_BTYPE_NONCONST;
			// High-speed profile: accept this packing immediately with a recorded error of 0, without
			// measuring error or realigning weights. The break only leaves the refinement loop, so
			// each remaining candidate also overwrites scb and best_errorval_in_mode is not updated.
			// NOTE(review): presumably candidate_count is expected to be 1 in this profile - confirm.
824cc1dc7a3Sopenharmony_ci			if (privateProfile == HIGH_SPEED_PROFILE)
825cc1dc7a3Sopenharmony_ci			{
826cc1dc7a3Sopenharmony_ci				workscb.errorval = 0;
827cc1dc7a3Sopenharmony_ci				scb = workscb;
828cc1dc7a3Sopenharmony_ci				break;
829cc1dc7a3Sopenharmony_ci			}
830cc1dc7a3Sopenharmony_ci			// Pre-realign test
			// Only run on the first refinement iteration of each candidate
831cc1dc7a3Sopenharmony_ci			if (l == 0)
832cc1dc7a3Sopenharmony_ci			{
833cc1dc7a3Sopenharmony_ci				float errorval = compute_difference(config, bsd, workscb, blk);
				// A result of -ERROR_CALC_DEFAULT is treated as a failure marker: flip it positive
				// so it compares as a very large error, and tag the working block as an error block
834cc1dc7a3Sopenharmony_ci				if (errorval == -ERROR_CALC_DEFAULT)
835cc1dc7a3Sopenharmony_ci				{
836cc1dc7a3Sopenharmony_ci					errorval = -errorval;
837cc1dc7a3Sopenharmony_ci					workscb.block_type = SYM_BTYPE_ERROR;
838cc1dc7a3Sopenharmony_ci				}
839cc1dc7a3Sopenharmony_ci
840cc1dc7a3Sopenharmony_ci				trace_add_data("error_prerealign", errorval);
841cc1dc7a3Sopenharmony_ci				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
842cc1dc7a3Sopenharmony_ci
843cc1dc7a3Sopenharmony_ci				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
844cc1dc7a3Sopenharmony_ci				// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
845cc1dc7a3Sopenharmony_ci				// drive a heuristic to skip blocks that are unlikely to catch up with the best
846cc1dc7a3Sopenharmony_ci				// block we have already.
847cc1dc7a3Sopenharmony_ci				unsigned int iters_remaining = config.tune_refinement_limit - l;
848cc1dc7a3Sopenharmony_ci				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
849cc1dc7a3Sopenharmony_ci				if (errorval > (threshold * best_errorval_in_scb))
850cc1dc7a3Sopenharmony_ci				{
851cc1dc7a3Sopenharmony_ci					break;
852cc1dc7a3Sopenharmony_ci				}
853cc1dc7a3Sopenharmony_ci
854cc1dc7a3Sopenharmony_ci				if (errorval < best_errorval_in_scb)
855cc1dc7a3Sopenharmony_ci				{
856cc1dc7a3Sopenharmony_ci					best_errorval_in_scb = errorval;
857cc1dc7a3Sopenharmony_ci					workscb.errorval = errorval;
858cc1dc7a3Sopenharmony_ci					scb = workscb;
859cc1dc7a3Sopenharmony_ci
860cc1dc7a3Sopenharmony_ci					if (errorval < tune_errorval_threshold)
861cc1dc7a3Sopenharmony_ci					{
862cc1dc7a3Sopenharmony_ci						// Skip remaining candidates - this is "good enough"
						// Forcing i to candidate_count ends the outer loop once we break out
863cc1dc7a3Sopenharmony_ci						i = candidate_count;
864cc1dc7a3Sopenharmony_ci						break;
865cc1dc7a3Sopenharmony_ci					}
866cc1dc7a3Sopenharmony_ci				}
867cc1dc7a3Sopenharmony_ci			}
868cc1dc7a3Sopenharmony_ci
			// Use the decimated realign path unless this decimation has one weight per texel
869cc1dc7a3Sopenharmony_ci			bool adjustments;
870cc1dc7a3Sopenharmony_ci			if (di.weight_count != bsd.texel_count)
871cc1dc7a3Sopenharmony_ci			{
872cc1dc7a3Sopenharmony_ci				adjustments = realign_weights_decimated(
873cc1dc7a3Sopenharmony_ci					config.profile, bsd, blk, workscb);
874cc1dc7a3Sopenharmony_ci			}
875cc1dc7a3Sopenharmony_ci			else
876cc1dc7a3Sopenharmony_ci			{
877cc1dc7a3Sopenharmony_ci				adjustments = realign_weights_undecimated(
878cc1dc7a3Sopenharmony_ci					config.profile, bsd, blk, workscb);
879cc1dc7a3Sopenharmony_ci			}
880cc1dc7a3Sopenharmony_ci
881cc1dc7a3Sopenharmony_ci			// Post-realign test
882cc1dc7a3Sopenharmony_ci			float errorval = compute_difference(config, bsd, workscb, blk);
			// Same failure-marker handling as the pre-realign test
883cc1dc7a3Sopenharmony_ci			if (errorval == -ERROR_CALC_DEFAULT)
884cc1dc7a3Sopenharmony_ci			{
885cc1dc7a3Sopenharmony_ci				errorval = -errorval;
886cc1dc7a3Sopenharmony_ci				workscb.block_type = SYM_BTYPE_ERROR;
887cc1dc7a3Sopenharmony_ci			}
888cc1dc7a3Sopenharmony_ci
889cc1dc7a3Sopenharmony_ci			trace_add_data("error_postrealign", errorval);
890cc1dc7a3Sopenharmony_ci			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
891cc1dc7a3Sopenharmony_ci
892cc1dc7a3Sopenharmony_ci			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
893cc1dc7a3Sopenharmony_ci			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
894cc1dc7a3Sopenharmony_ci			// give benefit of the doubt ...
895cc1dc7a3Sopenharmony_ci			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
896cc1dc7a3Sopenharmony_ci			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
897cc1dc7a3Sopenharmony_ci			if (errorval > (threshold * best_errorval_in_scb))
898cc1dc7a3Sopenharmony_ci			{
899cc1dc7a3Sopenharmony_ci				break;
900cc1dc7a3Sopenharmony_ci			}
901cc1dc7a3Sopenharmony_ci
902cc1dc7a3Sopenharmony_ci			if (errorval < best_errorval_in_scb)
903cc1dc7a3Sopenharmony_ci			{
904cc1dc7a3Sopenharmony_ci				best_errorval_in_scb = errorval;
905cc1dc7a3Sopenharmony_ci				workscb.errorval = errorval;
906cc1dc7a3Sopenharmony_ci				scb = workscb;
907cc1dc7a3Sopenharmony_ci
908cc1dc7a3Sopenharmony_ci				if (errorval < tune_errorval_threshold)
909cc1dc7a3Sopenharmony_ci				{
910cc1dc7a3Sopenharmony_ci					// Skip remaining candidates - this is "good enough"
					// Forcing i to candidate_count ends the outer loop once we break out
911cc1dc7a3Sopenharmony_ci					i = candidate_count;
912cc1dc7a3Sopenharmony_ci					break;
913cc1dc7a3Sopenharmony_ci				}
914cc1dc7a3Sopenharmony_ci			}
915cc1dc7a3Sopenharmony_ci
			// Stop refining this candidate once a realign pass reports no weight changes
916cc1dc7a3Sopenharmony_ci			if (!adjustments)
917cc1dc7a3Sopenharmony_ci			{
918cc1dc7a3Sopenharmony_ci				break;
919cc1dc7a3Sopenharmony_ci			}
920cc1dc7a3Sopenharmony_ci		}
921cc1dc7a3Sopenharmony_ci	}
922cc1dc7a3Sopenharmony_ci
923cc1dc7a3Sopenharmony_ci	return best_errorval_in_mode;
924cc1dc7a3Sopenharmony_ci}
925cc1dc7a3Sopenharmony_ci
926cc1dc7a3Sopenharmony_ci/**
927cc1dc7a3Sopenharmony_ci * @brief Compress a block using a chosen partitioning and 2 planes of weights.
928cc1dc7a3Sopenharmony_ci *
929cc1dc7a3Sopenharmony_ci * @param      config                    The compressor configuration.
930cc1dc7a3Sopenharmony_ci * @param      bsd                       The block size information.
931cc1dc7a3Sopenharmony_ci * @param      blk                       The image block color data to compress.
932cc1dc7a3Sopenharmony_ci * @param      tune_errorval_threshold   The error value threshold.
933cc1dc7a3Sopenharmony_ci * @param      plane2_component          The component index for the second plane of weights.
934cc1dc7a3Sopenharmony_ci * @param[out] scb                       The symbolic compressed block output.
935cc1dc7a3Sopenharmony_ci * @param[out] tmpbuf                    The quantized weights for plane 1.
936cc1dc7a3Sopenharmony_ci */
937cc1dc7a3Sopenharmony_cistatic float compress_symbolic_block_for_partition_2planes(
938cc1dc7a3Sopenharmony_ci	QualityProfile privateProfile,
939cc1dc7a3Sopenharmony_ci	const astcenc_config& config,
940cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
941cc1dc7a3Sopenharmony_ci	const image_block& blk,
942cc1dc7a3Sopenharmony_ci	float tune_errorval_threshold,
943cc1dc7a3Sopenharmony_ci	unsigned int plane2_component,
944cc1dc7a3Sopenharmony_ci	symbolic_compressed_block& scb,
945cc1dc7a3Sopenharmony_ci	compression_working_buffers& tmpbuf,
946cc1dc7a3Sopenharmony_ci	int quant_limit
947cc1dc7a3Sopenharmony_ci) {
948cc1dc7a3Sopenharmony_ci	promise(config.tune_candidate_limit > 0);
949cc1dc7a3Sopenharmony_ci	promise(config.tune_refinement_limit > 0);
950cc1dc7a3Sopenharmony_ci	promise(bsd.decimation_mode_count_selected > 0);
951cc1dc7a3Sopenharmony_ci
952cc1dc7a3Sopenharmony_ci	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
953cc1dc7a3Sopenharmony_ci
954cc1dc7a3Sopenharmony_ci	// Compute ideal weights and endpoint colors, with no quantization or decimation
955cc1dc7a3Sopenharmony_ci	endpoints_and_weights& ei1 = tmpbuf.ei1;
956cc1dc7a3Sopenharmony_ci	endpoints_and_weights& ei2 = tmpbuf.ei2;
957cc1dc7a3Sopenharmony_ci
958cc1dc7a3Sopenharmony_ci	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
959cc1dc7a3Sopenharmony_ci
960cc1dc7a3Sopenharmony_ci	// Compute ideal weights and endpoint colors for every decimation
961cc1dc7a3Sopenharmony_ci	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
962cc1dc7a3Sopenharmony_ci	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
963cc1dc7a3Sopenharmony_ci
964cc1dc7a3Sopenharmony_ci	// For each decimation mode, compute an ideal set of weights with no quantization
965cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
966cc1dc7a3Sopenharmony_ci	{
967cc1dc7a3Sopenharmony_ci		const auto& dm = bsd.get_decimation_mode(i);
968cc1dc7a3Sopenharmony_ci		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
969cc1dc7a3Sopenharmony_ci		{
970cc1dc7a3Sopenharmony_ci			continue;
971cc1dc7a3Sopenharmony_ci		}
972cc1dc7a3Sopenharmony_ci
973cc1dc7a3Sopenharmony_ci		const auto& di = bsd.get_decimation_info(i);
974cc1dc7a3Sopenharmony_ci
975cc1dc7a3Sopenharmony_ci		compute_ideal_weights_for_decimation(
976cc1dc7a3Sopenharmony_ci		    ei1,
977cc1dc7a3Sopenharmony_ci		    di,
978cc1dc7a3Sopenharmony_ci		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
979cc1dc7a3Sopenharmony_ci
980cc1dc7a3Sopenharmony_ci		compute_ideal_weights_for_decimation(
981cc1dc7a3Sopenharmony_ci		    ei2,
982cc1dc7a3Sopenharmony_ci		    di,
983cc1dc7a3Sopenharmony_ci		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
984cc1dc7a3Sopenharmony_ci	}
985cc1dc7a3Sopenharmony_ci
986cc1dc7a3Sopenharmony_ci	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
987cc1dc7a3Sopenharmony_ci	// weight pair, compute the smallest weight that will result in a color value greater than 1
988cc1dc7a3Sopenharmony_ci	vfloat4 min_ep1(10.0f);
989cc1dc7a3Sopenharmony_ci	vfloat4 min_ep2(10.0f);
990cc1dc7a3Sopenharmony_ci
991cc1dc7a3Sopenharmony_ci	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
992cc1dc7a3Sopenharmony_ci	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
993cc1dc7a3Sopenharmony_ci	min_ep1 = select(min_ep1, ep1, use_ep1);
994cc1dc7a3Sopenharmony_ci
995cc1dc7a3Sopenharmony_ci	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
996cc1dc7a3Sopenharmony_ci	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
997cc1dc7a3Sopenharmony_ci	min_ep2 = select(min_ep2, ep2, use_ep2);
998cc1dc7a3Sopenharmony_ci
999cc1dc7a3Sopenharmony_ci	vfloat4 err_max(ERROR_CALC_DEFAULT);
1000cc1dc7a3Sopenharmony_ci	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
1001cc1dc7a3Sopenharmony_ci
1002cc1dc7a3Sopenharmony_ci	// Set the plane2 component to max error in ep1
1003cc1dc7a3Sopenharmony_ci	min_ep1 = select(min_ep1, err_max, err_mask);
1004cc1dc7a3Sopenharmony_ci
1005cc1dc7a3Sopenharmony_ci	float min_wt_cutoff1 = hmin_s(min_ep1);
1006cc1dc7a3Sopenharmony_ci
1007cc1dc7a3Sopenharmony_ci	// Set the minwt2 to the plane2 component min in ep2
1008cc1dc7a3Sopenharmony_ci	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
1009cc1dc7a3Sopenharmony_ci
1010cc1dc7a3Sopenharmony_ci	compute_angular_endpoints_2planes(
1011cc1dc7a3Sopenharmony_ci	    privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
1012cc1dc7a3Sopenharmony_ci
1013cc1dc7a3Sopenharmony_ci	// For each mode (which specifies a decimation and a quantization):
1014cc1dc7a3Sopenharmony_ci	//     * Compute number of bits needed for the quantized weights
1015cc1dc7a3Sopenharmony_ci	//     * Generate an optimized set of quantized weights
1016cc1dc7a3Sopenharmony_ci	//     * Compute quantization errors for the mode
1017cc1dc7a3Sopenharmony_ci
1018cc1dc7a3Sopenharmony_ci	float* weight_low_value1 = tmpbuf.weight_low_value1;
1019cc1dc7a3Sopenharmony_ci	float* weight_high_value1 = tmpbuf.weight_high_value1;
1020cc1dc7a3Sopenharmony_ci	float* weight_low_value2 = tmpbuf.weight_low_value2;
1021cc1dc7a3Sopenharmony_ci	float* weight_high_value2 = tmpbuf.weight_high_value2;
1022cc1dc7a3Sopenharmony_ci
1023cc1dc7a3Sopenharmony_ci	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
1024cc1dc7a3Sopenharmony_ci	float* qwt_errors = tmpbuf.qwt_errors;
1025cc1dc7a3Sopenharmony_ci
1026cc1dc7a3Sopenharmony_ci	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
1027cc1dc7a3Sopenharmony_ci	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
1028cc1dc7a3Sopenharmony_ci
1029cc1dc7a3Sopenharmony_ci	for (unsigned int i = start_2plane; i < end_2plane; i++)
1030cc1dc7a3Sopenharmony_ci	{
1031cc1dc7a3Sopenharmony_ci		const block_mode& bm = bsd.block_modes[i];
1032cc1dc7a3Sopenharmony_ci		assert(bm.is_dual_plane);
1033cc1dc7a3Sopenharmony_ci
1034cc1dc7a3Sopenharmony_ci		if (bm.quant_mode > max_weight_quant)
1035cc1dc7a3Sopenharmony_ci		{
1036cc1dc7a3Sopenharmony_ci			qwt_errors[i] = 1e38f;
1037cc1dc7a3Sopenharmony_ci			continue;
1038cc1dc7a3Sopenharmony_ci		}
1039cc1dc7a3Sopenharmony_ci
1040cc1dc7a3Sopenharmony_ci		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
1041cc1dc7a3Sopenharmony_ci
1042cc1dc7a3Sopenharmony_ci		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
1043cc1dc7a3Sopenharmony_ci		{
1044cc1dc7a3Sopenharmony_ci			weight_high_value1[i] = 1.0f;
1045cc1dc7a3Sopenharmony_ci		}
1046cc1dc7a3Sopenharmony_ci
1047cc1dc7a3Sopenharmony_ci		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
1048cc1dc7a3Sopenharmony_ci		{
1049cc1dc7a3Sopenharmony_ci			weight_high_value2[i] = 1.0f;
1050cc1dc7a3Sopenharmony_ci		}
1051cc1dc7a3Sopenharmony_ci
1052cc1dc7a3Sopenharmony_ci		unsigned int decimation_mode = bm.decimation_mode;
1053cc1dc7a3Sopenharmony_ci		const auto& di = bsd.get_decimation_info(decimation_mode);
1054cc1dc7a3Sopenharmony_ci
1055cc1dc7a3Sopenharmony_ci		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
1056cc1dc7a3Sopenharmony_ci
1057cc1dc7a3Sopenharmony_ci		// Generate the optimized set of weights for the mode
1058cc1dc7a3Sopenharmony_ci		compute_quantized_weights_for_decimation(
1059cc1dc7a3Sopenharmony_ci		    di,
1060cc1dc7a3Sopenharmony_ci		    weight_low_value1[i],
1061cc1dc7a3Sopenharmony_ci		    weight_high_value1[i],
1062cc1dc7a3Sopenharmony_ci		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
1063cc1dc7a3Sopenharmony_ci		    dec_weights_uquantf,
1064cc1dc7a3Sopenharmony_ci		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
1065cc1dc7a3Sopenharmony_ci		    bm.get_weight_quant_mode());
1066cc1dc7a3Sopenharmony_ci
1067cc1dc7a3Sopenharmony_ci		compute_quantized_weights_for_decimation(
1068cc1dc7a3Sopenharmony_ci		    di,
1069cc1dc7a3Sopenharmony_ci		    weight_low_value2[i],
1070cc1dc7a3Sopenharmony_ci		    weight_high_value2[i],
1071cc1dc7a3Sopenharmony_ci		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
1072cc1dc7a3Sopenharmony_ci		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
1073cc1dc7a3Sopenharmony_ci		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
1074cc1dc7a3Sopenharmony_ci		    bm.get_weight_quant_mode());
1075cc1dc7a3Sopenharmony_ci
1076cc1dc7a3Sopenharmony_ci		// Compute weight quantization errors for the block mode
1077cc1dc7a3Sopenharmony_ci		qwt_errors[i] = compute_error_of_weight_set_2planes(
1078cc1dc7a3Sopenharmony_ci		    ei1,
1079cc1dc7a3Sopenharmony_ci		    ei2,
1080cc1dc7a3Sopenharmony_ci		    di,
1081cc1dc7a3Sopenharmony_ci		    dec_weights_uquantf,
1082cc1dc7a3Sopenharmony_ci		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
1083cc1dc7a3Sopenharmony_ci	}
1084cc1dc7a3Sopenharmony_ci
1085cc1dc7a3Sopenharmony_ci	// Decide the optimal combination of color endpoint encodings and weight encodings
1086cc1dc7a3Sopenharmony_ci	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
1087cc1dc7a3Sopenharmony_ci	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
1088cc1dc7a3Sopenharmony_ci
1089cc1dc7a3Sopenharmony_ci	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
1090cc1dc7a3Sopenharmony_ci	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
1091cc1dc7a3Sopenharmony_ci
1092cc1dc7a3Sopenharmony_ci	endpoints epm;
1093cc1dc7a3Sopenharmony_ci	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
1094cc1dc7a3Sopenharmony_ci
1095cc1dc7a3Sopenharmony_ci	const auto& pi = bsd.get_partition_info(1, 0);
1096cc1dc7a3Sopenharmony_ci	unsigned int candidate_count = compute_ideal_endpoint_formats(
1097cc1dc7a3Sopenharmony_ci	    config.privateProfile,
1098cc1dc7a3Sopenharmony_ci	    pi, blk, epm, qwt_bitcounts, qwt_errors,
1099cc1dc7a3Sopenharmony_ci	    config.tune_candidate_limit,
1100cc1dc7a3Sopenharmony_ci		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
1101cc1dc7a3Sopenharmony_ci	    partition_format_specifiers, block_mode_index,
1102cc1dc7a3Sopenharmony_ci	    color_quant_level, color_quant_level_mod, tmpbuf);
1103cc1dc7a3Sopenharmony_ci
1104cc1dc7a3Sopenharmony_ci	// Iterate over the N believed-to-be-best modes to find out which one is actually best
1105cc1dc7a3Sopenharmony_ci	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
1106cc1dc7a3Sopenharmony_ci	float best_errorval_in_scb = scb.errorval;
1107cc1dc7a3Sopenharmony_ci
1108cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < candidate_count; i++)
1109cc1dc7a3Sopenharmony_ci	{
1110cc1dc7a3Sopenharmony_ci		TRACE_NODE(node0, "candidate");
1111cc1dc7a3Sopenharmony_ci
1112cc1dc7a3Sopenharmony_ci		const int bm_packed_index = block_mode_index[i];
1113cc1dc7a3Sopenharmony_ci		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
1114cc1dc7a3Sopenharmony_ci		       bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
1115cc1dc7a3Sopenharmony_ci		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
1116cc1dc7a3Sopenharmony_ci
1117cc1dc7a3Sopenharmony_ci		int decimation_mode = qw_bm.decimation_mode;
1118cc1dc7a3Sopenharmony_ci		const auto& di = bsd.get_decimation_info(decimation_mode);
1119cc1dc7a3Sopenharmony_ci		promise(di.weight_count > 0);
1120cc1dc7a3Sopenharmony_ci
1121cc1dc7a3Sopenharmony_ci		trace_add_data("weight_x", di.weight_x);
1122cc1dc7a3Sopenharmony_ci		trace_add_data("weight_y", di.weight_y);
1123cc1dc7a3Sopenharmony_ci		trace_add_data("weight_z", di.weight_z);
1124cc1dc7a3Sopenharmony_ci		trace_add_data("weight_quant", qw_bm.quant_mode);
1125cc1dc7a3Sopenharmony_ci
1126cc1dc7a3Sopenharmony_ci		vfloat4 rgbs_color;
1127cc1dc7a3Sopenharmony_ci		vfloat4 rgbo_color;
1128cc1dc7a3Sopenharmony_ci
1129cc1dc7a3Sopenharmony_ci		symbolic_compressed_block workscb;
1130cc1dc7a3Sopenharmony_ci		endpoints workep = epm;
1131cc1dc7a3Sopenharmony_ci
1132cc1dc7a3Sopenharmony_ci		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
1133cc1dc7a3Sopenharmony_ci		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
1134cc1dc7a3Sopenharmony_ci
1135cc1dc7a3Sopenharmony_ci		for (int j = 0; j < di.weight_count; j++)
1136cc1dc7a3Sopenharmony_ci		{
1137cc1dc7a3Sopenharmony_ci			workscb.weights[j] = u8_weight1_src[j];
1138cc1dc7a3Sopenharmony_ci			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
1139cc1dc7a3Sopenharmony_ci		}
1140cc1dc7a3Sopenharmony_ci
1141cc1dc7a3Sopenharmony_ci		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
1142cc1dc7a3Sopenharmony_ci		{
1143cc1dc7a3Sopenharmony_ci			recompute_ideal_colors_2planes(
1144cc1dc7a3Sopenharmony_ci			    blk, bsd, di,
1145cc1dc7a3Sopenharmony_ci			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
1146cc1dc7a3Sopenharmony_ci			    workep, rgbs_color, rgbo_color, plane2_component);
1147cc1dc7a3Sopenharmony_ci
1148cc1dc7a3Sopenharmony_ci			// Quantize the chosen color
1149cc1dc7a3Sopenharmony_ci			workscb.color_formats[0] = pack_color_endpoints(
1150cc1dc7a3Sopenharmony_ci			                               privateProfile,
1151cc1dc7a3Sopenharmony_ci			                               workep.endpt0[0],
1152cc1dc7a3Sopenharmony_ci			                               workep.endpt1[0],
1153cc1dc7a3Sopenharmony_ci			                               rgbs_color, rgbo_color,
1154cc1dc7a3Sopenharmony_ci			                               partition_format_specifiers[i][0],
1155cc1dc7a3Sopenharmony_ci			                               workscb.color_values[0],
1156cc1dc7a3Sopenharmony_ci			                               color_quant_level[i]);
1157cc1dc7a3Sopenharmony_ci
1158cc1dc7a3Sopenharmony_ci			// Store header fields
1159cc1dc7a3Sopenharmony_ci			workscb.partition_count = 1;
1160cc1dc7a3Sopenharmony_ci			workscb.partition_index = 0;
1161cc1dc7a3Sopenharmony_ci			workscb.quant_mode = color_quant_level[i];
1162cc1dc7a3Sopenharmony_ci			workscb.color_formats_matched = 0;
1163cc1dc7a3Sopenharmony_ci			workscb.block_mode = qw_bm.mode_index;
1164cc1dc7a3Sopenharmony_ci			workscb.plane2_component = static_cast<int8_t>(plane2_component);
1165cc1dc7a3Sopenharmony_ci			workscb.block_type = SYM_BTYPE_NONCONST;
1166cc1dc7a3Sopenharmony_ci
1167cc1dc7a3Sopenharmony_ci			// Pre-realign test
1168cc1dc7a3Sopenharmony_ci			if (l == 0)
1169cc1dc7a3Sopenharmony_ci			{
1170cc1dc7a3Sopenharmony_ci				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1171cc1dc7a3Sopenharmony_ci				if (errorval == -ERROR_CALC_DEFAULT)
1172cc1dc7a3Sopenharmony_ci				{
1173cc1dc7a3Sopenharmony_ci					errorval = -errorval;
1174cc1dc7a3Sopenharmony_ci					workscb.block_type = SYM_BTYPE_ERROR;
1175cc1dc7a3Sopenharmony_ci				}
1176cc1dc7a3Sopenharmony_ci
1177cc1dc7a3Sopenharmony_ci				trace_add_data("error_prerealign", errorval);
1178cc1dc7a3Sopenharmony_ci				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1179cc1dc7a3Sopenharmony_ci
1180cc1dc7a3Sopenharmony_ci				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
1182cc1dc7a3Sopenharmony_ci				// drive a heuristic to skip blocks that are unlikely to catch up with the best
1183cc1dc7a3Sopenharmony_ci				// block we have already.
1184cc1dc7a3Sopenharmony_ci				unsigned int iters_remaining = config.tune_refinement_limit - l;
1185cc1dc7a3Sopenharmony_ci				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
1186cc1dc7a3Sopenharmony_ci				if (errorval > (threshold * best_errorval_in_scb))
1187cc1dc7a3Sopenharmony_ci				{
1188cc1dc7a3Sopenharmony_ci					break;
1189cc1dc7a3Sopenharmony_ci				}
1190cc1dc7a3Sopenharmony_ci
1191cc1dc7a3Sopenharmony_ci				if (errorval < best_errorval_in_scb)
1192cc1dc7a3Sopenharmony_ci				{
1193cc1dc7a3Sopenharmony_ci					best_errorval_in_scb = errorval;
1194cc1dc7a3Sopenharmony_ci					workscb.errorval = errorval;
1195cc1dc7a3Sopenharmony_ci					scb = workscb;
1196cc1dc7a3Sopenharmony_ci
1197cc1dc7a3Sopenharmony_ci					if (errorval < tune_errorval_threshold)
1198cc1dc7a3Sopenharmony_ci					{
1199cc1dc7a3Sopenharmony_ci						// Skip remaining candidates - this is "good enough"
1200cc1dc7a3Sopenharmony_ci						i = candidate_count;
1201cc1dc7a3Sopenharmony_ci						break;
1202cc1dc7a3Sopenharmony_ci					}
1203cc1dc7a3Sopenharmony_ci				}
1204cc1dc7a3Sopenharmony_ci			}
1205cc1dc7a3Sopenharmony_ci
1206cc1dc7a3Sopenharmony_ci			// Perform a final pass over the weights to try to improve them.
1207cc1dc7a3Sopenharmony_ci			bool adjustments;
1208cc1dc7a3Sopenharmony_ci			if (di.weight_count != bsd.texel_count)
1209cc1dc7a3Sopenharmony_ci			{
1210cc1dc7a3Sopenharmony_ci				adjustments = realign_weights_decimated(
1211cc1dc7a3Sopenharmony_ci					config.profile, bsd, blk, workscb);
1212cc1dc7a3Sopenharmony_ci			}
1213cc1dc7a3Sopenharmony_ci			else
1214cc1dc7a3Sopenharmony_ci			{
1215cc1dc7a3Sopenharmony_ci				adjustments = realign_weights_undecimated(
1216cc1dc7a3Sopenharmony_ci					config.profile, bsd, blk, workscb);
1217cc1dc7a3Sopenharmony_ci			}
1218cc1dc7a3Sopenharmony_ci
1219cc1dc7a3Sopenharmony_ci			// Post-realign test
1220cc1dc7a3Sopenharmony_ci			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1221cc1dc7a3Sopenharmony_ci			if (errorval == -ERROR_CALC_DEFAULT)
1222cc1dc7a3Sopenharmony_ci			{
1223cc1dc7a3Sopenharmony_ci				errorval = -errorval;
1224cc1dc7a3Sopenharmony_ci				workscb.block_type = SYM_BTYPE_ERROR;
1225cc1dc7a3Sopenharmony_ci			}
1226cc1dc7a3Sopenharmony_ci
1227cc1dc7a3Sopenharmony_ci			trace_add_data("error_postrealign", errorval);
1228cc1dc7a3Sopenharmony_ci			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1229cc1dc7a3Sopenharmony_ci
1230cc1dc7a3Sopenharmony_ci			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
1231cc1dc7a3Sopenharmony_ci			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1232cc1dc7a3Sopenharmony_ci			// give benefit of the doubt ...
1233cc1dc7a3Sopenharmony_ci			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1234cc1dc7a3Sopenharmony_ci			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1235cc1dc7a3Sopenharmony_ci			if (errorval > (threshold * best_errorval_in_scb))
1236cc1dc7a3Sopenharmony_ci			{
1237cc1dc7a3Sopenharmony_ci				break;
1238cc1dc7a3Sopenharmony_ci			}
1239cc1dc7a3Sopenharmony_ci
1240cc1dc7a3Sopenharmony_ci			if (errorval < best_errorval_in_scb)
1241cc1dc7a3Sopenharmony_ci			{
1242cc1dc7a3Sopenharmony_ci				best_errorval_in_scb = errorval;
1243cc1dc7a3Sopenharmony_ci				workscb.errorval = errorval;
1244cc1dc7a3Sopenharmony_ci				scb = workscb;
1245cc1dc7a3Sopenharmony_ci
1246cc1dc7a3Sopenharmony_ci				if (errorval < tune_errorval_threshold)
1247cc1dc7a3Sopenharmony_ci				{
1248cc1dc7a3Sopenharmony_ci					// Skip remaining candidates - this is "good enough"
1249cc1dc7a3Sopenharmony_ci					i = candidate_count;
1250cc1dc7a3Sopenharmony_ci					break;
1251cc1dc7a3Sopenharmony_ci				}
1252cc1dc7a3Sopenharmony_ci			}
1253cc1dc7a3Sopenharmony_ci
1254cc1dc7a3Sopenharmony_ci			if (!adjustments)
1255cc1dc7a3Sopenharmony_ci			{
1256cc1dc7a3Sopenharmony_ci				break;
1257cc1dc7a3Sopenharmony_ci			}
1258cc1dc7a3Sopenharmony_ci		}
1259cc1dc7a3Sopenharmony_ci	}
1260cc1dc7a3Sopenharmony_ci
1261cc1dc7a3Sopenharmony_ci	return best_errorval_in_mode;
1262cc1dc7a3Sopenharmony_ci}
1263cc1dc7a3Sopenharmony_ci
1264cc1dc7a3Sopenharmony_ci/**
1265cc1dc7a3Sopenharmony_ci * @brief Determine the lowest cross-channel correlation factor.
1266cc1dc7a3Sopenharmony_ci *
1267cc1dc7a3Sopenharmony_ci * @param texels_per_block   The number of texels in a block.
1268cc1dc7a3Sopenharmony_ci * @param blk                The image block color data to compress.
1269cc1dc7a3Sopenharmony_ci *
1270cc1dc7a3Sopenharmony_ci * @return Return the lowest correlation factor.
1271cc1dc7a3Sopenharmony_ci */
1272cc1dc7a3Sopenharmony_cistatic float prepare_block_statistics(
1273cc1dc7a3Sopenharmony_ci	int texels_per_block,
1274cc1dc7a3Sopenharmony_ci	const image_block& blk
1275cc1dc7a3Sopenharmony_ci) {
1276cc1dc7a3Sopenharmony_ci	// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1277cc1dc7a3Sopenharmony_ci	// of the matrix. The matrix is symmetric, so this is all we need for this use case.
1278cc1dc7a3Sopenharmony_ci	float rs = 0.0f;
1279cc1dc7a3Sopenharmony_ci	float gs = 0.0f;
1280cc1dc7a3Sopenharmony_ci	float bs = 0.0f;
1281cc1dc7a3Sopenharmony_ci	float as = 0.0f;
1282cc1dc7a3Sopenharmony_ci	float rr_var = 0.0f;
1283cc1dc7a3Sopenharmony_ci	float gg_var = 0.0f;
1284cc1dc7a3Sopenharmony_ci	float bb_var = 0.0f;
1285cc1dc7a3Sopenharmony_ci	float aa_var = 0.0f;
1286cc1dc7a3Sopenharmony_ci	float rg_cov = 0.0f;
1287cc1dc7a3Sopenharmony_ci	float rb_cov = 0.0f;
1288cc1dc7a3Sopenharmony_ci	float ra_cov = 0.0f;
1289cc1dc7a3Sopenharmony_ci	float gb_cov = 0.0f;
1290cc1dc7a3Sopenharmony_ci	float ga_cov = 0.0f;
1291cc1dc7a3Sopenharmony_ci	float ba_cov = 0.0f;
1292cc1dc7a3Sopenharmony_ci
1293cc1dc7a3Sopenharmony_ci	float weight_sum = 0.0f;
1294cc1dc7a3Sopenharmony_ci
1295cc1dc7a3Sopenharmony_ci	promise(texels_per_block > 0);
1296cc1dc7a3Sopenharmony_ci	for (int i = 0; i < texels_per_block; i++)
1297cc1dc7a3Sopenharmony_ci	{
1298cc1dc7a3Sopenharmony_ci		float weight = hadd_s(blk.channel_weight) / 4.0f;
1299cc1dc7a3Sopenharmony_ci		assert(weight >= 0.0f);
1300cc1dc7a3Sopenharmony_ci		weight_sum += weight;
1301cc1dc7a3Sopenharmony_ci
1302cc1dc7a3Sopenharmony_ci		float r = blk.data_r[i];
1303cc1dc7a3Sopenharmony_ci		float g = blk.data_g[i];
1304cc1dc7a3Sopenharmony_ci		float b = blk.data_b[i];
1305cc1dc7a3Sopenharmony_ci		float a = blk.data_a[i];
1306cc1dc7a3Sopenharmony_ci
1307cc1dc7a3Sopenharmony_ci		float rw = r * weight;
1308cc1dc7a3Sopenharmony_ci		rs += rw;
1309cc1dc7a3Sopenharmony_ci		rr_var += r * rw;
1310cc1dc7a3Sopenharmony_ci		rg_cov += g * rw;
1311cc1dc7a3Sopenharmony_ci		rb_cov += b * rw;
1312cc1dc7a3Sopenharmony_ci		ra_cov += a * rw;
1313cc1dc7a3Sopenharmony_ci
1314cc1dc7a3Sopenharmony_ci		float gw = g * weight;
1315cc1dc7a3Sopenharmony_ci		gs += gw;
1316cc1dc7a3Sopenharmony_ci		gg_var += g * gw;
1317cc1dc7a3Sopenharmony_ci		gb_cov += b * gw;
1318cc1dc7a3Sopenharmony_ci		ga_cov += a * gw;
1319cc1dc7a3Sopenharmony_ci
1320cc1dc7a3Sopenharmony_ci		float bw = b * weight;
1321cc1dc7a3Sopenharmony_ci		bs += bw;
1322cc1dc7a3Sopenharmony_ci		bb_var += b * bw;
1323cc1dc7a3Sopenharmony_ci		ba_cov += a * bw;
1324cc1dc7a3Sopenharmony_ci
1325cc1dc7a3Sopenharmony_ci		float aw = a * weight;
1326cc1dc7a3Sopenharmony_ci		as += aw;
1327cc1dc7a3Sopenharmony_ci		aa_var += a * aw;
1328cc1dc7a3Sopenharmony_ci	}
1329cc1dc7a3Sopenharmony_ci
1330cc1dc7a3Sopenharmony_ci	float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1331cc1dc7a3Sopenharmony_ci
1332cc1dc7a3Sopenharmony_ci	rr_var -= rs * (rs * rpt);
1333cc1dc7a3Sopenharmony_ci	rg_cov -= gs * (rs * rpt);
1334cc1dc7a3Sopenharmony_ci	rb_cov -= bs * (rs * rpt);
1335cc1dc7a3Sopenharmony_ci	ra_cov -= as * (rs * rpt);
1336cc1dc7a3Sopenharmony_ci
1337cc1dc7a3Sopenharmony_ci	gg_var -= gs * (gs * rpt);
1338cc1dc7a3Sopenharmony_ci	gb_cov -= bs * (gs * rpt);
1339cc1dc7a3Sopenharmony_ci	ga_cov -= as * (gs * rpt);
1340cc1dc7a3Sopenharmony_ci
1341cc1dc7a3Sopenharmony_ci	bb_var -= bs * (bs * rpt);
1342cc1dc7a3Sopenharmony_ci	ba_cov -= as * (bs * rpt);
1343cc1dc7a3Sopenharmony_ci
1344cc1dc7a3Sopenharmony_ci	aa_var -= as * (as * rpt);
1345cc1dc7a3Sopenharmony_ci
1346cc1dc7a3Sopenharmony_ci	// These will give a NaN if a channel is constant - these are fixed up in the next step
1347cc1dc7a3Sopenharmony_ci	rg_cov *= astc::rsqrt(rr_var * gg_var);
1348cc1dc7a3Sopenharmony_ci	rb_cov *= astc::rsqrt(rr_var * bb_var);
1349cc1dc7a3Sopenharmony_ci	ra_cov *= astc::rsqrt(rr_var * aa_var);
1350cc1dc7a3Sopenharmony_ci	gb_cov *= astc::rsqrt(gg_var * bb_var);
1351cc1dc7a3Sopenharmony_ci	ga_cov *= astc::rsqrt(gg_var * aa_var);
1352cc1dc7a3Sopenharmony_ci	ba_cov *= astc::rsqrt(bb_var * aa_var);
1353cc1dc7a3Sopenharmony_ci
1354cc1dc7a3Sopenharmony_ci	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1355cc1dc7a3Sopenharmony_ci	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1356cc1dc7a3Sopenharmony_ci	if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1357cc1dc7a3Sopenharmony_ci	if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1358cc1dc7a3Sopenharmony_ci	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1359cc1dc7a3Sopenharmony_ci	if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1360cc1dc7a3Sopenharmony_ci
1361cc1dc7a3Sopenharmony_ci	float lowest_correlation = astc::min(fabsf(rg_cov),      fabsf(rb_cov));
1362cc1dc7a3Sopenharmony_ci	lowest_correlation       = astc::min(lowest_correlation, fabsf(ra_cov));
1363cc1dc7a3Sopenharmony_ci	lowest_correlation       = astc::min(lowest_correlation, fabsf(gb_cov));
1364cc1dc7a3Sopenharmony_ci	lowest_correlation       = astc::min(lowest_correlation, fabsf(ga_cov));
1365cc1dc7a3Sopenharmony_ci	lowest_correlation       = astc::min(lowest_correlation, fabsf(ba_cov));
1366cc1dc7a3Sopenharmony_ci
1367cc1dc7a3Sopenharmony_ci	// Diagnostic trace points
1368cc1dc7a3Sopenharmony_ci	trace_add_data("min_r", blk.data_min.lane<0>());
1369cc1dc7a3Sopenharmony_ci	trace_add_data("max_r", blk.data_max.lane<0>());
1370cc1dc7a3Sopenharmony_ci	trace_add_data("min_g", blk.data_min.lane<1>());
1371cc1dc7a3Sopenharmony_ci	trace_add_data("max_g", blk.data_max.lane<1>());
1372cc1dc7a3Sopenharmony_ci	trace_add_data("min_b", blk.data_min.lane<2>());
1373cc1dc7a3Sopenharmony_ci	trace_add_data("max_b", blk.data_max.lane<2>());
1374cc1dc7a3Sopenharmony_ci	trace_add_data("min_a", blk.data_min.lane<3>());
1375cc1dc7a3Sopenharmony_ci	trace_add_data("max_a", blk.data_max.lane<3>());
1376cc1dc7a3Sopenharmony_ci	trace_add_data("cov_rg", fabsf(rg_cov));
1377cc1dc7a3Sopenharmony_ci	trace_add_data("cov_rb", fabsf(rb_cov));
1378cc1dc7a3Sopenharmony_ci	trace_add_data("cov_ra", fabsf(ra_cov));
1379cc1dc7a3Sopenharmony_ci	trace_add_data("cov_gb", fabsf(gb_cov));
1380cc1dc7a3Sopenharmony_ci	trace_add_data("cov_ga", fabsf(ga_cov));
1381cc1dc7a3Sopenharmony_ci	trace_add_data("cov_ba", fabsf(ba_cov));
1382cc1dc7a3Sopenharmony_ci
1383cc1dc7a3Sopenharmony_ci	return lowest_correlation;
1384cc1dc7a3Sopenharmony_ci}
1385cc1dc7a3Sopenharmony_ci
1386cc1dc7a3Sopenharmony_ci/* See header for documentation. */
1387cc1dc7a3Sopenharmony_civoid compress_block(
1388cc1dc7a3Sopenharmony_ci	const astcenc_contexti& ctx,
1389cc1dc7a3Sopenharmony_ci	const image_block& blk,
1390cc1dc7a3Sopenharmony_ci	uint8_t pcb[16],
1391cc1dc7a3Sopenharmony_ci#if QUALITY_CONTROL
1392cc1dc7a3Sopenharmony_ci	compression_working_buffers& tmpbuf,
1393cc1dc7a3Sopenharmony_ci	bool calQualityEnable,
1394cc1dc7a3Sopenharmony_ci	int32_t *mseBlock[RGBA_COM]
1395cc1dc7a3Sopenharmony_ci#else
1396cc1dc7a3Sopenharmony_ci	compression_working_buffers& tmpbuf
1397cc1dc7a3Sopenharmony_ci#endif
1398cc1dc7a3Sopenharmony_ci	)
1399cc1dc7a3Sopenharmony_ci{
1400cc1dc7a3Sopenharmony_ci	astcenc_profile decode_mode = ctx.config.profile;
1401cc1dc7a3Sopenharmony_ci	symbolic_compressed_block scb;
1402cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd = *ctx.bsd;
1403cc1dc7a3Sopenharmony_ci	float lowest_correl;
1404cc1dc7a3Sopenharmony_ci
1405cc1dc7a3Sopenharmony_ci	TRACE_NODE(node0, "block");
1406cc1dc7a3Sopenharmony_ci	trace_add_data("pos_x", blk.xpos);
1407cc1dc7a3Sopenharmony_ci	trace_add_data("pos_y", blk.ypos);
1408cc1dc7a3Sopenharmony_ci	trace_add_data("pos_z", blk.zpos);
1409cc1dc7a3Sopenharmony_ci
1410cc1dc7a3Sopenharmony_ci	// Set stricter block targets for luminance data as we have more bits to play with
1411cc1dc7a3Sopenharmony_ci	bool block_is_l = blk.is_luminance();
1412cc1dc7a3Sopenharmony_ci	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1413cc1dc7a3Sopenharmony_ci
1414cc1dc7a3Sopenharmony_ci	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
1415cc1dc7a3Sopenharmony_ci	bool block_is_la = blk.is_luminancealpha();
1416cc1dc7a3Sopenharmony_ci	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1417cc1dc7a3Sopenharmony_ci
1418cc1dc7a3Sopenharmony_ci	bool block_skip_two_plane = false;
1419cc1dc7a3Sopenharmony_ci	int max_partitions;
1420cc1dc7a3Sopenharmony_ci	if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1421cc1dc7a3Sopenharmony_ci	{
1422cc1dc7a3Sopenharmony_ci		max_partitions = 1;
1423cc1dc7a3Sopenharmony_ci	}
1424cc1dc7a3Sopenharmony_ci#ifdef ASTC_CUSTOMIZED_ENABLE
1425cc1dc7a3Sopenharmony_ci	else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1426cc1dc7a3Sopenharmony_ci	{
1427cc1dc7a3Sopenharmony_ci		if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1428cc1dc7a3Sopenharmony_ci			g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr)
1429cc1dc7a3Sopenharmony_ci		{
1430cc1dc7a3Sopenharmony_ci			printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n");
1431cc1dc7a3Sopenharmony_ci			return;
1432cc1dc7a3Sopenharmony_ci		}
1433cc1dc7a3Sopenharmony_ci		max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_();
1434cc1dc7a3Sopenharmony_ci	}
1435cc1dc7a3Sopenharmony_ci#endif
1436cc1dc7a3Sopenharmony_ci	else
1437cc1dc7a3Sopenharmony_ci	{
1438cc1dc7a3Sopenharmony_ci		max_partitions = ctx.config.tune_partition_count_limit;
1439cc1dc7a3Sopenharmony_ci	}
1440cc1dc7a3Sopenharmony_ci
1441cc1dc7a3Sopenharmony_ci	unsigned int requested_partition_indices[3] {
1442cc1dc7a3Sopenharmony_ci		ctx.config.tune_2partition_index_limit,
1443cc1dc7a3Sopenharmony_ci		ctx.config.tune_3partition_index_limit,
1444cc1dc7a3Sopenharmony_ci		ctx.config.tune_4partition_index_limit
1445cc1dc7a3Sopenharmony_ci	};
1446cc1dc7a3Sopenharmony_ci
1447cc1dc7a3Sopenharmony_ci	unsigned int requested_partition_trials[3] {
1448cc1dc7a3Sopenharmony_ci		ctx.config.tune_2partitioning_candidate_limit,
1449cc1dc7a3Sopenharmony_ci		ctx.config.tune_3partitioning_candidate_limit,
1450cc1dc7a3Sopenharmony_ci		ctx.config.tune_4partitioning_candidate_limit
1451cc1dc7a3Sopenharmony_ci	};
1452cc1dc7a3Sopenharmony_ci
1453cc1dc7a3Sopenharmony_ci#if defined(ASTCENC_DIAGNOSTICS)
1454cc1dc7a3Sopenharmony_ci	// Do this early in diagnostic builds so we can dump uniform metrics
1455cc1dc7a3Sopenharmony_ci	// for every block. Do it later in release builds to avoid redundant work!
1456cc1dc7a3Sopenharmony_ci	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1457cc1dc7a3Sopenharmony_ci	float error_threshold = ctx.config.tune_db_limit
1458cc1dc7a3Sopenharmony_ci	                      * error_weight_sum
1459cc1dc7a3Sopenharmony_ci	                      * block_is_l_scale
1460cc1dc7a3Sopenharmony_ci	                      * block_is_la_scale;
1461cc1dc7a3Sopenharmony_ci
1462cc1dc7a3Sopenharmony_ci	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1463cc1dc7a3Sopenharmony_ci	trace_add_data("lowest_correl", lowest_correl);
1464cc1dc7a3Sopenharmony_ci	trace_add_data("tune_error_threshold", error_threshold);
1465cc1dc7a3Sopenharmony_ci#endif
1466cc1dc7a3Sopenharmony_ci
1467cc1dc7a3Sopenharmony_ci	// Detected a constant-color block
1468cc1dc7a3Sopenharmony_ci	if (all(blk.data_min == blk.data_max))
1469cc1dc7a3Sopenharmony_ci	{
1470cc1dc7a3Sopenharmony_ci		TRACE_NODE(node1, "pass");
1471cc1dc7a3Sopenharmony_ci		trace_add_data("partition_count", 0);
1472cc1dc7a3Sopenharmony_ci		trace_add_data("plane_count", 1);
1473cc1dc7a3Sopenharmony_ci
1474cc1dc7a3Sopenharmony_ci		scb.partition_count = 0;
1475cc1dc7a3Sopenharmony_ci
1476cc1dc7a3Sopenharmony_ci		// Encode as FP16 if using HDR
1477cc1dc7a3Sopenharmony_ci		if ((decode_mode == ASTCENC_PRF_HDR) ||
1478cc1dc7a3Sopenharmony_ci		    (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1479cc1dc7a3Sopenharmony_ci		{
1480cc1dc7a3Sopenharmony_ci			scb.block_type = SYM_BTYPE_CONST_F16;
1481cc1dc7a3Sopenharmony_ci			vint4 color_f16 = float_to_float16(blk.origin_texel);
1482cc1dc7a3Sopenharmony_ci			store(color_f16, scb.constant_color);
1483cc1dc7a3Sopenharmony_ci		}
1484cc1dc7a3Sopenharmony_ci		// Encode as UNORM16 if NOT using HDR
1485cc1dc7a3Sopenharmony_ci		else
1486cc1dc7a3Sopenharmony_ci		{
1487cc1dc7a3Sopenharmony_ci			scb.block_type = SYM_BTYPE_CONST_U16;
1488cc1dc7a3Sopenharmony_ci			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1489cc1dc7a3Sopenharmony_ci			vint4 color_u16 = float_to_int_rtn(color_f32);
1490cc1dc7a3Sopenharmony_ci			store(color_u16, scb.constant_color);
1491cc1dc7a3Sopenharmony_ci		}
1492cc1dc7a3Sopenharmony_ci
1493cc1dc7a3Sopenharmony_ci		trace_add_data("exit", "quality hit");
1494cc1dc7a3Sopenharmony_ci		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1495cc1dc7a3Sopenharmony_ci		{
1496cc1dc7a3Sopenharmony_ci			scb.block_type = SYM_BTYPE_NONCONST;
1497cc1dc7a3Sopenharmony_ci			scb.partition_count = 1;
1498cc1dc7a3Sopenharmony_ci			scb.color_formats_matched = 0;
1499cc1dc7a3Sopenharmony_ci			scb.plane2_component = -1;
1500cc1dc7a3Sopenharmony_ci			if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1501cc1dc7a3Sopenharmony_ci			{
1502cc1dc7a3Sopenharmony_ci				scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE;
1503cc1dc7a3Sopenharmony_ci			}
1504cc1dc7a3Sopenharmony_ci#ifdef ASTC_CUSTOMIZED_ENABLE
1505cc1dc7a3Sopenharmony_ci			else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1506cc1dc7a3Sopenharmony_ci			{
1507cc1dc7a3Sopenharmony_ci				if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1508cc1dc7a3Sopenharmony_ci					g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr)
1509cc1dc7a3Sopenharmony_ci				{
1510cc1dc7a3Sopenharmony_ci					printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n");
1511cc1dc7a3Sopenharmony_ci					return;
1512cc1dc7a3Sopenharmony_ci				}
1513cc1dc7a3Sopenharmony_ci				scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_();
1514cc1dc7a3Sopenharmony_ci			}
1515cc1dc7a3Sopenharmony_ci#endif
1516cc1dc7a3Sopenharmony_ci			scb.partition_index = 0;
1517cc1dc7a3Sopenharmony_ci			scb.quant_mode = QUANT_256;
1518cc1dc7a3Sopenharmony_ci			scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1519cc1dc7a3Sopenharmony_ci			for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1520cc1dc7a3Sopenharmony_ci				scb.weights[w] = 0;
1521cc1dc7a3Sopenharmony_ci			}
1522cc1dc7a3Sopenharmony_ci			for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
1523cc1dc7a3Sopenharmony_ci				scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
1524cc1dc7a3Sopenharmony_ci				scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
1525cc1dc7a3Sopenharmony_ci			}
1526cc1dc7a3Sopenharmony_ci		}
1527cc1dc7a3Sopenharmony_ci		scb.privateProfile = ctx.config.privateProfile;
1528cc1dc7a3Sopenharmony_ci		symbolic_to_physical(bsd, scb, pcb);
1529cc1dc7a3Sopenharmony_ci#if QUALITY_CONTROL
1530cc1dc7a3Sopenharmony_ci	if (calQualityEnable) {
1531cc1dc7a3Sopenharmony_ci		*mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0;
1532cc1dc7a3Sopenharmony_ci	}
1533cc1dc7a3Sopenharmony_ci#endif
1534cc1dc7a3Sopenharmony_ci		return;
1535cc1dc7a3Sopenharmony_ci	}
1536cc1dc7a3Sopenharmony_ci
1537cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DIAGNOSTICS)
1538cc1dc7a3Sopenharmony_ci	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1539cc1dc7a3Sopenharmony_ci	float error_threshold = ctx.config.tune_db_limit
1540cc1dc7a3Sopenharmony_ci	                      * error_weight_sum
1541cc1dc7a3Sopenharmony_ci	                      * block_is_l_scale
1542cc1dc7a3Sopenharmony_ci	                      * block_is_la_scale;
1543cc1dc7a3Sopenharmony_ci#endif
1544cc1dc7a3Sopenharmony_ci
1545cc1dc7a3Sopenharmony_ci	// Set SCB and mode errors to a very high error value
1546cc1dc7a3Sopenharmony_ci	scb.errorval = ERROR_CALC_DEFAULT;
1547cc1dc7a3Sopenharmony_ci	scb.block_type = SYM_BTYPE_ERROR;
1548cc1dc7a3Sopenharmony_ci
1549cc1dc7a3Sopenharmony_ci	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1550cc1dc7a3Sopenharmony_ci		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1551cc1dc7a3Sopenharmony_ci	};
1552cc1dc7a3Sopenharmony_ci
1553cc1dc7a3Sopenharmony_ci	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1554cc1dc7a3Sopenharmony_ci		0.0f,
1555cc1dc7a3Sopenharmony_ci		ctx.config.tune_2partition_early_out_limit_factor,
1556cc1dc7a3Sopenharmony_ci		ctx.config.tune_3partition_early_out_limit_factor,
1557cc1dc7a3Sopenharmony_ci		0.0f
1558cc1dc7a3Sopenharmony_ci	};
1559cc1dc7a3Sopenharmony_ci
1560cc1dc7a3Sopenharmony_ci	// Trial using 1 plane of weights and 1 partition.
1561cc1dc7a3Sopenharmony_ci
1562cc1dc7a3Sopenharmony_ci	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1563cc1dc7a3Sopenharmony_ci	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1564cc1dc7a3Sopenharmony_ci	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1565cc1dc7a3Sopenharmony_ci	// compression and slightly reduces image quality.
1566cc1dc7a3Sopenharmony_ci
1567cc1dc7a3Sopenharmony_ci	float errorval_mult[2] {
1568cc1dc7a3Sopenharmony_ci		1.0f / ctx.config.tune_mse_overshoot,
1569cc1dc7a3Sopenharmony_ci		1.0f
1570cc1dc7a3Sopenharmony_ci	};
1571cc1dc7a3Sopenharmony_ci
1572cc1dc7a3Sopenharmony_ci	static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1573cc1dc7a3Sopenharmony_ci
1574cc1dc7a3Sopenharmony_ci	// Only enable MODE0 fast path if enabled
1575cc1dc7a3Sopenharmony_ci	// Never enable for 3D blocks as no "always" block modes are available
1576cc1dc7a3Sopenharmony_ci	int start_trial = 1;
1577cc1dc7a3Sopenharmony_ci 	if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1578cc1dc7a3Sopenharmony_ci	{
1579cc1dc7a3Sopenharmony_ci		start_trial = 0;
1580cc1dc7a3Sopenharmony_ci	}
1581cc1dc7a3Sopenharmony_ci
1582cc1dc7a3Sopenharmony_ci	int quant_limit = QUANT_32;
1583cc1dc7a3Sopenharmony_ci	for (int i = start_trial; i < 2; i++)
1584cc1dc7a3Sopenharmony_ci	{
1585cc1dc7a3Sopenharmony_ci		TRACE_NODE(node1, "pass");
1586cc1dc7a3Sopenharmony_ci		trace_add_data("partition_count", 1);
1587cc1dc7a3Sopenharmony_ci		trace_add_data("plane_count", 1);
1588cc1dc7a3Sopenharmony_ci		trace_add_data("search_mode", i);
1589cc1dc7a3Sopenharmony_ci
1590cc1dc7a3Sopenharmony_ci		float errorval = compress_symbolic_block_for_partition_1plane(
1591cc1dc7a3Sopenharmony_ci		    ctx.config.privateProfile,
1592cc1dc7a3Sopenharmony_ci		    ctx.config, bsd, blk, i == 0,
1593cc1dc7a3Sopenharmony_ci		    error_threshold * errorval_mult[i] * errorval_overshoot,
1594cc1dc7a3Sopenharmony_ci		    1, 0,  scb, tmpbuf, QUANT_32);
1595cc1dc7a3Sopenharmony_ci
1596cc1dc7a3Sopenharmony_ci		// Record the quant level so we can use it to filter later searches
1597cc1dc7a3Sopenharmony_ci		const auto& bm = bsd.get_block_mode(scb.block_mode);
1598cc1dc7a3Sopenharmony_ci		quant_limit = bm.get_weight_quant_mode();
1599cc1dc7a3Sopenharmony_ci
1600cc1dc7a3Sopenharmony_ci		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1601cc1dc7a3Sopenharmony_ci		if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i])))
1602cc1dc7a3Sopenharmony_ci		{
1603cc1dc7a3Sopenharmony_ci			trace_add_data("exit", "quality hit");
1604cc1dc7a3Sopenharmony_ci			goto END_OF_TESTS;
1605cc1dc7a3Sopenharmony_ci		}
1606cc1dc7a3Sopenharmony_ci	}
1607cc1dc7a3Sopenharmony_ci
1608cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DIAGNOSTICS)
1609cc1dc7a3Sopenharmony_ci	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1610cc1dc7a3Sopenharmony_ci#endif
1611cc1dc7a3Sopenharmony_ci
1612cc1dc7a3Sopenharmony_ci	block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1613cc1dc7a3Sopenharmony_ci
1614cc1dc7a3Sopenharmony_ci	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1615cc1dc7a3Sopenharmony_ci	// alpha is the most likely to be non-correlated if it is present in the data.
1616cc1dc7a3Sopenharmony_ci	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1617cc1dc7a3Sopenharmony_ci	{
1618cc1dc7a3Sopenharmony_ci		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1619cc1dc7a3Sopenharmony_ci		{
1620cc1dc7a3Sopenharmony_ci			break;
1621cc1dc7a3Sopenharmony_ci		}
1622cc1dc7a3Sopenharmony_ci		TRACE_NODE(node1, "pass");
1623cc1dc7a3Sopenharmony_ci		trace_add_data("partition_count", 1);
1624cc1dc7a3Sopenharmony_ci		trace_add_data("plane_count", 2);
1625cc1dc7a3Sopenharmony_ci		trace_add_data("plane_component", i);
1626cc1dc7a3Sopenharmony_ci
1627cc1dc7a3Sopenharmony_ci		if (block_skip_two_plane)
1628cc1dc7a3Sopenharmony_ci		{
1629cc1dc7a3Sopenharmony_ci			trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1630cc1dc7a3Sopenharmony_ci			continue;
1631cc1dc7a3Sopenharmony_ci		}
1632cc1dc7a3Sopenharmony_ci
1633cc1dc7a3Sopenharmony_ci		if (blk.grayscale && i != 3)
1634cc1dc7a3Sopenharmony_ci		{
1635cc1dc7a3Sopenharmony_ci			trace_add_data("skip", "grayscale block");
1636cc1dc7a3Sopenharmony_ci			continue;
1637cc1dc7a3Sopenharmony_ci		}
1638cc1dc7a3Sopenharmony_ci
1639cc1dc7a3Sopenharmony_ci		if (blk.is_constant_channel(i))
1640cc1dc7a3Sopenharmony_ci		{
1641cc1dc7a3Sopenharmony_ci			trace_add_data("skip", "constant component");
1642cc1dc7a3Sopenharmony_ci			continue;
1643cc1dc7a3Sopenharmony_ci		}
1644cc1dc7a3Sopenharmony_ci
1645cc1dc7a3Sopenharmony_ci		float errorval = compress_symbolic_block_for_partition_2planes(
1646cc1dc7a3Sopenharmony_ci		    ctx.config.privateProfile,
1647cc1dc7a3Sopenharmony_ci		    ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1648cc1dc7a3Sopenharmony_ci		    i, scb, tmpbuf, quant_limit);
1649cc1dc7a3Sopenharmony_ci
1650cc1dc7a3Sopenharmony_ci		// If attempting two planes is much worse than the best one plane result
1651cc1dc7a3Sopenharmony_ci		// then further two plane searches are unlikely to help so move on ...
1652cc1dc7a3Sopenharmony_ci		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1653cc1dc7a3Sopenharmony_ci		{
1654cc1dc7a3Sopenharmony_ci			break;
1655cc1dc7a3Sopenharmony_ci		}
1656cc1dc7a3Sopenharmony_ci
1657cc1dc7a3Sopenharmony_ci		if (errorval < error_threshold)
1658cc1dc7a3Sopenharmony_ci		{
1659cc1dc7a3Sopenharmony_ci			trace_add_data("exit", "quality hit");
1660cc1dc7a3Sopenharmony_ci			goto END_OF_TESTS;
1661cc1dc7a3Sopenharmony_ci		}
1662cc1dc7a3Sopenharmony_ci	}
1663cc1dc7a3Sopenharmony_ci
1664cc1dc7a3Sopenharmony_ci	// Find best blocks for 2, 3 and 4 partitions
1665cc1dc7a3Sopenharmony_ci	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1666cc1dc7a3Sopenharmony_ci	{
1667cc1dc7a3Sopenharmony_ci		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1668cc1dc7a3Sopenharmony_ci
1669cc1dc7a3Sopenharmony_ci		unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1670cc1dc7a3Sopenharmony_ci
1671cc1dc7a3Sopenharmony_ci		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1672cc1dc7a3Sopenharmony_ci		requested_trials = astc::min(requested_trials, requested_indices);
1673cc1dc7a3Sopenharmony_ci
1674cc1dc7a3Sopenharmony_ci		unsigned int actual_trials = find_best_partition_candidates(
1675cc1dc7a3Sopenharmony_ci		    bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1676cc1dc7a3Sopenharmony_ci
1677cc1dc7a3Sopenharmony_ci		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1678cc1dc7a3Sopenharmony_ci
1679cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < actual_trials; i++)
1680cc1dc7a3Sopenharmony_ci		{
1681cc1dc7a3Sopenharmony_ci			TRACE_NODE(node1, "pass");
1682cc1dc7a3Sopenharmony_ci			trace_add_data("partition_count", partition_count);
1683cc1dc7a3Sopenharmony_ci			trace_add_data("partition_index", partition_indices[i]);
1684cc1dc7a3Sopenharmony_ci			trace_add_data("plane_count", 1);
1685cc1dc7a3Sopenharmony_ci			trace_add_data("search_mode", i);
1686cc1dc7a3Sopenharmony_ci
1687cc1dc7a3Sopenharmony_ci			float errorval = compress_symbolic_block_for_partition_1plane(
1688cc1dc7a3Sopenharmony_ci			    ctx.config.privateProfile,
1689cc1dc7a3Sopenharmony_ci			    ctx.config, bsd, blk, false,
1690cc1dc7a3Sopenharmony_ci			    error_threshold * errorval_overshoot,
1691cc1dc7a3Sopenharmony_ci			    partition_count, partition_indices[i],
1692cc1dc7a3Sopenharmony_ci			    scb, tmpbuf, quant_limit);
1693cc1dc7a3Sopenharmony_ci
1694cc1dc7a3Sopenharmony_ci			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1695cc1dc7a3Sopenharmony_ci
1696cc1dc7a3Sopenharmony_ci			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
1697cc1dc7a3Sopenharmony_ci			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
1698cc1dc7a3Sopenharmony_ci			// aligns with a partitioning that suits that encoding, so for this inner loop check add
1699cc1dc7a3Sopenharmony_ci			// a large error scale because the "other" trial could be a lot better.
1700cc1dc7a3Sopenharmony_ci			float best_error = best_errorvals_for_pcount[partition_count - 1];
1701cc1dc7a3Sopenharmony_ci			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1702cc1dc7a3Sopenharmony_ci			if (best_error > (best_error_in_prev * best_error_scale))
1703cc1dc7a3Sopenharmony_ci			{
1704cc1dc7a3Sopenharmony_ci				trace_add_data("skip", "tune_partition_early_out_limit_factor");
1705cc1dc7a3Sopenharmony_ci				goto END_OF_TESTS;
1706cc1dc7a3Sopenharmony_ci			}
1707cc1dc7a3Sopenharmony_ci
1708cc1dc7a3Sopenharmony_ci			if (errorval < error_threshold)
1709cc1dc7a3Sopenharmony_ci			{
1710cc1dc7a3Sopenharmony_ci				trace_add_data("exit", "quality hit");
1711cc1dc7a3Sopenharmony_ci				goto END_OF_TESTS;
1712cc1dc7a3Sopenharmony_ci			}
1713cc1dc7a3Sopenharmony_ci		}
1714cc1dc7a3Sopenharmony_ci
1715cc1dc7a3Sopenharmony_ci		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1716cc1dc7a3Sopenharmony_ci		float best_error = best_errorvals_for_pcount[partition_count - 1];
1717cc1dc7a3Sopenharmony_ci		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1718cc1dc7a3Sopenharmony_ci		if (best_error > (best_error_in_prev * best_error_scale))
1719cc1dc7a3Sopenharmony_ci		{
1720cc1dc7a3Sopenharmony_ci			trace_add_data("skip", "tune_partition_early_out_limit_factor");
1721cc1dc7a3Sopenharmony_ci			goto END_OF_TESTS;
1722cc1dc7a3Sopenharmony_ci		}
1723cc1dc7a3Sopenharmony_ci	}
1724cc1dc7a3Sopenharmony_ci
1725cc1dc7a3Sopenharmony_ci	trace_add_data("exit", "quality not hit");
1726cc1dc7a3Sopenharmony_ci
1727cc1dc7a3Sopenharmony_ciEND_OF_TESTS:
1728cc1dc7a3Sopenharmony_ci	// If we still have an error block then convert to something we can encode
1729cc1dc7a3Sopenharmony_ci	// TODO: Do something more sensible here, such as average color block
1730cc1dc7a3Sopenharmony_ci	if (scb.block_type == SYM_BTYPE_ERROR)
1731cc1dc7a3Sopenharmony_ci	{
1732cc1dc7a3Sopenharmony_ci#if defined(ASTCENC_DIAGNOSTICS)
1733cc1dc7a3Sopenharmony_ci		static bool printed_once = false;
1734cc1dc7a3Sopenharmony_ci		if (!printed_once)
1735cc1dc7a3Sopenharmony_ci		{
1736cc1dc7a3Sopenharmony_ci			printed_once = true;
1737cc1dc7a3Sopenharmony_ci			printf("WARN: At least one block failed to find a valid encoding.\n"
1738cc1dc7a3Sopenharmony_ci			       "      Try increasing compression quality settings.\n\n");
1739cc1dc7a3Sopenharmony_ci		}
1740cc1dc7a3Sopenharmony_ci#endif
1741cc1dc7a3Sopenharmony_ci
1742cc1dc7a3Sopenharmony_ci		scb.block_type = SYM_BTYPE_CONST_U16;
1743cc1dc7a3Sopenharmony_ci		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1744cc1dc7a3Sopenharmony_ci		vint4 color_u16 = float_to_int_rtn(color_f32);
1745cc1dc7a3Sopenharmony_ci		store(color_u16, scb.constant_color);
1746cc1dc7a3Sopenharmony_ci	}
1747cc1dc7a3Sopenharmony_ci
1748cc1dc7a3Sopenharmony_ci	// Compress to a physical block
1749cc1dc7a3Sopenharmony_ci	scb.privateProfile = ctx.config.privateProfile;
1750cc1dc7a3Sopenharmony_ci	symbolic_to_physical(bsd, scb, pcb);
1751cc1dc7a3Sopenharmony_ci#if QUALITY_CONTROL
1752cc1dc7a3Sopenharmony_ci	if (calQualityEnable) {
1753cc1dc7a3Sopenharmony_ci		image_block decBlk = blk;
1754cc1dc7a3Sopenharmony_ci		decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk);
1755cc1dc7a3Sopenharmony_ci		vint4 colorSumDiff = vint4::zero();
1756cc1dc7a3Sopenharmony_ci		for (size_t ii = 0; ii < bsd.texel_count; ii++) {
1757cc1dc7a3Sopenharmony_ci			vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f);
1758cc1dc7a3Sopenharmony_ci			vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f);
1759cc1dc7a3Sopenharmony_ci			vint4 colorDiff = colorRef - colorTest;
1760cc1dc7a3Sopenharmony_ci			colorSumDiff += colorDiff * colorDiff;
1761cc1dc7a3Sopenharmony_ci		}
1762cc1dc7a3Sopenharmony_ci		*mseBlock[R_COM] = colorSumDiff.lane<0>();
1763cc1dc7a3Sopenharmony_ci		*mseBlock[G_COM] = colorSumDiff.lane<1>();
1764cc1dc7a3Sopenharmony_ci		*mseBlock[B_COM] = colorSumDiff.lane<2>();
1765cc1dc7a3Sopenharmony_ci		*mseBlock[A_COM] = colorSumDiff.lane<3>();
1766cc1dc7a3Sopenharmony_ci    }
1767cc1dc7a3Sopenharmony_ci#endif
1768cc1dc7a3Sopenharmony_ci}
1769cc1dc7a3Sopenharmony_ci
1770cc1dc7a3Sopenharmony_ci#endif
1771