1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0
2cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
3cc1dc7a3Sopenharmony_ci// Copyright 2011-2024 Arm Limited
4cc1dc7a3Sopenharmony_ci//
5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy
7cc1dc7a3Sopenharmony_ci// of the License at:
8cc1dc7a3Sopenharmony_ci//
9cc1dc7a3Sopenharmony_ci//     http://www.apache.org/licenses/LICENSE-2.0
10cc1dc7a3Sopenharmony_ci//
11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software
12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations
15cc1dc7a3Sopenharmony_ci// under the License.
16cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
17cc1dc7a3Sopenharmony_ci
18cc1dc7a3Sopenharmony_ci/**
19cc1dc7a3Sopenharmony_ci * @brief Functions to decompress a symbolic block.
20cc1dc7a3Sopenharmony_ci */
21cc1dc7a3Sopenharmony_ci
22cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h"
23cc1dc7a3Sopenharmony_ci
24cc1dc7a3Sopenharmony_ci#include <stdio.h>
25cc1dc7a3Sopenharmony_ci#include <assert.h>
26cc1dc7a3Sopenharmony_ci
27cc1dc7a3Sopenharmony_ci/**
28cc1dc7a3Sopenharmony_ci * @brief Compute the integer linear interpolation of two color endpoints.
29cc1dc7a3Sopenharmony_ci *
30cc1dc7a3Sopenharmony_ci * @param u8_mask       The mask for lanes using decode_unorm8 rather than decode_f16.
31cc1dc7a3Sopenharmony_ci * @param color0        The endpoint0 color.
32cc1dc7a3Sopenharmony_ci * @param color1        The endpoint1 color.
33cc1dc7a3Sopenharmony_ci * @param weights       The interpolation weight (between 0 and 64).
34cc1dc7a3Sopenharmony_ci *
35cc1dc7a3Sopenharmony_ci * @return The interpolated color.
36cc1dc7a3Sopenharmony_ci */
37cc1dc7a3Sopenharmony_cistatic vint4 lerp_color_int(
38cc1dc7a3Sopenharmony_ci	vmask4 u8_mask,
39cc1dc7a3Sopenharmony_ci	vint4 color0,
40cc1dc7a3Sopenharmony_ci	vint4 color1,
41cc1dc7a3Sopenharmony_ci	vint4 weights
42cc1dc7a3Sopenharmony_ci) {
43cc1dc7a3Sopenharmony_ci	vint4 weight1 = weights;
44cc1dc7a3Sopenharmony_ci	vint4 weight0 = vint4(64) - weight1;
45cc1dc7a3Sopenharmony_ci
46cc1dc7a3Sopenharmony_ci	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
47cc1dc7a3Sopenharmony_ci	color = asr<6>(color);
48cc1dc7a3Sopenharmony_ci
49cc1dc7a3Sopenharmony_ci	// For decode_unorm8 values force the codec to bit replicate. This allows the
50cc1dc7a3Sopenharmony_ci	// rest of the codec to assume the full 0xFFFF range for everything and ignore
51cc1dc7a3Sopenharmony_ci	// the decode_mode setting
52cc1dc7a3Sopenharmony_ci	vint4 color_u8 = asr<8>(color) * vint4(257);
53cc1dc7a3Sopenharmony_ci	color = select(color, color_u8, u8_mask);
54cc1dc7a3Sopenharmony_ci
55cc1dc7a3Sopenharmony_ci	return color;
56cc1dc7a3Sopenharmony_ci}
57cc1dc7a3Sopenharmony_ci
58cc1dc7a3Sopenharmony_ci/**
59cc1dc7a3Sopenharmony_ci * @brief Convert integer color value into a float value for the decoder.
60cc1dc7a3Sopenharmony_ci *
61cc1dc7a3Sopenharmony_ci * @param data       The integer color value post-interpolation.
62cc1dc7a3Sopenharmony_ci * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
63cc1dc7a3Sopenharmony_ci *
64cc1dc7a3Sopenharmony_ci * @return The float color value.
65cc1dc7a3Sopenharmony_ci */
66cc1dc7a3Sopenharmony_cistatic inline vfloat4 decode_texel(
67cc1dc7a3Sopenharmony_ci	vint4 data,
68cc1dc7a3Sopenharmony_ci	vmask4 lns_mask
69cc1dc7a3Sopenharmony_ci) {
70cc1dc7a3Sopenharmony_ci	vint4 color_lns = vint4::zero();
71cc1dc7a3Sopenharmony_ci	vint4 color_unorm = vint4::zero();
72cc1dc7a3Sopenharmony_ci
73cc1dc7a3Sopenharmony_ci	if (any(lns_mask))
74cc1dc7a3Sopenharmony_ci	{
75cc1dc7a3Sopenharmony_ci		color_lns = lns_to_sf16(data);
76cc1dc7a3Sopenharmony_ci	}
77cc1dc7a3Sopenharmony_ci
78cc1dc7a3Sopenharmony_ci	if (!all(lns_mask))
79cc1dc7a3Sopenharmony_ci	{
80cc1dc7a3Sopenharmony_ci		color_unorm = unorm16_to_sf16(data);
81cc1dc7a3Sopenharmony_ci	}
82cc1dc7a3Sopenharmony_ci
83cc1dc7a3Sopenharmony_ci	// Pick components and then convert to FP16
84cc1dc7a3Sopenharmony_ci	vint4 datai = select(color_unorm, color_lns, lns_mask);
85cc1dc7a3Sopenharmony_ci	return float16_to_float(datai);
86cc1dc7a3Sopenharmony_ci}
87cc1dc7a3Sopenharmony_ci
88cc1dc7a3Sopenharmony_ci/* See header for documentation. */
89cc1dc7a3Sopenharmony_civoid unpack_weights(
90cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
91cc1dc7a3Sopenharmony_ci	const symbolic_compressed_block& scb,
92cc1dc7a3Sopenharmony_ci	const decimation_info& di,
93cc1dc7a3Sopenharmony_ci	bool is_dual_plane,
94cc1dc7a3Sopenharmony_ci	int weights_plane1[BLOCK_MAX_TEXELS],
95cc1dc7a3Sopenharmony_ci	int weights_plane2[BLOCK_MAX_TEXELS]
96cc1dc7a3Sopenharmony_ci) {
97cc1dc7a3Sopenharmony_ci	// Safe to overshoot as all arrays are allocated to full size
98cc1dc7a3Sopenharmony_ci	if (!is_dual_plane)
99cc1dc7a3Sopenharmony_ci	{
100cc1dc7a3Sopenharmony_ci		// Build full 64-entry weight lookup table
101cc1dc7a3Sopenharmony_ci		vint4 tab0 = vint4::load(scb.weights +  0);
102cc1dc7a3Sopenharmony_ci		vint4 tab1 = vint4::load(scb.weights + 16);
103cc1dc7a3Sopenharmony_ci		vint4 tab2 = vint4::load(scb.weights + 32);
104cc1dc7a3Sopenharmony_ci		vint4 tab3 = vint4::load(scb.weights + 48);
105cc1dc7a3Sopenharmony_ci
106cc1dc7a3Sopenharmony_ci		vint tab0p, tab1p, tab2p, tab3p;
107cc1dc7a3Sopenharmony_ci		vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
108cc1dc7a3Sopenharmony_ci
109cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
110cc1dc7a3Sopenharmony_ci		{
111cc1dc7a3Sopenharmony_ci			vint summed_value(8);
112cc1dc7a3Sopenharmony_ci			vint weight_count(di.texel_weight_count + i);
113cc1dc7a3Sopenharmony_ci			int max_weight_count = hmax(weight_count).lane<0>();
114cc1dc7a3Sopenharmony_ci
115cc1dc7a3Sopenharmony_ci			promise(max_weight_count > 0);
116cc1dc7a3Sopenharmony_ci			for (int j = 0; j < max_weight_count; j++)
117cc1dc7a3Sopenharmony_ci			{
118cc1dc7a3Sopenharmony_ci				vint texel_weights(di.texel_weights_tr[j] + i);
119cc1dc7a3Sopenharmony_ci				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
120cc1dc7a3Sopenharmony_ci
121cc1dc7a3Sopenharmony_ci				summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
122cc1dc7a3Sopenharmony_ci			}
123cc1dc7a3Sopenharmony_ci
124cc1dc7a3Sopenharmony_ci			store(lsr<4>(summed_value), weights_plane1 + i);
125cc1dc7a3Sopenharmony_ci		}
126cc1dc7a3Sopenharmony_ci	}
127cc1dc7a3Sopenharmony_ci	else
128cc1dc7a3Sopenharmony_ci	{
129cc1dc7a3Sopenharmony_ci		// Build a 32-entry weight lookup table per plane
130cc1dc7a3Sopenharmony_ci		// Plane 1
131cc1dc7a3Sopenharmony_ci		vint4 tab0_plane1 = vint4::load(scb.weights +  0);
132cc1dc7a3Sopenharmony_ci		vint4 tab1_plane1 = vint4::load(scb.weights + 16);
133cc1dc7a3Sopenharmony_ci		vint tab0_plane1p, tab1_plane1p;
134cc1dc7a3Sopenharmony_ci		vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
135cc1dc7a3Sopenharmony_ci
136cc1dc7a3Sopenharmony_ci		// Plane 2
137cc1dc7a3Sopenharmony_ci		vint4 tab0_plane2 = vint4::load(scb.weights + 32);
138cc1dc7a3Sopenharmony_ci		vint4 tab1_plane2 = vint4::load(scb.weights + 48);
139cc1dc7a3Sopenharmony_ci		vint tab0_plane2p, tab1_plane2p;
140cc1dc7a3Sopenharmony_ci		vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
141cc1dc7a3Sopenharmony_ci
142cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
143cc1dc7a3Sopenharmony_ci		{
144cc1dc7a3Sopenharmony_ci			vint sum_plane1(8);
145cc1dc7a3Sopenharmony_ci			vint sum_plane2(8);
146cc1dc7a3Sopenharmony_ci
147cc1dc7a3Sopenharmony_ci			vint weight_count(di.texel_weight_count + i);
148cc1dc7a3Sopenharmony_ci			int max_weight_count = hmax(weight_count).lane<0>();
149cc1dc7a3Sopenharmony_ci
150cc1dc7a3Sopenharmony_ci			promise(max_weight_count > 0);
151cc1dc7a3Sopenharmony_ci			for (int j = 0; j < max_weight_count; j++)
152cc1dc7a3Sopenharmony_ci			{
153cc1dc7a3Sopenharmony_ci				vint texel_weights(di.texel_weights_tr[j] + i);
154cc1dc7a3Sopenharmony_ci				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
155cc1dc7a3Sopenharmony_ci
156cc1dc7a3Sopenharmony_ci				sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
157cc1dc7a3Sopenharmony_ci				sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
158cc1dc7a3Sopenharmony_ci			}
159cc1dc7a3Sopenharmony_ci
160cc1dc7a3Sopenharmony_ci			store(lsr<4>(sum_plane1), weights_plane1 + i);
161cc1dc7a3Sopenharmony_ci			store(lsr<4>(sum_plane2), weights_plane2 + i);
162cc1dc7a3Sopenharmony_ci		}
163cc1dc7a3Sopenharmony_ci	}
164cc1dc7a3Sopenharmony_ci}
165cc1dc7a3Sopenharmony_ci
166cc1dc7a3Sopenharmony_ci/**
167cc1dc7a3Sopenharmony_ci * @brief Return an FP32 NaN value for use in error colors.
168cc1dc7a3Sopenharmony_ci *
169cc1dc7a3Sopenharmony_ci * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
170cc1dc7a3Sopenharmony_ci *
171cc1dc7a3Sopenharmony_ci * @return The float color value.
172cc1dc7a3Sopenharmony_ci */
173cc1dc7a3Sopenharmony_cistatic float error_color_nan()
174cc1dc7a3Sopenharmony_ci{
175cc1dc7a3Sopenharmony_ci	if32 v;
176cc1dc7a3Sopenharmony_ci	v.u = 0xFFFFE000U;
177cc1dc7a3Sopenharmony_ci	return v.f;
178cc1dc7a3Sopenharmony_ci}
179cc1dc7a3Sopenharmony_ci
180cc1dc7a3Sopenharmony_ci/* See header for documentation. */
181cc1dc7a3Sopenharmony_civoid decompress_symbolic_block(
182cc1dc7a3Sopenharmony_ci	astcenc_profile decode_mode,
183cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
184cc1dc7a3Sopenharmony_ci	int xpos,
185cc1dc7a3Sopenharmony_ci	int ypos,
186cc1dc7a3Sopenharmony_ci	int zpos,
187cc1dc7a3Sopenharmony_ci	const symbolic_compressed_block& scb,
188cc1dc7a3Sopenharmony_ci	image_block& blk
189cc1dc7a3Sopenharmony_ci) {
190cc1dc7a3Sopenharmony_ci	blk.xpos = xpos;
191cc1dc7a3Sopenharmony_ci	blk.ypos = ypos;
192cc1dc7a3Sopenharmony_ci	blk.zpos = zpos;
193cc1dc7a3Sopenharmony_ci
194cc1dc7a3Sopenharmony_ci	blk.data_min = vfloat4::zero();
195cc1dc7a3Sopenharmony_ci	blk.data_mean = vfloat4::zero();
196cc1dc7a3Sopenharmony_ci	blk.data_max = vfloat4::zero();
197cc1dc7a3Sopenharmony_ci	blk.grayscale = false;
198cc1dc7a3Sopenharmony_ci
199cc1dc7a3Sopenharmony_ci	// If we detected an error-block, blow up immediately.
200cc1dc7a3Sopenharmony_ci	if (scb.block_type == SYM_BTYPE_ERROR)
201cc1dc7a3Sopenharmony_ci	{
202cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < bsd.texel_count; i++)
203cc1dc7a3Sopenharmony_ci		{
204cc1dc7a3Sopenharmony_ci			blk.data_r[i] = error_color_nan();
205cc1dc7a3Sopenharmony_ci			blk.data_g[i] = error_color_nan();
206cc1dc7a3Sopenharmony_ci			blk.data_b[i] = error_color_nan();
207cc1dc7a3Sopenharmony_ci			blk.data_a[i] = error_color_nan();
208cc1dc7a3Sopenharmony_ci			blk.rgb_lns[i] = 0;
209cc1dc7a3Sopenharmony_ci			blk.alpha_lns[i] = 0;
210cc1dc7a3Sopenharmony_ci		}
211cc1dc7a3Sopenharmony_ci
212cc1dc7a3Sopenharmony_ci		return;
213cc1dc7a3Sopenharmony_ci	}
214cc1dc7a3Sopenharmony_ci
215cc1dc7a3Sopenharmony_ci	if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
216cc1dc7a3Sopenharmony_ci	    (scb.block_type == SYM_BTYPE_CONST_U16))
217cc1dc7a3Sopenharmony_ci	{
218cc1dc7a3Sopenharmony_ci		vfloat4 color;
219cc1dc7a3Sopenharmony_ci		uint8_t use_lns = 0;
220cc1dc7a3Sopenharmony_ci
221cc1dc7a3Sopenharmony_ci		// UNORM16 constant color block
222cc1dc7a3Sopenharmony_ci		if (scb.block_type == SYM_BTYPE_CONST_U16)
223cc1dc7a3Sopenharmony_ci		{
224cc1dc7a3Sopenharmony_ci			vint4 colori(scb.constant_color);
225cc1dc7a3Sopenharmony_ci
226cc1dc7a3Sopenharmony_ci			// Determine the UNORM8 rounding on the decode
227cc1dc7a3Sopenharmony_ci			vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
228cc1dc7a3Sopenharmony_ci
229cc1dc7a3Sopenharmony_ci			// The real decoder would just use the top 8 bits, but we rescale
230cc1dc7a3Sopenharmony_ci			// in to a 16-bit value that rounds correctly.
231cc1dc7a3Sopenharmony_ci			vint4 colori_u8 = asr<8>(colori) * 257;
232cc1dc7a3Sopenharmony_ci			colori = select(colori, colori_u8, u8_mask);
233cc1dc7a3Sopenharmony_ci
234cc1dc7a3Sopenharmony_ci			vint4 colorf16 = unorm16_to_sf16(colori);
235cc1dc7a3Sopenharmony_ci			color = float16_to_float(colorf16);
236cc1dc7a3Sopenharmony_ci		}
237cc1dc7a3Sopenharmony_ci		// FLOAT16 constant color block
238cc1dc7a3Sopenharmony_ci		else
239cc1dc7a3Sopenharmony_ci		{
240cc1dc7a3Sopenharmony_ci			switch (decode_mode)
241cc1dc7a3Sopenharmony_ci			{
242cc1dc7a3Sopenharmony_ci			case ASTCENC_PRF_LDR_SRGB:
243cc1dc7a3Sopenharmony_ci			case ASTCENC_PRF_LDR:
244cc1dc7a3Sopenharmony_ci				color = vfloat4(error_color_nan());
245cc1dc7a3Sopenharmony_ci				break;
246cc1dc7a3Sopenharmony_ci			case ASTCENC_PRF_HDR_RGB_LDR_A:
247cc1dc7a3Sopenharmony_ci			case ASTCENC_PRF_HDR:
248cc1dc7a3Sopenharmony_ci				// Constant-color block; unpack from FP16 to FP32.
249cc1dc7a3Sopenharmony_ci				color = float16_to_float(vint4(scb.constant_color));
250cc1dc7a3Sopenharmony_ci				use_lns = 1;
251cc1dc7a3Sopenharmony_ci				break;
252cc1dc7a3Sopenharmony_ci			}
253cc1dc7a3Sopenharmony_ci		}
254cc1dc7a3Sopenharmony_ci
255cc1dc7a3Sopenharmony_ci		for (unsigned int i = 0; i < bsd.texel_count; i++)
256cc1dc7a3Sopenharmony_ci		{
257cc1dc7a3Sopenharmony_ci			blk.data_r[i] = color.lane<0>();
258cc1dc7a3Sopenharmony_ci			blk.data_g[i] = color.lane<1>();
259cc1dc7a3Sopenharmony_ci			blk.data_b[i] = color.lane<2>();
260cc1dc7a3Sopenharmony_ci			blk.data_a[i] = color.lane<3>();
261cc1dc7a3Sopenharmony_ci			blk.rgb_lns[i] = use_lns;
262cc1dc7a3Sopenharmony_ci			blk.alpha_lns[i] = use_lns;
263cc1dc7a3Sopenharmony_ci		}
264cc1dc7a3Sopenharmony_ci
265cc1dc7a3Sopenharmony_ci		return;
266cc1dc7a3Sopenharmony_ci	}
267cc1dc7a3Sopenharmony_ci
268cc1dc7a3Sopenharmony_ci	// Get the appropriate partition-table entry
269cc1dc7a3Sopenharmony_ci	int partition_count = scb.partition_count;
270cc1dc7a3Sopenharmony_ci	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
271cc1dc7a3Sopenharmony_ci
272cc1dc7a3Sopenharmony_ci	// Get the appropriate block descriptors
273cc1dc7a3Sopenharmony_ci	const auto& bm = bsd.get_block_mode(scb.block_mode);
274cc1dc7a3Sopenharmony_ci	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
275cc1dc7a3Sopenharmony_ci
276cc1dc7a3Sopenharmony_ci	bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
277cc1dc7a3Sopenharmony_ci
278cc1dc7a3Sopenharmony_ci	// Unquantize and undecimate the weights
279cc1dc7a3Sopenharmony_ci	int plane1_weights[BLOCK_MAX_TEXELS];
280cc1dc7a3Sopenharmony_ci	int plane2_weights[BLOCK_MAX_TEXELS];
281cc1dc7a3Sopenharmony_ci	unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
282cc1dc7a3Sopenharmony_ci
283cc1dc7a3Sopenharmony_ci	// Now that we have endpoint colors and weights, we can unpack texel colors
284cc1dc7a3Sopenharmony_ci	int plane2_component = scb.plane2_component;
285cc1dc7a3Sopenharmony_ci	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
286cc1dc7a3Sopenharmony_ci
287cc1dc7a3Sopenharmony_ci	vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
288cc1dc7a3Sopenharmony_ci
289cc1dc7a3Sopenharmony_ci	for (int i = 0; i < partition_count; i++)
290cc1dc7a3Sopenharmony_ci	{
291cc1dc7a3Sopenharmony_ci		// Decode the color endpoints for this partition
292cc1dc7a3Sopenharmony_ci		vint4 ep0;
293cc1dc7a3Sopenharmony_ci		vint4 ep1;
294cc1dc7a3Sopenharmony_ci		bool rgb_lns;
295cc1dc7a3Sopenharmony_ci		bool a_lns;
296cc1dc7a3Sopenharmony_ci
297cc1dc7a3Sopenharmony_ci		unpack_color_endpoints(decode_mode,
298cc1dc7a3Sopenharmony_ci		                       scb.color_formats[i],
299cc1dc7a3Sopenharmony_ci		                       scb.color_values[i],
300cc1dc7a3Sopenharmony_ci		                       rgb_lns, a_lns,
301cc1dc7a3Sopenharmony_ci		                       ep0, ep1);
302cc1dc7a3Sopenharmony_ci
303cc1dc7a3Sopenharmony_ci		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
304cc1dc7a3Sopenharmony_ci
305cc1dc7a3Sopenharmony_ci		int texel_count = pi.partition_texel_count[i];
306cc1dc7a3Sopenharmony_ci		for (int j = 0; j < texel_count; j++)
307cc1dc7a3Sopenharmony_ci		{
308cc1dc7a3Sopenharmony_ci			int tix = pi.texels_of_partition[i][j];
309cc1dc7a3Sopenharmony_ci			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
310cc1dc7a3Sopenharmony_ci			vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
311cc1dc7a3Sopenharmony_ci			vfloat4 colorf = decode_texel(color, lns_mask);
312cc1dc7a3Sopenharmony_ci
313cc1dc7a3Sopenharmony_ci			blk.data_r[tix] = colorf.lane<0>();
314cc1dc7a3Sopenharmony_ci			blk.data_g[tix] = colorf.lane<1>();
315cc1dc7a3Sopenharmony_ci			blk.data_b[tix] = colorf.lane<2>();
316cc1dc7a3Sopenharmony_ci			blk.data_a[tix] = colorf.lane<3>();
317cc1dc7a3Sopenharmony_ci		}
318cc1dc7a3Sopenharmony_ci	}
319cc1dc7a3Sopenharmony_ci}
320cc1dc7a3Sopenharmony_ci
321cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY)
322cc1dc7a3Sopenharmony_ci
323cc1dc7a3Sopenharmony_ci/* See header for documentation. */
324cc1dc7a3Sopenharmony_cifloat compute_symbolic_block_difference_2plane(
325cc1dc7a3Sopenharmony_ci	const astcenc_config& config,
326cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
327cc1dc7a3Sopenharmony_ci	const symbolic_compressed_block& scb,
328cc1dc7a3Sopenharmony_ci	const image_block& blk
329cc1dc7a3Sopenharmony_ci) {
330cc1dc7a3Sopenharmony_ci	// If we detected an error-block, blow up immediately.
331cc1dc7a3Sopenharmony_ci	if (scb.block_type == SYM_BTYPE_ERROR)
332cc1dc7a3Sopenharmony_ci	{
333cc1dc7a3Sopenharmony_ci		return ERROR_CALC_DEFAULT;
334cc1dc7a3Sopenharmony_ci	}
335cc1dc7a3Sopenharmony_ci
336cc1dc7a3Sopenharmony_ci	assert(scb.block_mode >= 0);
337cc1dc7a3Sopenharmony_ci	assert(scb.partition_count == 1);
338cc1dc7a3Sopenharmony_ci	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
339cc1dc7a3Sopenharmony_ci
340cc1dc7a3Sopenharmony_ci	// Get the appropriate block descriptor
341cc1dc7a3Sopenharmony_ci	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
342cc1dc7a3Sopenharmony_ci	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
343cc1dc7a3Sopenharmony_ci
344cc1dc7a3Sopenharmony_ci	// Unquantize and undecimate the weights
345cc1dc7a3Sopenharmony_ci	int plane1_weights[BLOCK_MAX_TEXELS];
346cc1dc7a3Sopenharmony_ci	int plane2_weights[BLOCK_MAX_TEXELS];
347cc1dc7a3Sopenharmony_ci	unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
348cc1dc7a3Sopenharmony_ci
349cc1dc7a3Sopenharmony_ci	vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
350cc1dc7a3Sopenharmony_ci
351cc1dc7a3Sopenharmony_ci	vfloat4 summa = vfloat4::zero();
352cc1dc7a3Sopenharmony_ci
353cc1dc7a3Sopenharmony_ci	// Decode the color endpoints for this partition
354cc1dc7a3Sopenharmony_ci	vint4 ep0;
355cc1dc7a3Sopenharmony_ci	vint4 ep1;
356cc1dc7a3Sopenharmony_ci	bool rgb_lns;
357cc1dc7a3Sopenharmony_ci	bool a_lns;
358cc1dc7a3Sopenharmony_ci
359cc1dc7a3Sopenharmony_ci	unpack_color_endpoints(config.profile,
360cc1dc7a3Sopenharmony_ci	                       scb.color_formats[0],
361cc1dc7a3Sopenharmony_ci	                       scb.color_values[0],
362cc1dc7a3Sopenharmony_ci	                       rgb_lns, a_lns,
363cc1dc7a3Sopenharmony_ci	                       ep0, ep1);
364cc1dc7a3Sopenharmony_ci
365cc1dc7a3Sopenharmony_ci	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
366cc1dc7a3Sopenharmony_ci
367cc1dc7a3Sopenharmony_ci	// Unpack and compute error for each texel in the partition
368cc1dc7a3Sopenharmony_ci	unsigned int texel_count = bsd.texel_count;
369cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < texel_count; i++)
370cc1dc7a3Sopenharmony_ci	{
371cc1dc7a3Sopenharmony_ci		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
372cc1dc7a3Sopenharmony_ci		vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
373cc1dc7a3Sopenharmony_ci
374cc1dc7a3Sopenharmony_ci		vfloat4 color = int_to_float(colori);
375cc1dc7a3Sopenharmony_ci		vfloat4 oldColor = blk.texel(i);
376cc1dc7a3Sopenharmony_ci
377cc1dc7a3Sopenharmony_ci		// Compare error using a perceptual decode metric for RGBM textures
378cc1dc7a3Sopenharmony_ci		if (config.flags & ASTCENC_FLG_MAP_RGBM)
379cc1dc7a3Sopenharmony_ci		{
380cc1dc7a3Sopenharmony_ci			// Fail encodings that result in zero weight M pixels. Note that this can cause
381cc1dc7a3Sopenharmony_ci			// "interesting" artifacts if we reject all useful encodings - we typically get max
382cc1dc7a3Sopenharmony_ci			// brightness encodings instead which look just as bad. We recommend users apply a
383cc1dc7a3Sopenharmony_ci			// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
384cc1dc7a3Sopenharmony_ci			// getting small M values post-quantization, but we can't prove it would never
385cc1dc7a3Sopenharmony_ci			// happen, especially at low bit rates ...
386cc1dc7a3Sopenharmony_ci			if (color.lane<3>() == 0.0f)
387cc1dc7a3Sopenharmony_ci			{
388cc1dc7a3Sopenharmony_ci				return -ERROR_CALC_DEFAULT;
389cc1dc7a3Sopenharmony_ci			}
390cc1dc7a3Sopenharmony_ci
391cc1dc7a3Sopenharmony_ci			// Compute error based on decoded RGBM color
392cc1dc7a3Sopenharmony_ci			color = vfloat4(
393cc1dc7a3Sopenharmony_ci				color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
394cc1dc7a3Sopenharmony_ci				color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
395cc1dc7a3Sopenharmony_ci				color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
396cc1dc7a3Sopenharmony_ci				1.0f
397cc1dc7a3Sopenharmony_ci			);
398cc1dc7a3Sopenharmony_ci
399cc1dc7a3Sopenharmony_ci			oldColor = vfloat4(
400cc1dc7a3Sopenharmony_ci				oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
401cc1dc7a3Sopenharmony_ci				oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
402cc1dc7a3Sopenharmony_ci				oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
403cc1dc7a3Sopenharmony_ci				1.0f
404cc1dc7a3Sopenharmony_ci			);
405cc1dc7a3Sopenharmony_ci		}
406cc1dc7a3Sopenharmony_ci
407cc1dc7a3Sopenharmony_ci		vfloat4 error = oldColor - color;
408cc1dc7a3Sopenharmony_ci		error = min(abs(error), 1e15f);
409cc1dc7a3Sopenharmony_ci		error = error * error;
410cc1dc7a3Sopenharmony_ci
411cc1dc7a3Sopenharmony_ci		summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
412cc1dc7a3Sopenharmony_ci	}
413cc1dc7a3Sopenharmony_ci
414cc1dc7a3Sopenharmony_ci	return summa.lane<0>();
415cc1dc7a3Sopenharmony_ci}
416cc1dc7a3Sopenharmony_ci
417cc1dc7a3Sopenharmony_ci/* See header for documentation. */
418cc1dc7a3Sopenharmony_cifloat compute_symbolic_block_difference_1plane(
419cc1dc7a3Sopenharmony_ci	const astcenc_config& config,
420cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
421cc1dc7a3Sopenharmony_ci	const symbolic_compressed_block& scb,
422cc1dc7a3Sopenharmony_ci	const image_block& blk
423cc1dc7a3Sopenharmony_ci) {
424cc1dc7a3Sopenharmony_ci	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
425cc1dc7a3Sopenharmony_ci
426cc1dc7a3Sopenharmony_ci	// If we detected an error-block, blow up immediately.
427cc1dc7a3Sopenharmony_ci	if (scb.block_type == SYM_BTYPE_ERROR)
428cc1dc7a3Sopenharmony_ci	{
429cc1dc7a3Sopenharmony_ci		return ERROR_CALC_DEFAULT;
430cc1dc7a3Sopenharmony_ci	}
431cc1dc7a3Sopenharmony_ci
432cc1dc7a3Sopenharmony_ci	assert(scb.block_mode >= 0);
433cc1dc7a3Sopenharmony_ci
434cc1dc7a3Sopenharmony_ci	// Get the appropriate partition-table entry
435cc1dc7a3Sopenharmony_ci	unsigned int partition_count = scb.partition_count;
436cc1dc7a3Sopenharmony_ci	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
437cc1dc7a3Sopenharmony_ci
438cc1dc7a3Sopenharmony_ci	// Get the appropriate block descriptor
439cc1dc7a3Sopenharmony_ci	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
440cc1dc7a3Sopenharmony_ci	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
441cc1dc7a3Sopenharmony_ci
442cc1dc7a3Sopenharmony_ci	// Unquantize and undecimate the weights
443cc1dc7a3Sopenharmony_ci	int plane1_weights[BLOCK_MAX_TEXELS];
444cc1dc7a3Sopenharmony_ci	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
445cc1dc7a3Sopenharmony_ci
446cc1dc7a3Sopenharmony_ci	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
447cc1dc7a3Sopenharmony_ci
448cc1dc7a3Sopenharmony_ci	vfloat4 summa = vfloat4::zero();
449cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < partition_count; i++)
450cc1dc7a3Sopenharmony_ci	{
451cc1dc7a3Sopenharmony_ci		// Decode the color endpoints for this partition
452cc1dc7a3Sopenharmony_ci		vint4 ep0;
453cc1dc7a3Sopenharmony_ci		vint4 ep1;
454cc1dc7a3Sopenharmony_ci		bool rgb_lns;
455cc1dc7a3Sopenharmony_ci		bool a_lns;
456cc1dc7a3Sopenharmony_ci
457cc1dc7a3Sopenharmony_ci		unpack_color_endpoints(config.profile,
458cc1dc7a3Sopenharmony_ci		                       scb.color_formats[i],
459cc1dc7a3Sopenharmony_ci		                       scb.color_values[i],
460cc1dc7a3Sopenharmony_ci		                       rgb_lns, a_lns,
461cc1dc7a3Sopenharmony_ci		                       ep0, ep1);
462cc1dc7a3Sopenharmony_ci
463cc1dc7a3Sopenharmony_ci		// Unpack and compute error for each texel in the partition
464cc1dc7a3Sopenharmony_ci		unsigned int texel_count = pi.partition_texel_count[i];
465cc1dc7a3Sopenharmony_ci		for (unsigned int j = 0; j < texel_count; j++)
466cc1dc7a3Sopenharmony_ci		{
467cc1dc7a3Sopenharmony_ci			unsigned int tix = pi.texels_of_partition[i][j];
468cc1dc7a3Sopenharmony_ci			vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
469cc1dc7a3Sopenharmony_ci			                              vint4(plane1_weights[tix]));
470cc1dc7a3Sopenharmony_ci
471cc1dc7a3Sopenharmony_ci			vfloat4 color = int_to_float(colori);
472cc1dc7a3Sopenharmony_ci			vfloat4 oldColor = blk.texel(tix);
473cc1dc7a3Sopenharmony_ci
474cc1dc7a3Sopenharmony_ci			// Compare error using a perceptual decode metric for RGBM textures
475cc1dc7a3Sopenharmony_ci			if (config.flags & ASTCENC_FLG_MAP_RGBM)
476cc1dc7a3Sopenharmony_ci			{
477cc1dc7a3Sopenharmony_ci				// Fail encodings that result in zero weight M pixels. Note that this can cause
478cc1dc7a3Sopenharmony_ci				// "interesting" artifacts if we reject all useful encodings - we typically get max
479cc1dc7a3Sopenharmony_ci				// brightness encodings instead which look just as bad. We recommend users apply a
480cc1dc7a3Sopenharmony_ci				// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
481cc1dc7a3Sopenharmony_ci				// getting small M values post-quantization, but we can't prove it would never
482cc1dc7a3Sopenharmony_ci				// happen, especially at low bit rates ...
483cc1dc7a3Sopenharmony_ci				if (color.lane<3>() == 0.0f)
484cc1dc7a3Sopenharmony_ci				{
485cc1dc7a3Sopenharmony_ci					return -ERROR_CALC_DEFAULT;
486cc1dc7a3Sopenharmony_ci				}
487cc1dc7a3Sopenharmony_ci
488cc1dc7a3Sopenharmony_ci				// Compute error based on decoded RGBM color
489cc1dc7a3Sopenharmony_ci				color = vfloat4(
490cc1dc7a3Sopenharmony_ci					color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
491cc1dc7a3Sopenharmony_ci					color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
492cc1dc7a3Sopenharmony_ci					color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
493cc1dc7a3Sopenharmony_ci					1.0f
494cc1dc7a3Sopenharmony_ci				);
495cc1dc7a3Sopenharmony_ci
496cc1dc7a3Sopenharmony_ci				oldColor = vfloat4(
497cc1dc7a3Sopenharmony_ci					oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
498cc1dc7a3Sopenharmony_ci					oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
499cc1dc7a3Sopenharmony_ci					oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
500cc1dc7a3Sopenharmony_ci					1.0f
501cc1dc7a3Sopenharmony_ci				);
502cc1dc7a3Sopenharmony_ci			}
503cc1dc7a3Sopenharmony_ci
504cc1dc7a3Sopenharmony_ci			vfloat4 error = oldColor - color;
505cc1dc7a3Sopenharmony_ci			error = min(abs(error), 1e15f);
506cc1dc7a3Sopenharmony_ci			error = error * error;
507cc1dc7a3Sopenharmony_ci
508cc1dc7a3Sopenharmony_ci			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
509cc1dc7a3Sopenharmony_ci		}
510cc1dc7a3Sopenharmony_ci	}
511cc1dc7a3Sopenharmony_ci
512cc1dc7a3Sopenharmony_ci	return summa.lane<0>();
513cc1dc7a3Sopenharmony_ci}
514cc1dc7a3Sopenharmony_ci
515cc1dc7a3Sopenharmony_ci/* See header for documentation. */
516cc1dc7a3Sopenharmony_cifloat compute_symbolic_block_difference_1plane_1partition(
517cc1dc7a3Sopenharmony_ci	const astcenc_config& config,
518cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
519cc1dc7a3Sopenharmony_ci	const symbolic_compressed_block& scb,
520cc1dc7a3Sopenharmony_ci	const image_block& blk
521cc1dc7a3Sopenharmony_ci) {
522cc1dc7a3Sopenharmony_ci	// If we detected an error-block, blow up immediately.
523cc1dc7a3Sopenharmony_ci	if (scb.block_type == SYM_BTYPE_ERROR)
524cc1dc7a3Sopenharmony_ci	{
525cc1dc7a3Sopenharmony_ci		return ERROR_CALC_DEFAULT;
526cc1dc7a3Sopenharmony_ci	}
527cc1dc7a3Sopenharmony_ci
528cc1dc7a3Sopenharmony_ci	assert(scb.block_mode >= 0);
529cc1dc7a3Sopenharmony_ci	assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
530cc1dc7a3Sopenharmony_ci
531cc1dc7a3Sopenharmony_ci	// Get the appropriate block descriptor
532cc1dc7a3Sopenharmony_ci	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
533cc1dc7a3Sopenharmony_ci	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
534cc1dc7a3Sopenharmony_ci
535cc1dc7a3Sopenharmony_ci	// Unquantize and undecimate the weights
536cc1dc7a3Sopenharmony_ci	ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
537cc1dc7a3Sopenharmony_ci	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
538cc1dc7a3Sopenharmony_ci
539cc1dc7a3Sopenharmony_ci	// Decode the color endpoints for this partition
540cc1dc7a3Sopenharmony_ci	vint4 ep0;
541cc1dc7a3Sopenharmony_ci	vint4 ep1;
542cc1dc7a3Sopenharmony_ci	bool rgb_lns;
543cc1dc7a3Sopenharmony_ci	bool a_lns;
544cc1dc7a3Sopenharmony_ci
545cc1dc7a3Sopenharmony_ci	unpack_color_endpoints(config.profile,
546cc1dc7a3Sopenharmony_ci	                       scb.color_formats[0],
547cc1dc7a3Sopenharmony_ci	                       scb.color_values[0],
548cc1dc7a3Sopenharmony_ci	                       rgb_lns, a_lns,
549cc1dc7a3Sopenharmony_ci	                       ep0, ep1);
550cc1dc7a3Sopenharmony_ci
551cc1dc7a3Sopenharmony_ci	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
552cc1dc7a3Sopenharmony_ci
553cc1dc7a3Sopenharmony_ci	// Unpack and compute error for each texel in the partition
554cc1dc7a3Sopenharmony_ci	vfloatacc summav = vfloatacc::zero();
555cc1dc7a3Sopenharmony_ci
556cc1dc7a3Sopenharmony_ci	vint lane_id = vint::lane_id();
557cc1dc7a3Sopenharmony_ci
558cc1dc7a3Sopenharmony_ci	unsigned int texel_count = bsd.texel_count;
559cc1dc7a3Sopenharmony_ci	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
560cc1dc7a3Sopenharmony_ci	{
561cc1dc7a3Sopenharmony_ci		// Compute EP1 contribution
562cc1dc7a3Sopenharmony_ci		vint weight1 = vint::loada(plane1_weights + i);
563cc1dc7a3Sopenharmony_ci		vint ep1_r = vint(ep1.lane<0>()) * weight1;
564cc1dc7a3Sopenharmony_ci		vint ep1_g = vint(ep1.lane<1>()) * weight1;
565cc1dc7a3Sopenharmony_ci		vint ep1_b = vint(ep1.lane<2>()) * weight1;
566cc1dc7a3Sopenharmony_ci		vint ep1_a = vint(ep1.lane<3>()) * weight1;
567cc1dc7a3Sopenharmony_ci
568cc1dc7a3Sopenharmony_ci		// Compute EP0 contribution
569cc1dc7a3Sopenharmony_ci		vint weight0 = vint(64) - weight1;
570cc1dc7a3Sopenharmony_ci		vint ep0_r = vint(ep0.lane<0>()) * weight0;
571cc1dc7a3Sopenharmony_ci		vint ep0_g = vint(ep0.lane<1>()) * weight0;
572cc1dc7a3Sopenharmony_ci		vint ep0_b = vint(ep0.lane<2>()) * weight0;
573cc1dc7a3Sopenharmony_ci		vint ep0_a = vint(ep0.lane<3>()) * weight0;
574cc1dc7a3Sopenharmony_ci
575cc1dc7a3Sopenharmony_ci		// Combine contributions
576cc1dc7a3Sopenharmony_ci		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
577cc1dc7a3Sopenharmony_ci		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
578cc1dc7a3Sopenharmony_ci		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
579cc1dc7a3Sopenharmony_ci		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
580cc1dc7a3Sopenharmony_ci
581cc1dc7a3Sopenharmony_ci		// If using a U8 decode mode bit replicate top 8 bits
582cc1dc7a3Sopenharmony_ci		// so rest of codec can assume 0xFFFF max range everywhere
583cc1dc7a3Sopenharmony_ci		vint colori_r8 = asr<8>(colori_r) * vint(257);
584cc1dc7a3Sopenharmony_ci		colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
585cc1dc7a3Sopenharmony_ci
586cc1dc7a3Sopenharmony_ci		vint colori_g8 = asr<8>(colori_g) * vint(257);
587cc1dc7a3Sopenharmony_ci		colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
588cc1dc7a3Sopenharmony_ci
589cc1dc7a3Sopenharmony_ci		vint colori_b8 = asr<8>(colori_b) * vint(257);
590cc1dc7a3Sopenharmony_ci		colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
591cc1dc7a3Sopenharmony_ci
592cc1dc7a3Sopenharmony_ci		vint colori_a8 = asr<8>(colori_a) * vint(257);
593cc1dc7a3Sopenharmony_ci		colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
594cc1dc7a3Sopenharmony_ci
595cc1dc7a3Sopenharmony_ci		// Compute color diff
596cc1dc7a3Sopenharmony_ci		vfloat color_r = int_to_float(colori_r);
597cc1dc7a3Sopenharmony_ci		vfloat color_g = int_to_float(colori_g);
598cc1dc7a3Sopenharmony_ci		vfloat color_b = int_to_float(colori_b);
599cc1dc7a3Sopenharmony_ci		vfloat color_a = int_to_float(colori_a);
600cc1dc7a3Sopenharmony_ci
601cc1dc7a3Sopenharmony_ci		vfloat color_orig_r = loada(blk.data_r + i);
602cc1dc7a3Sopenharmony_ci		vfloat color_orig_g = loada(blk.data_g + i);
603cc1dc7a3Sopenharmony_ci		vfloat color_orig_b = loada(blk.data_b + i);
604cc1dc7a3Sopenharmony_ci		vfloat color_orig_a = loada(blk.data_a + i);
605cc1dc7a3Sopenharmony_ci
606cc1dc7a3Sopenharmony_ci		vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
607cc1dc7a3Sopenharmony_ci		vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
608cc1dc7a3Sopenharmony_ci		vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
609cc1dc7a3Sopenharmony_ci		vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
610cc1dc7a3Sopenharmony_ci
611cc1dc7a3Sopenharmony_ci		// Compute squared error metric
612cc1dc7a3Sopenharmony_ci		color_error_r = color_error_r * color_error_r;
613cc1dc7a3Sopenharmony_ci		color_error_g = color_error_g * color_error_g;
614cc1dc7a3Sopenharmony_ci		color_error_b = color_error_b * color_error_b;
615cc1dc7a3Sopenharmony_ci		color_error_a = color_error_a * color_error_a;
616cc1dc7a3Sopenharmony_ci
617cc1dc7a3Sopenharmony_ci		vfloat metric = color_error_r * blk.channel_weight.lane<0>()
618cc1dc7a3Sopenharmony_ci		              + color_error_g * blk.channel_weight.lane<1>()
619cc1dc7a3Sopenharmony_ci		              + color_error_b * blk.channel_weight.lane<2>()
620cc1dc7a3Sopenharmony_ci		              + color_error_a * blk.channel_weight.lane<3>();
621cc1dc7a3Sopenharmony_ci
622cc1dc7a3Sopenharmony_ci		// Mask off bad lanes
623cc1dc7a3Sopenharmony_ci		vmask mask = lane_id < vint(texel_count);
624cc1dc7a3Sopenharmony_ci		lane_id += vint(ASTCENC_SIMD_WIDTH);
625cc1dc7a3Sopenharmony_ci		haccumulate(summav, metric, mask);
626cc1dc7a3Sopenharmony_ci	}
627cc1dc7a3Sopenharmony_ci
628cc1dc7a3Sopenharmony_ci	return hadd_s(summav);
629cc1dc7a3Sopenharmony_ci}
630cc1dc7a3Sopenharmony_ci
631cc1dc7a3Sopenharmony_ci#endif
632