1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0 2cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 3cc1dc7a3Sopenharmony_ci// Copyright 2011-2024 Arm Limited 4cc1dc7a3Sopenharmony_ci// 5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy 7cc1dc7a3Sopenharmony_ci// of the License at: 8cc1dc7a3Sopenharmony_ci// 9cc1dc7a3Sopenharmony_ci// http://www.apache.org/licenses/LICENSE-2.0 10cc1dc7a3Sopenharmony_ci// 11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software 12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations 15cc1dc7a3Sopenharmony_ci// under the License. 16cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 17cc1dc7a3Sopenharmony_ci 18cc1dc7a3Sopenharmony_ci/** 19cc1dc7a3Sopenharmony_ci * @brief Functions to decompress a symbolic block. 20cc1dc7a3Sopenharmony_ci */ 21cc1dc7a3Sopenharmony_ci 22cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h" 23cc1dc7a3Sopenharmony_ci 24cc1dc7a3Sopenharmony_ci#include <stdio.h> 25cc1dc7a3Sopenharmony_ci#include <assert.h> 26cc1dc7a3Sopenharmony_ci 27cc1dc7a3Sopenharmony_ci/** 28cc1dc7a3Sopenharmony_ci * @brief Compute the integer linear interpolation of two color endpoints. 29cc1dc7a3Sopenharmony_ci * 30cc1dc7a3Sopenharmony_ci * @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16. 31cc1dc7a3Sopenharmony_ci * @param color0 The endpoint0 color. 32cc1dc7a3Sopenharmony_ci * @param color1 The endpoint1 color. 33cc1dc7a3Sopenharmony_ci * @param weights The interpolation weight (between 0 and 64). 34cc1dc7a3Sopenharmony_ci * 35cc1dc7a3Sopenharmony_ci * @return The interpolated color. 36cc1dc7a3Sopenharmony_ci */ 37cc1dc7a3Sopenharmony_cistatic vint4 lerp_color_int( 38cc1dc7a3Sopenharmony_ci vmask4 u8_mask, 39cc1dc7a3Sopenharmony_ci vint4 color0, 40cc1dc7a3Sopenharmony_ci vint4 color1, 41cc1dc7a3Sopenharmony_ci vint4 weights 42cc1dc7a3Sopenharmony_ci) { 43cc1dc7a3Sopenharmony_ci vint4 weight1 = weights; 44cc1dc7a3Sopenharmony_ci vint4 weight0 = vint4(64) - weight1; 45cc1dc7a3Sopenharmony_ci 46cc1dc7a3Sopenharmony_ci vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32); 47cc1dc7a3Sopenharmony_ci color = asr<6>(color); 48cc1dc7a3Sopenharmony_ci 49cc1dc7a3Sopenharmony_ci // For decode_unorm8 values force the codec to bit replicate. This allows the 50cc1dc7a3Sopenharmony_ci // rest of the codec to assume the full 0xFFFF range for everything and ignore 51cc1dc7a3Sopenharmony_ci // the decode_mode setting 52cc1dc7a3Sopenharmony_ci vint4 color_u8 = asr<8>(color) * vint4(257); 53cc1dc7a3Sopenharmony_ci color = select(color, color_u8, u8_mask); 54cc1dc7a3Sopenharmony_ci 55cc1dc7a3Sopenharmony_ci return color; 56cc1dc7a3Sopenharmony_ci} 57cc1dc7a3Sopenharmony_ci 58cc1dc7a3Sopenharmony_ci/** 59cc1dc7a3Sopenharmony_ci * @brief Convert integer color value into a float value for the decoder. 60cc1dc7a3Sopenharmony_ci * 61cc1dc7a3Sopenharmony_ci * @param data The integer color value post-interpolation. 62cc1dc7a3Sopenharmony_ci * @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16). 63cc1dc7a3Sopenharmony_ci * 64cc1dc7a3Sopenharmony_ci * @return The float color value. 65cc1dc7a3Sopenharmony_ci */ 66cc1dc7a3Sopenharmony_cistatic inline vfloat4 decode_texel( 67cc1dc7a3Sopenharmony_ci vint4 data, 68cc1dc7a3Sopenharmony_ci vmask4 lns_mask 69cc1dc7a3Sopenharmony_ci) { 70cc1dc7a3Sopenharmony_ci vint4 color_lns = vint4::zero(); 71cc1dc7a3Sopenharmony_ci vint4 color_unorm = vint4::zero(); 72cc1dc7a3Sopenharmony_ci 73cc1dc7a3Sopenharmony_ci if (any(lns_mask)) 74cc1dc7a3Sopenharmony_ci { 75cc1dc7a3Sopenharmony_ci color_lns = lns_to_sf16(data); 76cc1dc7a3Sopenharmony_ci } 77cc1dc7a3Sopenharmony_ci 78cc1dc7a3Sopenharmony_ci if (!all(lns_mask)) 79cc1dc7a3Sopenharmony_ci { 80cc1dc7a3Sopenharmony_ci color_unorm = unorm16_to_sf16(data); 81cc1dc7a3Sopenharmony_ci } 82cc1dc7a3Sopenharmony_ci 83cc1dc7a3Sopenharmony_ci // Pick components and then convert to FP16 84cc1dc7a3Sopenharmony_ci vint4 datai = select(color_unorm, color_lns, lns_mask); 85cc1dc7a3Sopenharmony_ci return float16_to_float(datai); 86cc1dc7a3Sopenharmony_ci} 87cc1dc7a3Sopenharmony_ci 88cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 89cc1dc7a3Sopenharmony_civoid unpack_weights( 90cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 91cc1dc7a3Sopenharmony_ci const symbolic_compressed_block& scb, 92cc1dc7a3Sopenharmony_ci const decimation_info& di, 93cc1dc7a3Sopenharmony_ci bool is_dual_plane, 94cc1dc7a3Sopenharmony_ci int weights_plane1[BLOCK_MAX_TEXELS], 95cc1dc7a3Sopenharmony_ci int weights_plane2[BLOCK_MAX_TEXELS] 96cc1dc7a3Sopenharmony_ci) { 97cc1dc7a3Sopenharmony_ci // Safe to overshoot as all arrays are allocated to full size 98cc1dc7a3Sopenharmony_ci if (!is_dual_plane) 99cc1dc7a3Sopenharmony_ci { 100cc1dc7a3Sopenharmony_ci // Build full 64-entry weight lookup table 101cc1dc7a3Sopenharmony_ci vint4 tab0 = vint4::load(scb.weights + 0); 102cc1dc7a3Sopenharmony_ci vint4 tab1 = vint4::load(scb.weights + 16); 103cc1dc7a3Sopenharmony_ci vint4 tab2 = vint4::load(scb.weights + 32); 104cc1dc7a3Sopenharmony_ci vint4 tab3 = vint4::load(scb.weights + 48); 105cc1dc7a3Sopenharmony_ci 106cc1dc7a3Sopenharmony_ci vint tab0p, tab1p, tab2p, tab3p; 107cc1dc7a3Sopenharmony_ci vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); 108cc1dc7a3Sopenharmony_ci 109cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) 110cc1dc7a3Sopenharmony_ci { 111cc1dc7a3Sopenharmony_ci vint summed_value(8); 112cc1dc7a3Sopenharmony_ci vint weight_count(di.texel_weight_count + i); 113cc1dc7a3Sopenharmony_ci int max_weight_count = hmax(weight_count).lane<0>(); 114cc1dc7a3Sopenharmony_ci 115cc1dc7a3Sopenharmony_ci promise(max_weight_count > 0); 116cc1dc7a3Sopenharmony_ci for (int j = 0; j < max_weight_count; j++) 117cc1dc7a3Sopenharmony_ci { 118cc1dc7a3Sopenharmony_ci vint texel_weights(di.texel_weights_tr[j] + i); 119cc1dc7a3Sopenharmony_ci vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); 120cc1dc7a3Sopenharmony_ci 121cc1dc7a3Sopenharmony_ci summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int; 122cc1dc7a3Sopenharmony_ci } 123cc1dc7a3Sopenharmony_ci 124cc1dc7a3Sopenharmony_ci store(lsr<4>(summed_value), weights_plane1 + i); 125cc1dc7a3Sopenharmony_ci } 126cc1dc7a3Sopenharmony_ci } 127cc1dc7a3Sopenharmony_ci else 128cc1dc7a3Sopenharmony_ci { 129cc1dc7a3Sopenharmony_ci // Build a 32-entry weight lookup table per plane 130cc1dc7a3Sopenharmony_ci // Plane 1 131cc1dc7a3Sopenharmony_ci vint4 tab0_plane1 = vint4::load(scb.weights + 0); 132cc1dc7a3Sopenharmony_ci vint4 tab1_plane1 = vint4::load(scb.weights + 16); 133cc1dc7a3Sopenharmony_ci vint tab0_plane1p, tab1_plane1p; 134cc1dc7a3Sopenharmony_ci vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); 135cc1dc7a3Sopenharmony_ci 136cc1dc7a3Sopenharmony_ci // Plane 2 137cc1dc7a3Sopenharmony_ci vint4 tab0_plane2 = vint4::load(scb.weights + 32); 138cc1dc7a3Sopenharmony_ci vint4 tab1_plane2 = vint4::load(scb.weights + 48); 139cc1dc7a3Sopenharmony_ci vint tab0_plane2p, tab1_plane2p; 140cc1dc7a3Sopenharmony_ci vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); 141cc1dc7a3Sopenharmony_ci 142cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) 143cc1dc7a3Sopenharmony_ci { 144cc1dc7a3Sopenharmony_ci vint sum_plane1(8); 145cc1dc7a3Sopenharmony_ci vint sum_plane2(8); 146cc1dc7a3Sopenharmony_ci 147cc1dc7a3Sopenharmony_ci vint weight_count(di.texel_weight_count + i); 148cc1dc7a3Sopenharmony_ci int max_weight_count = hmax(weight_count).lane<0>(); 149cc1dc7a3Sopenharmony_ci 150cc1dc7a3Sopenharmony_ci promise(max_weight_count > 0); 151cc1dc7a3Sopenharmony_ci for (int j = 0; j < max_weight_count; j++) 152cc1dc7a3Sopenharmony_ci { 153cc1dc7a3Sopenharmony_ci vint texel_weights(di.texel_weights_tr[j] + i); 154cc1dc7a3Sopenharmony_ci vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); 155cc1dc7a3Sopenharmony_ci 156cc1dc7a3Sopenharmony_ci sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int; 157cc1dc7a3Sopenharmony_ci sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int; 158cc1dc7a3Sopenharmony_ci } 159cc1dc7a3Sopenharmony_ci 160cc1dc7a3Sopenharmony_ci store(lsr<4>(sum_plane1), weights_plane1 + i); 161cc1dc7a3Sopenharmony_ci store(lsr<4>(sum_plane2), weights_plane2 + i); 162cc1dc7a3Sopenharmony_ci } 163cc1dc7a3Sopenharmony_ci } 164cc1dc7a3Sopenharmony_ci} 165cc1dc7a3Sopenharmony_ci 166cc1dc7a3Sopenharmony_ci/** 167cc1dc7a3Sopenharmony_ci * @brief Return an FP32 NaN value for use in error colors. 168cc1dc7a3Sopenharmony_ci * 169cc1dc7a3Sopenharmony_ci * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN. 170cc1dc7a3Sopenharmony_ci * 171cc1dc7a3Sopenharmony_ci * @return The float color value. 172cc1dc7a3Sopenharmony_ci */ 173cc1dc7a3Sopenharmony_cistatic float error_color_nan() 174cc1dc7a3Sopenharmony_ci{ 175cc1dc7a3Sopenharmony_ci if32 v; 176cc1dc7a3Sopenharmony_ci v.u = 0xFFFFE000U; 177cc1dc7a3Sopenharmony_ci return v.f; 178cc1dc7a3Sopenharmony_ci} 179cc1dc7a3Sopenharmony_ci 180cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 181cc1dc7a3Sopenharmony_civoid decompress_symbolic_block( 182cc1dc7a3Sopenharmony_ci astcenc_profile decode_mode, 183cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 184cc1dc7a3Sopenharmony_ci int xpos, 185cc1dc7a3Sopenharmony_ci int ypos, 186cc1dc7a3Sopenharmony_ci int zpos, 187cc1dc7a3Sopenharmony_ci const symbolic_compressed_block& scb, 188cc1dc7a3Sopenharmony_ci image_block& blk 189cc1dc7a3Sopenharmony_ci) { 190cc1dc7a3Sopenharmony_ci blk.xpos = xpos; 191cc1dc7a3Sopenharmony_ci blk.ypos = ypos; 192cc1dc7a3Sopenharmony_ci blk.zpos = zpos; 193cc1dc7a3Sopenharmony_ci 194cc1dc7a3Sopenharmony_ci blk.data_min = vfloat4::zero(); 195cc1dc7a3Sopenharmony_ci blk.data_mean = vfloat4::zero(); 196cc1dc7a3Sopenharmony_ci blk.data_max = vfloat4::zero(); 197cc1dc7a3Sopenharmony_ci blk.grayscale = false; 198cc1dc7a3Sopenharmony_ci 199cc1dc7a3Sopenharmony_ci // If we detected an error-block, blow up immediately. 200cc1dc7a3Sopenharmony_ci if (scb.block_type == SYM_BTYPE_ERROR) 201cc1dc7a3Sopenharmony_ci { 202cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < bsd.texel_count; i++) 203cc1dc7a3Sopenharmony_ci { 204cc1dc7a3Sopenharmony_ci blk.data_r[i] = error_color_nan(); 205cc1dc7a3Sopenharmony_ci blk.data_g[i] = error_color_nan(); 206cc1dc7a3Sopenharmony_ci blk.data_b[i] = error_color_nan(); 207cc1dc7a3Sopenharmony_ci blk.data_a[i] = error_color_nan(); 208cc1dc7a3Sopenharmony_ci blk.rgb_lns[i] = 0; 209cc1dc7a3Sopenharmony_ci blk.alpha_lns[i] = 0; 210cc1dc7a3Sopenharmony_ci } 211cc1dc7a3Sopenharmony_ci 212cc1dc7a3Sopenharmony_ci return; 213cc1dc7a3Sopenharmony_ci } 214cc1dc7a3Sopenharmony_ci 215cc1dc7a3Sopenharmony_ci if ((scb.block_type == SYM_BTYPE_CONST_F16) || 216cc1dc7a3Sopenharmony_ci (scb.block_type == SYM_BTYPE_CONST_U16)) 217cc1dc7a3Sopenharmony_ci { 218cc1dc7a3Sopenharmony_ci vfloat4 color; 219cc1dc7a3Sopenharmony_ci uint8_t use_lns = 0; 220cc1dc7a3Sopenharmony_ci 221cc1dc7a3Sopenharmony_ci // UNORM16 constant color block 222cc1dc7a3Sopenharmony_ci if (scb.block_type == SYM_BTYPE_CONST_U16) 223cc1dc7a3Sopenharmony_ci { 224cc1dc7a3Sopenharmony_ci vint4 colori(scb.constant_color); 225cc1dc7a3Sopenharmony_ci 226cc1dc7a3Sopenharmony_ci // Determine the UNORM8 rounding on the decode 227cc1dc7a3Sopenharmony_ci vmask4 u8_mask = get_u8_component_mask(decode_mode, blk); 228cc1dc7a3Sopenharmony_ci 229cc1dc7a3Sopenharmony_ci // The real decoder would just use the top 8 bits, but we rescale 230cc1dc7a3Sopenharmony_ci // in to a 16-bit value that rounds correctly. 231cc1dc7a3Sopenharmony_ci vint4 colori_u8 = asr<8>(colori) * 257; 232cc1dc7a3Sopenharmony_ci colori = select(colori, colori_u8, u8_mask); 233cc1dc7a3Sopenharmony_ci 234cc1dc7a3Sopenharmony_ci vint4 colorf16 = unorm16_to_sf16(colori); 235cc1dc7a3Sopenharmony_ci color = float16_to_float(colorf16); 236cc1dc7a3Sopenharmony_ci } 237cc1dc7a3Sopenharmony_ci // FLOAT16 constant color block 238cc1dc7a3Sopenharmony_ci else 239cc1dc7a3Sopenharmony_ci { 240cc1dc7a3Sopenharmony_ci switch (decode_mode) 241cc1dc7a3Sopenharmony_ci { 242cc1dc7a3Sopenharmony_ci case ASTCENC_PRF_LDR_SRGB: 243cc1dc7a3Sopenharmony_ci case ASTCENC_PRF_LDR: 244cc1dc7a3Sopenharmony_ci color = vfloat4(error_color_nan()); 245cc1dc7a3Sopenharmony_ci break; 246cc1dc7a3Sopenharmony_ci case ASTCENC_PRF_HDR_RGB_LDR_A: 247cc1dc7a3Sopenharmony_ci case ASTCENC_PRF_HDR: 248cc1dc7a3Sopenharmony_ci // Constant-color block; unpack from FP16 to FP32. 249cc1dc7a3Sopenharmony_ci color = float16_to_float(vint4(scb.constant_color)); 250cc1dc7a3Sopenharmony_ci use_lns = 1; 251cc1dc7a3Sopenharmony_ci break; 252cc1dc7a3Sopenharmony_ci } 253cc1dc7a3Sopenharmony_ci } 254cc1dc7a3Sopenharmony_ci 255cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < bsd.texel_count; i++) 256cc1dc7a3Sopenharmony_ci { 257cc1dc7a3Sopenharmony_ci blk.data_r[i] = color.lane<0>(); 258cc1dc7a3Sopenharmony_ci blk.data_g[i] = color.lane<1>(); 259cc1dc7a3Sopenharmony_ci blk.data_b[i] = color.lane<2>(); 260cc1dc7a3Sopenharmony_ci blk.data_a[i] = color.lane<3>(); 261cc1dc7a3Sopenharmony_ci blk.rgb_lns[i] = use_lns; 262cc1dc7a3Sopenharmony_ci blk.alpha_lns[i] = use_lns; 263cc1dc7a3Sopenharmony_ci } 264cc1dc7a3Sopenharmony_ci 265cc1dc7a3Sopenharmony_ci return; 266cc1dc7a3Sopenharmony_ci } 267cc1dc7a3Sopenharmony_ci 268cc1dc7a3Sopenharmony_ci // Get the appropriate partition-table entry 269cc1dc7a3Sopenharmony_ci int partition_count = scb.partition_count; 270cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); 271cc1dc7a3Sopenharmony_ci 272cc1dc7a3Sopenharmony_ci // Get the appropriate block descriptors 273cc1dc7a3Sopenharmony_ci const auto& bm = bsd.get_block_mode(scb.block_mode); 274cc1dc7a3Sopenharmony_ci const auto& di = bsd.get_decimation_info(bm.decimation_mode); 275cc1dc7a3Sopenharmony_ci 276cc1dc7a3Sopenharmony_ci bool is_dual_plane = static_cast<bool>(bm.is_dual_plane); 277cc1dc7a3Sopenharmony_ci 278cc1dc7a3Sopenharmony_ci // Unquantize and undecimate the weights 279cc1dc7a3Sopenharmony_ci int plane1_weights[BLOCK_MAX_TEXELS]; 280cc1dc7a3Sopenharmony_ci int plane2_weights[BLOCK_MAX_TEXELS]; 281cc1dc7a3Sopenharmony_ci unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights); 282cc1dc7a3Sopenharmony_ci 283cc1dc7a3Sopenharmony_ci // Now that we have endpoint colors and weights, we can unpack texel colors 284cc1dc7a3Sopenharmony_ci int plane2_component = scb.plane2_component; 285cc1dc7a3Sopenharmony_ci vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component); 286cc1dc7a3Sopenharmony_ci 287cc1dc7a3Sopenharmony_ci vmask4 u8_mask = get_u8_component_mask(decode_mode, blk); 288cc1dc7a3Sopenharmony_ci 289cc1dc7a3Sopenharmony_ci for (int i = 0; i < partition_count; i++) 290cc1dc7a3Sopenharmony_ci { 291cc1dc7a3Sopenharmony_ci // Decode the color endpoints for this partition 292cc1dc7a3Sopenharmony_ci vint4 ep0; 293cc1dc7a3Sopenharmony_ci vint4 ep1; 294cc1dc7a3Sopenharmony_ci bool rgb_lns; 295cc1dc7a3Sopenharmony_ci bool a_lns; 296cc1dc7a3Sopenharmony_ci 297cc1dc7a3Sopenharmony_ci unpack_color_endpoints(decode_mode, 298cc1dc7a3Sopenharmony_ci scb.color_formats[i], 299cc1dc7a3Sopenharmony_ci scb.color_values[i], 300cc1dc7a3Sopenharmony_ci rgb_lns, a_lns, 301cc1dc7a3Sopenharmony_ci ep0, ep1); 302cc1dc7a3Sopenharmony_ci 303cc1dc7a3Sopenharmony_ci vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns); 304cc1dc7a3Sopenharmony_ci 305cc1dc7a3Sopenharmony_ci int texel_count = pi.partition_texel_count[i]; 306cc1dc7a3Sopenharmony_ci for (int j = 0; j < texel_count; j++) 307cc1dc7a3Sopenharmony_ci { 308cc1dc7a3Sopenharmony_ci int tix = pi.texels_of_partition[i][j]; 309cc1dc7a3Sopenharmony_ci vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask); 310cc1dc7a3Sopenharmony_ci vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight); 311cc1dc7a3Sopenharmony_ci vfloat4 colorf = decode_texel(color, lns_mask); 312cc1dc7a3Sopenharmony_ci 313cc1dc7a3Sopenharmony_ci blk.data_r[tix] = colorf.lane<0>(); 314cc1dc7a3Sopenharmony_ci blk.data_g[tix] = colorf.lane<1>(); 315cc1dc7a3Sopenharmony_ci blk.data_b[tix] = colorf.lane<2>(); 316cc1dc7a3Sopenharmony_ci blk.data_a[tix] = colorf.lane<3>(); 317cc1dc7a3Sopenharmony_ci } 318cc1dc7a3Sopenharmony_ci } 319cc1dc7a3Sopenharmony_ci} 320cc1dc7a3Sopenharmony_ci 321cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY) 322cc1dc7a3Sopenharmony_ci 323cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 324cc1dc7a3Sopenharmony_cifloat compute_symbolic_block_difference_2plane( 325cc1dc7a3Sopenharmony_ci const astcenc_config& config, 326cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 327cc1dc7a3Sopenharmony_ci const symbolic_compressed_block& scb, 328cc1dc7a3Sopenharmony_ci const image_block& blk 329cc1dc7a3Sopenharmony_ci) { 330cc1dc7a3Sopenharmony_ci // If we detected an error-block, blow up immediately. 331cc1dc7a3Sopenharmony_ci if (scb.block_type == SYM_BTYPE_ERROR) 332cc1dc7a3Sopenharmony_ci { 333cc1dc7a3Sopenharmony_ci return ERROR_CALC_DEFAULT; 334cc1dc7a3Sopenharmony_ci } 335cc1dc7a3Sopenharmony_ci 336cc1dc7a3Sopenharmony_ci assert(scb.block_mode >= 0); 337cc1dc7a3Sopenharmony_ci assert(scb.partition_count == 1); 338cc1dc7a3Sopenharmony_ci assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1); 339cc1dc7a3Sopenharmony_ci 340cc1dc7a3Sopenharmony_ci // Get the appropriate block descriptor 341cc1dc7a3Sopenharmony_ci const block_mode& bm = bsd.get_block_mode(scb.block_mode); 342cc1dc7a3Sopenharmony_ci const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); 343cc1dc7a3Sopenharmony_ci 344cc1dc7a3Sopenharmony_ci // Unquantize and undecimate the weights 345cc1dc7a3Sopenharmony_ci int plane1_weights[BLOCK_MAX_TEXELS]; 346cc1dc7a3Sopenharmony_ci int plane2_weights[BLOCK_MAX_TEXELS]; 347cc1dc7a3Sopenharmony_ci unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights); 348cc1dc7a3Sopenharmony_ci 349cc1dc7a3Sopenharmony_ci vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component); 350cc1dc7a3Sopenharmony_ci 351cc1dc7a3Sopenharmony_ci vfloat4 summa = vfloat4::zero(); 352cc1dc7a3Sopenharmony_ci 353cc1dc7a3Sopenharmony_ci // Decode the color endpoints for this partition 354cc1dc7a3Sopenharmony_ci vint4 ep0; 355cc1dc7a3Sopenharmony_ci vint4 ep1; 356cc1dc7a3Sopenharmony_ci bool rgb_lns; 357cc1dc7a3Sopenharmony_ci bool a_lns; 358cc1dc7a3Sopenharmony_ci 359cc1dc7a3Sopenharmony_ci unpack_color_endpoints(config.profile, 360cc1dc7a3Sopenharmony_ci scb.color_formats[0], 361cc1dc7a3Sopenharmony_ci scb.color_values[0], 362cc1dc7a3Sopenharmony_ci rgb_lns, a_lns, 363cc1dc7a3Sopenharmony_ci ep0, ep1); 364cc1dc7a3Sopenharmony_ci 365cc1dc7a3Sopenharmony_ci vmask4 u8_mask = get_u8_component_mask(config.profile, blk); 366cc1dc7a3Sopenharmony_ci 367cc1dc7a3Sopenharmony_ci // Unpack and compute error for each texel in the partition 368cc1dc7a3Sopenharmony_ci unsigned int texel_count = bsd.texel_count; 369cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i++) 370cc1dc7a3Sopenharmony_ci { 371cc1dc7a3Sopenharmony_ci vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask); 372cc1dc7a3Sopenharmony_ci vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight); 373cc1dc7a3Sopenharmony_ci 374cc1dc7a3Sopenharmony_ci vfloat4 color = int_to_float(colori); 375cc1dc7a3Sopenharmony_ci vfloat4 oldColor = blk.texel(i); 376cc1dc7a3Sopenharmony_ci 377cc1dc7a3Sopenharmony_ci // Compare error using a perceptual decode metric for RGBM textures 378cc1dc7a3Sopenharmony_ci if (config.flags & ASTCENC_FLG_MAP_RGBM) 379cc1dc7a3Sopenharmony_ci { 380cc1dc7a3Sopenharmony_ci // Fail encodings that result in zero weight M pixels. Note that this can cause 381cc1dc7a3Sopenharmony_ci // "interesting" artifacts if we reject all useful encodings - we typically get max 382cc1dc7a3Sopenharmony_ci // brightness encodings instead which look just as bad. We recommend users apply a 383cc1dc7a3Sopenharmony_ci // bias to their stored M value, limiting the lower value to 16 or 32 to avoid 384cc1dc7a3Sopenharmony_ci // getting small M values post-quantization, but we can't prove it would never 385cc1dc7a3Sopenharmony_ci // happen, especially at low bit rates ... 386cc1dc7a3Sopenharmony_ci if (color.lane<3>() == 0.0f) 387cc1dc7a3Sopenharmony_ci { 388cc1dc7a3Sopenharmony_ci return -ERROR_CALC_DEFAULT; 389cc1dc7a3Sopenharmony_ci } 390cc1dc7a3Sopenharmony_ci 391cc1dc7a3Sopenharmony_ci // Compute error based on decoded RGBM color 392cc1dc7a3Sopenharmony_ci color = vfloat4( 393cc1dc7a3Sopenharmony_ci color.lane<0>() * color.lane<3>() * config.rgbm_m_scale, 394cc1dc7a3Sopenharmony_ci color.lane<1>() * color.lane<3>() * config.rgbm_m_scale, 395cc1dc7a3Sopenharmony_ci color.lane<2>() * color.lane<3>() * config.rgbm_m_scale, 396cc1dc7a3Sopenharmony_ci 1.0f 397cc1dc7a3Sopenharmony_ci ); 398cc1dc7a3Sopenharmony_ci 399cc1dc7a3Sopenharmony_ci oldColor = vfloat4( 400cc1dc7a3Sopenharmony_ci oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale, 401cc1dc7a3Sopenharmony_ci oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale, 402cc1dc7a3Sopenharmony_ci oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale, 403cc1dc7a3Sopenharmony_ci 1.0f 404cc1dc7a3Sopenharmony_ci ); 405cc1dc7a3Sopenharmony_ci } 406cc1dc7a3Sopenharmony_ci 407cc1dc7a3Sopenharmony_ci vfloat4 error = oldColor - color; 408cc1dc7a3Sopenharmony_ci error = min(abs(error), 1e15f); 409cc1dc7a3Sopenharmony_ci error = error * error; 410cc1dc7a3Sopenharmony_ci 411cc1dc7a3Sopenharmony_ci summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT); 412cc1dc7a3Sopenharmony_ci } 413cc1dc7a3Sopenharmony_ci 414cc1dc7a3Sopenharmony_ci return summa.lane<0>(); 415cc1dc7a3Sopenharmony_ci} 416cc1dc7a3Sopenharmony_ci 417cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 418cc1dc7a3Sopenharmony_cifloat compute_symbolic_block_difference_1plane( 419cc1dc7a3Sopenharmony_ci const astcenc_config& config, 420cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 421cc1dc7a3Sopenharmony_ci const symbolic_compressed_block& scb, 422cc1dc7a3Sopenharmony_ci const image_block& blk 423cc1dc7a3Sopenharmony_ci) { 424cc1dc7a3Sopenharmony_ci assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0); 425cc1dc7a3Sopenharmony_ci 426cc1dc7a3Sopenharmony_ci // If we detected an error-block, blow up immediately. 427cc1dc7a3Sopenharmony_ci if (scb.block_type == SYM_BTYPE_ERROR) 428cc1dc7a3Sopenharmony_ci { 429cc1dc7a3Sopenharmony_ci return ERROR_CALC_DEFAULT; 430cc1dc7a3Sopenharmony_ci } 431cc1dc7a3Sopenharmony_ci 432cc1dc7a3Sopenharmony_ci assert(scb.block_mode >= 0); 433cc1dc7a3Sopenharmony_ci 434cc1dc7a3Sopenharmony_ci // Get the appropriate partition-table entry 435cc1dc7a3Sopenharmony_ci unsigned int partition_count = scb.partition_count; 436cc1dc7a3Sopenharmony_ci const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); 437cc1dc7a3Sopenharmony_ci 438cc1dc7a3Sopenharmony_ci // Get the appropriate block descriptor 439cc1dc7a3Sopenharmony_ci const block_mode& bm = bsd.get_block_mode(scb.block_mode); 440cc1dc7a3Sopenharmony_ci const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); 441cc1dc7a3Sopenharmony_ci 442cc1dc7a3Sopenharmony_ci // Unquantize and undecimate the weights 443cc1dc7a3Sopenharmony_ci int plane1_weights[BLOCK_MAX_TEXELS]; 444cc1dc7a3Sopenharmony_ci unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); 445cc1dc7a3Sopenharmony_ci 446cc1dc7a3Sopenharmony_ci vmask4 u8_mask = get_u8_component_mask(config.profile, blk); 447cc1dc7a3Sopenharmony_ci 448cc1dc7a3Sopenharmony_ci vfloat4 summa = vfloat4::zero(); 449cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < partition_count; i++) 450cc1dc7a3Sopenharmony_ci { 451cc1dc7a3Sopenharmony_ci // Decode the color endpoints for this partition 452cc1dc7a3Sopenharmony_ci vint4 ep0; 453cc1dc7a3Sopenharmony_ci vint4 ep1; 454cc1dc7a3Sopenharmony_ci bool rgb_lns; 455cc1dc7a3Sopenharmony_ci bool a_lns; 456cc1dc7a3Sopenharmony_ci 457cc1dc7a3Sopenharmony_ci unpack_color_endpoints(config.profile, 458cc1dc7a3Sopenharmony_ci scb.color_formats[i], 459cc1dc7a3Sopenharmony_ci scb.color_values[i], 460cc1dc7a3Sopenharmony_ci rgb_lns, a_lns, 461cc1dc7a3Sopenharmony_ci ep0, ep1); 462cc1dc7a3Sopenharmony_ci 463cc1dc7a3Sopenharmony_ci // Unpack and compute error for each texel in the partition 464cc1dc7a3Sopenharmony_ci unsigned int texel_count = pi.partition_texel_count[i]; 465cc1dc7a3Sopenharmony_ci for (unsigned int j = 0; j < texel_count; j++) 466cc1dc7a3Sopenharmony_ci { 467cc1dc7a3Sopenharmony_ci unsigned int tix = pi.texels_of_partition[i][j]; 468cc1dc7a3Sopenharmony_ci vint4 colori = lerp_color_int(u8_mask, ep0, ep1, 469cc1dc7a3Sopenharmony_ci vint4(plane1_weights[tix])); 470cc1dc7a3Sopenharmony_ci 471cc1dc7a3Sopenharmony_ci vfloat4 color = int_to_float(colori); 472cc1dc7a3Sopenharmony_ci vfloat4 oldColor = blk.texel(tix); 473cc1dc7a3Sopenharmony_ci 474cc1dc7a3Sopenharmony_ci // Compare error using a perceptual decode metric for RGBM textures 475cc1dc7a3Sopenharmony_ci if (config.flags & ASTCENC_FLG_MAP_RGBM) 476cc1dc7a3Sopenharmony_ci { 477cc1dc7a3Sopenharmony_ci // Fail encodings that result in zero weight M pixels. Note that this can cause 478cc1dc7a3Sopenharmony_ci // "interesting" artifacts if we reject all useful encodings - we typically get max 479cc1dc7a3Sopenharmony_ci // brightness encodings instead which look just as bad. We recommend users apply a 480cc1dc7a3Sopenharmony_ci // bias to their stored M value, limiting the lower value to 16 or 32 to avoid 481cc1dc7a3Sopenharmony_ci // getting small M values post-quantization, but we can't prove it would never 482cc1dc7a3Sopenharmony_ci // happen, especially at low bit rates ... 483cc1dc7a3Sopenharmony_ci if (color.lane<3>() == 0.0f) 484cc1dc7a3Sopenharmony_ci { 485cc1dc7a3Sopenharmony_ci return -ERROR_CALC_DEFAULT; 486cc1dc7a3Sopenharmony_ci } 487cc1dc7a3Sopenharmony_ci 488cc1dc7a3Sopenharmony_ci // Compute error based on decoded RGBM color 489cc1dc7a3Sopenharmony_ci color = vfloat4( 490cc1dc7a3Sopenharmony_ci color.lane<0>() * color.lane<3>() * config.rgbm_m_scale, 491cc1dc7a3Sopenharmony_ci color.lane<1>() * color.lane<3>() * config.rgbm_m_scale, 492cc1dc7a3Sopenharmony_ci color.lane<2>() * color.lane<3>() * config.rgbm_m_scale, 493cc1dc7a3Sopenharmony_ci 1.0f 494cc1dc7a3Sopenharmony_ci ); 495cc1dc7a3Sopenharmony_ci 496cc1dc7a3Sopenharmony_ci oldColor = vfloat4( 497cc1dc7a3Sopenharmony_ci oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale, 498cc1dc7a3Sopenharmony_ci oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale, 499cc1dc7a3Sopenharmony_ci oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale, 500cc1dc7a3Sopenharmony_ci 1.0f 501cc1dc7a3Sopenharmony_ci ); 502cc1dc7a3Sopenharmony_ci } 503cc1dc7a3Sopenharmony_ci 504cc1dc7a3Sopenharmony_ci vfloat4 error = oldColor - color; 505cc1dc7a3Sopenharmony_ci error = min(abs(error), 1e15f); 506cc1dc7a3Sopenharmony_ci error = error * error; 507cc1dc7a3Sopenharmony_ci 508cc1dc7a3Sopenharmony_ci summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT); 509cc1dc7a3Sopenharmony_ci } 510cc1dc7a3Sopenharmony_ci } 511cc1dc7a3Sopenharmony_ci 512cc1dc7a3Sopenharmony_ci return summa.lane<0>(); 513cc1dc7a3Sopenharmony_ci} 514cc1dc7a3Sopenharmony_ci 515cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 516cc1dc7a3Sopenharmony_cifloat compute_symbolic_block_difference_1plane_1partition( 517cc1dc7a3Sopenharmony_ci const astcenc_config& config, 518cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 519cc1dc7a3Sopenharmony_ci const symbolic_compressed_block& scb, 520cc1dc7a3Sopenharmony_ci const image_block& blk 521cc1dc7a3Sopenharmony_ci) { 522cc1dc7a3Sopenharmony_ci // If we detected an error-block, blow up immediately. 523cc1dc7a3Sopenharmony_ci if (scb.block_type == SYM_BTYPE_ERROR) 524cc1dc7a3Sopenharmony_ci { 525cc1dc7a3Sopenharmony_ci return ERROR_CALC_DEFAULT; 526cc1dc7a3Sopenharmony_ci } 527cc1dc7a3Sopenharmony_ci 528cc1dc7a3Sopenharmony_ci assert(scb.block_mode >= 0); 529cc1dc7a3Sopenharmony_ci assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1); 530cc1dc7a3Sopenharmony_ci 531cc1dc7a3Sopenharmony_ci // Get the appropriate block descriptor 532cc1dc7a3Sopenharmony_ci const block_mode& bm = bsd.get_block_mode(scb.block_mode); 533cc1dc7a3Sopenharmony_ci const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); 534cc1dc7a3Sopenharmony_ci 535cc1dc7a3Sopenharmony_ci // Unquantize and undecimate the weights 536cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS]; 537cc1dc7a3Sopenharmony_ci unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); 538cc1dc7a3Sopenharmony_ci 539cc1dc7a3Sopenharmony_ci // Decode the color endpoints for this partition 540cc1dc7a3Sopenharmony_ci vint4 ep0; 541cc1dc7a3Sopenharmony_ci vint4 ep1; 542cc1dc7a3Sopenharmony_ci bool rgb_lns; 543cc1dc7a3Sopenharmony_ci bool a_lns; 544cc1dc7a3Sopenharmony_ci 545cc1dc7a3Sopenharmony_ci unpack_color_endpoints(config.profile, 546cc1dc7a3Sopenharmony_ci scb.color_formats[0], 547cc1dc7a3Sopenharmony_ci scb.color_values[0], 548cc1dc7a3Sopenharmony_ci rgb_lns, a_lns, 549cc1dc7a3Sopenharmony_ci ep0, ep1); 550cc1dc7a3Sopenharmony_ci 551cc1dc7a3Sopenharmony_ci vmask4 u8_mask = get_u8_component_mask(config.profile, blk); 552cc1dc7a3Sopenharmony_ci 553cc1dc7a3Sopenharmony_ci // Unpack and compute error for each texel in the partition 554cc1dc7a3Sopenharmony_ci vfloatacc summav = vfloatacc::zero(); 555cc1dc7a3Sopenharmony_ci 556cc1dc7a3Sopenharmony_ci vint lane_id = vint::lane_id(); 557cc1dc7a3Sopenharmony_ci 558cc1dc7a3Sopenharmony_ci unsigned int texel_count = bsd.texel_count; 559cc1dc7a3Sopenharmony_ci for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) 560cc1dc7a3Sopenharmony_ci { 561cc1dc7a3Sopenharmony_ci // Compute EP1 contribution 562cc1dc7a3Sopenharmony_ci vint weight1 = vint::loada(plane1_weights + i); 563cc1dc7a3Sopenharmony_ci vint ep1_r = vint(ep1.lane<0>()) * weight1; 564cc1dc7a3Sopenharmony_ci vint ep1_g = vint(ep1.lane<1>()) * weight1; 565cc1dc7a3Sopenharmony_ci vint ep1_b = vint(ep1.lane<2>()) * weight1; 566cc1dc7a3Sopenharmony_ci vint ep1_a = vint(ep1.lane<3>()) * weight1; 567cc1dc7a3Sopenharmony_ci 568cc1dc7a3Sopenharmony_ci // Compute EP0 contribution 569cc1dc7a3Sopenharmony_ci vint weight0 = vint(64) - weight1; 570cc1dc7a3Sopenharmony_ci vint ep0_r = vint(ep0.lane<0>()) * weight0; 571cc1dc7a3Sopenharmony_ci vint ep0_g = vint(ep0.lane<1>()) * weight0; 572cc1dc7a3Sopenharmony_ci vint ep0_b = vint(ep0.lane<2>()) * weight0; 573cc1dc7a3Sopenharmony_ci vint ep0_a = vint(ep0.lane<3>()) * weight0; 574cc1dc7a3Sopenharmony_ci 575cc1dc7a3Sopenharmony_ci // Combine contributions 576cc1dc7a3Sopenharmony_ci vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)); 577cc1dc7a3Sopenharmony_ci vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)); 578cc1dc7a3Sopenharmony_ci vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)); 579cc1dc7a3Sopenharmony_ci vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)); 580cc1dc7a3Sopenharmony_ci 581cc1dc7a3Sopenharmony_ci // If using a U8 decode mode bit replicate top 8 bits 582cc1dc7a3Sopenharmony_ci // so rest of codec can assume 0xFFFF max range everywhere 583cc1dc7a3Sopenharmony_ci vint colori_r8 = asr<8>(colori_r) * vint(257); 584cc1dc7a3Sopenharmony_ci colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>())); 585cc1dc7a3Sopenharmony_ci 586cc1dc7a3Sopenharmony_ci vint colori_g8 = asr<8>(colori_g) * vint(257); 587cc1dc7a3Sopenharmony_ci colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>())); 588cc1dc7a3Sopenharmony_ci 589cc1dc7a3Sopenharmony_ci vint colori_b8 = asr<8>(colori_b) * vint(257); 590cc1dc7a3Sopenharmony_ci colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>())); 591cc1dc7a3Sopenharmony_ci 592cc1dc7a3Sopenharmony_ci vint colori_a8 = asr<8>(colori_a) * vint(257); 593cc1dc7a3Sopenharmony_ci colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>())); 594cc1dc7a3Sopenharmony_ci 595cc1dc7a3Sopenharmony_ci // Compute color diff 596cc1dc7a3Sopenharmony_ci vfloat color_r = int_to_float(colori_r); 597cc1dc7a3Sopenharmony_ci vfloat color_g = int_to_float(colori_g); 598cc1dc7a3Sopenharmony_ci vfloat color_b = int_to_float(colori_b); 599cc1dc7a3Sopenharmony_ci vfloat color_a = int_to_float(colori_a); 600cc1dc7a3Sopenharmony_ci 601cc1dc7a3Sopenharmony_ci vfloat color_orig_r = loada(blk.data_r + i); 602cc1dc7a3Sopenharmony_ci vfloat color_orig_g = loada(blk.data_g + i); 603cc1dc7a3Sopenharmony_ci vfloat color_orig_b = loada(blk.data_b + i); 604cc1dc7a3Sopenharmony_ci vfloat color_orig_a = loada(blk.data_a + i); 605cc1dc7a3Sopenharmony_ci 606cc1dc7a3Sopenharmony_ci vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f)); 607cc1dc7a3Sopenharmony_ci vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f)); 608cc1dc7a3Sopenharmony_ci vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f)); 609cc1dc7a3Sopenharmony_ci vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f)); 610cc1dc7a3Sopenharmony_ci 611cc1dc7a3Sopenharmony_ci // Compute squared error metric 612cc1dc7a3Sopenharmony_ci color_error_r = color_error_r * color_error_r; 613cc1dc7a3Sopenharmony_ci color_error_g = color_error_g * color_error_g; 614cc1dc7a3Sopenharmony_ci color_error_b = color_error_b * color_error_b; 615cc1dc7a3Sopenharmony_ci color_error_a = color_error_a * color_error_a; 616cc1dc7a3Sopenharmony_ci 617cc1dc7a3Sopenharmony_ci vfloat metric = color_error_r * blk.channel_weight.lane<0>() 618cc1dc7a3Sopenharmony_ci + color_error_g * blk.channel_weight.lane<1>() 619cc1dc7a3Sopenharmony_ci + color_error_b * blk.channel_weight.lane<2>() 620cc1dc7a3Sopenharmony_ci + color_error_a * blk.channel_weight.lane<3>(); 621cc1dc7a3Sopenharmony_ci 622cc1dc7a3Sopenharmony_ci // Mask off bad lanes 623cc1dc7a3Sopenharmony_ci vmask mask = lane_id < vint(texel_count); 624cc1dc7a3Sopenharmony_ci lane_id += vint(ASTCENC_SIMD_WIDTH); 625cc1dc7a3Sopenharmony_ci haccumulate(summav, metric, mask); 626cc1dc7a3Sopenharmony_ci } 627cc1dc7a3Sopenharmony_ci 628cc1dc7a3Sopenharmony_ci return hadd_s(summav); 629cc1dc7a3Sopenharmony_ci} 630cc1dc7a3Sopenharmony_ci 631cc1dc7a3Sopenharmony_ci#endif 632