1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0 2cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 3cc1dc7a3Sopenharmony_ci// Copyright 2011-2024 Arm Limited 4cc1dc7a3Sopenharmony_ci// 5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy 7cc1dc7a3Sopenharmony_ci// of the License at: 8cc1dc7a3Sopenharmony_ci// 9cc1dc7a3Sopenharmony_ci// http://www.apache.org/licenses/LICENSE-2.0 10cc1dc7a3Sopenharmony_ci// 11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software 12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations 15cc1dc7a3Sopenharmony_ci// under the License. 16cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 17cc1dc7a3Sopenharmony_ci 18cc1dc7a3Sopenharmony_ci/** 19cc1dc7a3Sopenharmony_ci * @brief Functions for creating in-memory ASTC image structures. 20cc1dc7a3Sopenharmony_ci */ 21cc1dc7a3Sopenharmony_ci 22cc1dc7a3Sopenharmony_ci#include <cassert> 23cc1dc7a3Sopenharmony_ci#include <cstring> 24cc1dc7a3Sopenharmony_ci 25cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h" 26cc1dc7a3Sopenharmony_ci 27cc1dc7a3Sopenharmony_ci/** 28cc1dc7a3Sopenharmony_ci * @brief Loader pipeline function type for data fetch from memory. 29cc1dc7a3Sopenharmony_ci */ 30cc1dc7a3Sopenharmony_ciusing pixel_loader = vfloat4(*)(const void*, int); 31cc1dc7a3Sopenharmony_ci 32cc1dc7a3Sopenharmony_ci/** 33cc1dc7a3Sopenharmony_ci * @brief Loader pipeline function type for swizzling data in a vector. 
34cc1dc7a3Sopenharmony_ci */ 35cc1dc7a3Sopenharmony_ciusing pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&); 36cc1dc7a3Sopenharmony_ci 37cc1dc7a3Sopenharmony_ci/** 38cc1dc7a3Sopenharmony_ci * @brief Loader pipeline function type for converting data in a vector to LNS. 39cc1dc7a3Sopenharmony_ci */ 40cc1dc7a3Sopenharmony_ciusing pixel_converter = vfloat4(*)(vfloat4, vmask4); 41cc1dc7a3Sopenharmony_ci 42cc1dc7a3Sopenharmony_ci/** 43cc1dc7a3Sopenharmony_ci * @brief Load a 8-bit UNORM texel from a data array. 44cc1dc7a3Sopenharmony_ci * 45cc1dc7a3Sopenharmony_ci * @param data The data pointer. 46cc1dc7a3Sopenharmony_ci * @param base_offset The index offset to the start of the pixel. 47cc1dc7a3Sopenharmony_ci */ 48cc1dc7a3Sopenharmony_cistatic vfloat4 load_texel_u8( 49cc1dc7a3Sopenharmony_ci const void* data, 50cc1dc7a3Sopenharmony_ci int base_offset 51cc1dc7a3Sopenharmony_ci) { 52cc1dc7a3Sopenharmony_ci const uint8_t* data8 = static_cast<const uint8_t*>(data); 53cc1dc7a3Sopenharmony_ci return int_to_float(vint4(data8 + base_offset)) / 255.0f; 54cc1dc7a3Sopenharmony_ci} 55cc1dc7a3Sopenharmony_ci 56cc1dc7a3Sopenharmony_ci/** 57cc1dc7a3Sopenharmony_ci * @brief Load a 16-bit fp16 texel from a data array. 58cc1dc7a3Sopenharmony_ci * 59cc1dc7a3Sopenharmony_ci * @param data The data pointer. 60cc1dc7a3Sopenharmony_ci * @param base_offset The index offset to the start of the pixel. 
61cc1dc7a3Sopenharmony_ci */ 62cc1dc7a3Sopenharmony_cistatic vfloat4 load_texel_f16( 63cc1dc7a3Sopenharmony_ci const void* data, 64cc1dc7a3Sopenharmony_ci int base_offset 65cc1dc7a3Sopenharmony_ci) { 66cc1dc7a3Sopenharmony_ci const uint16_t* data16 = static_cast<const uint16_t*>(data); 67cc1dc7a3Sopenharmony_ci int r = data16[base_offset ]; 68cc1dc7a3Sopenharmony_ci int g = data16[base_offset + 1]; 69cc1dc7a3Sopenharmony_ci int b = data16[base_offset + 2]; 70cc1dc7a3Sopenharmony_ci int a = data16[base_offset + 3]; 71cc1dc7a3Sopenharmony_ci return float16_to_float(vint4(r, g, b, a)); 72cc1dc7a3Sopenharmony_ci} 73cc1dc7a3Sopenharmony_ci 74cc1dc7a3Sopenharmony_ci/** 75cc1dc7a3Sopenharmony_ci * @brief Load a 32-bit float texel from a data array. 76cc1dc7a3Sopenharmony_ci * 77cc1dc7a3Sopenharmony_ci * @param data The data pointer. 78cc1dc7a3Sopenharmony_ci * @param base_offset The index offset to the start of the pixel. 79cc1dc7a3Sopenharmony_ci */ 80cc1dc7a3Sopenharmony_cistatic vfloat4 load_texel_f32( 81cc1dc7a3Sopenharmony_ci const void* data, 82cc1dc7a3Sopenharmony_ci int base_offset 83cc1dc7a3Sopenharmony_ci) { 84cc1dc7a3Sopenharmony_ci const float* data32 = static_cast<const float*>(data); 85cc1dc7a3Sopenharmony_ci return vfloat4(data32 + base_offset); 86cc1dc7a3Sopenharmony_ci} 87cc1dc7a3Sopenharmony_ci 88cc1dc7a3Sopenharmony_ci/** 89cc1dc7a3Sopenharmony_ci * @brief Dummy no-op swizzle function. 90cc1dc7a3Sopenharmony_ci * 91cc1dc7a3Sopenharmony_ci * @param data The source RGBA vector to swizzle. 92cc1dc7a3Sopenharmony_ci * @param swz The swizzle to use. 
93cc1dc7a3Sopenharmony_ci */ 94cc1dc7a3Sopenharmony_cistatic vfloat4 swz_texel_skip( 95cc1dc7a3Sopenharmony_ci vfloat4 data, 96cc1dc7a3Sopenharmony_ci const astcenc_swizzle& swz 97cc1dc7a3Sopenharmony_ci) { 98cc1dc7a3Sopenharmony_ci (void)swz; 99cc1dc7a3Sopenharmony_ci return data; 100cc1dc7a3Sopenharmony_ci} 101cc1dc7a3Sopenharmony_ci 102cc1dc7a3Sopenharmony_ci/** 103cc1dc7a3Sopenharmony_ci * @brief Swizzle a texel into a new arrangement. 104cc1dc7a3Sopenharmony_ci * 105cc1dc7a3Sopenharmony_ci * @param data The source RGBA vector to swizzle. 106cc1dc7a3Sopenharmony_ci * @param swz The swizzle to use. 107cc1dc7a3Sopenharmony_ci */ 108cc1dc7a3Sopenharmony_cistatic vfloat4 swz_texel( 109cc1dc7a3Sopenharmony_ci vfloat4 data, 110cc1dc7a3Sopenharmony_ci const astcenc_swizzle& swz 111cc1dc7a3Sopenharmony_ci) { 112cc1dc7a3Sopenharmony_ci ASTCENC_ALIGNAS float datas[6]; 113cc1dc7a3Sopenharmony_ci 114cc1dc7a3Sopenharmony_ci storea(data, datas); 115cc1dc7a3Sopenharmony_ci datas[ASTCENC_SWZ_0] = 0.0f; 116cc1dc7a3Sopenharmony_ci datas[ASTCENC_SWZ_1] = 1.0f; 117cc1dc7a3Sopenharmony_ci 118cc1dc7a3Sopenharmony_ci return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]); 119cc1dc7a3Sopenharmony_ci} 120cc1dc7a3Sopenharmony_ci 121cc1dc7a3Sopenharmony_ci/** 122cc1dc7a3Sopenharmony_ci * @brief Encode a texel that is entirely LDR linear. 123cc1dc7a3Sopenharmony_ci * 124cc1dc7a3Sopenharmony_ci * @param data The RGBA data to encode. 125cc1dc7a3Sopenharmony_ci * @param lns_mask The mask for the HDR channels than need LNS encoding. 
126cc1dc7a3Sopenharmony_ci */ 127cc1dc7a3Sopenharmony_cistatic vfloat4 encode_texel_unorm( 128cc1dc7a3Sopenharmony_ci vfloat4 data, 129cc1dc7a3Sopenharmony_ci vmask4 lns_mask 130cc1dc7a3Sopenharmony_ci) { 131cc1dc7a3Sopenharmony_ci (void)lns_mask; 132cc1dc7a3Sopenharmony_ci return data * 65535.0f; 133cc1dc7a3Sopenharmony_ci} 134cc1dc7a3Sopenharmony_ci 135cc1dc7a3Sopenharmony_ci/** 136cc1dc7a3Sopenharmony_ci * @brief Encode a texel that includes at least some HDR LNS texels. 137cc1dc7a3Sopenharmony_ci * 138cc1dc7a3Sopenharmony_ci * @param data The RGBA data to encode. 139cc1dc7a3Sopenharmony_ci * @param lns_mask The mask for the HDR channels than need LNS encoding. 140cc1dc7a3Sopenharmony_ci */ 141cc1dc7a3Sopenharmony_cistatic vfloat4 encode_texel_lns( 142cc1dc7a3Sopenharmony_ci vfloat4 data, 143cc1dc7a3Sopenharmony_ci vmask4 lns_mask 144cc1dc7a3Sopenharmony_ci) { 145cc1dc7a3Sopenharmony_ci vfloat4 datav_unorm = data * 65535.0f; 146cc1dc7a3Sopenharmony_ci vfloat4 datav_lns = float_to_lns(data); 147cc1dc7a3Sopenharmony_ci return select(datav_unorm, datav_lns, lns_mask); 148cc1dc7a3Sopenharmony_ci} 149cc1dc7a3Sopenharmony_ci 150cc1dc7a3Sopenharmony_ci/* See header for documentation. 
*/ 151cc1dc7a3Sopenharmony_civoid load_image_block( 152cc1dc7a3Sopenharmony_ci astcenc_profile decode_mode, 153cc1dc7a3Sopenharmony_ci const astcenc_image& img, 154cc1dc7a3Sopenharmony_ci image_block& blk, 155cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 156cc1dc7a3Sopenharmony_ci unsigned int xpos, 157cc1dc7a3Sopenharmony_ci unsigned int ypos, 158cc1dc7a3Sopenharmony_ci unsigned int zpos, 159cc1dc7a3Sopenharmony_ci const astcenc_swizzle& swz 160cc1dc7a3Sopenharmony_ci) { 161cc1dc7a3Sopenharmony_ci unsigned int xsize = img.dim_x; 162cc1dc7a3Sopenharmony_ci unsigned int ysize = img.dim_y; 163cc1dc7a3Sopenharmony_ci unsigned int zsize = img.dim_z; 164cc1dc7a3Sopenharmony_ci 165cc1dc7a3Sopenharmony_ci blk.xpos = xpos; 166cc1dc7a3Sopenharmony_ci blk.ypos = ypos; 167cc1dc7a3Sopenharmony_ci blk.zpos = zpos; 168cc1dc7a3Sopenharmony_ci 169cc1dc7a3Sopenharmony_ci // True if any non-identity swizzle 170cc1dc7a3Sopenharmony_ci bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) || 171cc1dc7a3Sopenharmony_ci (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A); 172cc1dc7a3Sopenharmony_ci 173cc1dc7a3Sopenharmony_ci int idx = 0; 174cc1dc7a3Sopenharmony_ci 175cc1dc7a3Sopenharmony_ci vfloat4 data_min(1e38f); 176cc1dc7a3Sopenharmony_ci vfloat4 data_mean(0.0f); 177cc1dc7a3Sopenharmony_ci vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count)); 178cc1dc7a3Sopenharmony_ci vfloat4 data_max(-1e38f); 179cc1dc7a3Sopenharmony_ci vmask4 grayscalev(true); 180cc1dc7a3Sopenharmony_ci 181cc1dc7a3Sopenharmony_ci // This works because we impose the same choice everywhere during encode 182cc1dc7a3Sopenharmony_ci uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) || 183cc1dc7a3Sopenharmony_ci (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0; 184cc1dc7a3Sopenharmony_ci uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 
1 : 0; 185cc1dc7a3Sopenharmony_ci vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns); 186cc1dc7a3Sopenharmony_ci vmask4 lns_mask = use_lns != vint4::zero(); 187cc1dc7a3Sopenharmony_ci 188cc1dc7a3Sopenharmony_ci // Set up the function pointers for loading pipeline as needed 189cc1dc7a3Sopenharmony_ci pixel_loader loader = load_texel_u8; 190cc1dc7a3Sopenharmony_ci if (img.data_type == ASTCENC_TYPE_F16) 191cc1dc7a3Sopenharmony_ci { 192cc1dc7a3Sopenharmony_ci loader = load_texel_f16; 193cc1dc7a3Sopenharmony_ci } 194cc1dc7a3Sopenharmony_ci else if (img.data_type == ASTCENC_TYPE_F32) 195cc1dc7a3Sopenharmony_ci { 196cc1dc7a3Sopenharmony_ci loader = load_texel_f32; 197cc1dc7a3Sopenharmony_ci } 198cc1dc7a3Sopenharmony_ci 199cc1dc7a3Sopenharmony_ci pixel_swizzler swizzler = swz_texel_skip; 200cc1dc7a3Sopenharmony_ci if (needs_swz) 201cc1dc7a3Sopenharmony_ci { 202cc1dc7a3Sopenharmony_ci swizzler = swz_texel; 203cc1dc7a3Sopenharmony_ci } 204cc1dc7a3Sopenharmony_ci 205cc1dc7a3Sopenharmony_ci pixel_converter converter = encode_texel_unorm; 206cc1dc7a3Sopenharmony_ci if (any(lns_mask)) 207cc1dc7a3Sopenharmony_ci { 208cc1dc7a3Sopenharmony_ci converter = encode_texel_lns; 209cc1dc7a3Sopenharmony_ci } 210cc1dc7a3Sopenharmony_ci 211cc1dc7a3Sopenharmony_ci for (unsigned int z = 0; z < bsd.zdim; z++) 212cc1dc7a3Sopenharmony_ci { 213cc1dc7a3Sopenharmony_ci unsigned int zi = astc::min(zpos + z, zsize - 1); 214cc1dc7a3Sopenharmony_ci void* plane = img.data[zi]; 215cc1dc7a3Sopenharmony_ci 216cc1dc7a3Sopenharmony_ci for (unsigned int y = 0; y < bsd.ydim; y++) 217cc1dc7a3Sopenharmony_ci { 218cc1dc7a3Sopenharmony_ci unsigned int yi = astc::min(ypos + y, ysize - 1); 219cc1dc7a3Sopenharmony_ci 220cc1dc7a3Sopenharmony_ci for (unsigned int x = 0; x < bsd.xdim; x++) 221cc1dc7a3Sopenharmony_ci { 222cc1dc7a3Sopenharmony_ci unsigned int xi = astc::min(xpos + x, xsize - 1); 223cc1dc7a3Sopenharmony_ci 224cc1dc7a3Sopenharmony_ci vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi)); 
225cc1dc7a3Sopenharmony_ci datav = swizzler(datav, swz); 226cc1dc7a3Sopenharmony_ci datav = converter(datav, lns_mask); 227cc1dc7a3Sopenharmony_ci 228cc1dc7a3Sopenharmony_ci // Compute block metadata 229cc1dc7a3Sopenharmony_ci data_min = min(data_min, datav); 230cc1dc7a3Sopenharmony_ci data_mean += datav * data_mean_scale; 231cc1dc7a3Sopenharmony_ci data_max = max(data_max, datav); 232cc1dc7a3Sopenharmony_ci 233cc1dc7a3Sopenharmony_ci grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); 234cc1dc7a3Sopenharmony_ci 235cc1dc7a3Sopenharmony_ci blk.data_r[idx] = datav.lane<0>(); 236cc1dc7a3Sopenharmony_ci blk.data_g[idx] = datav.lane<1>(); 237cc1dc7a3Sopenharmony_ci blk.data_b[idx] = datav.lane<2>(); 238cc1dc7a3Sopenharmony_ci blk.data_a[idx] = datav.lane<3>(); 239cc1dc7a3Sopenharmony_ci 240cc1dc7a3Sopenharmony_ci blk.rgb_lns[idx] = rgb_lns; 241cc1dc7a3Sopenharmony_ci blk.alpha_lns[idx] = a_lns; 242cc1dc7a3Sopenharmony_ci 243cc1dc7a3Sopenharmony_ci idx++; 244cc1dc7a3Sopenharmony_ci } 245cc1dc7a3Sopenharmony_ci } 246cc1dc7a3Sopenharmony_ci } 247cc1dc7a3Sopenharmony_ci 248cc1dc7a3Sopenharmony_ci // Reverse the encoding so we store origin block in the original format 249cc1dc7a3Sopenharmony_ci vfloat4 data_enc = blk.texel(0); 250cc1dc7a3Sopenharmony_ci vfloat4 data_enc_unorm = data_enc / 65535.0f; 251cc1dc7a3Sopenharmony_ci vfloat4 data_enc_lns = vfloat4::zero(); 252cc1dc7a3Sopenharmony_ci 253cc1dc7a3Sopenharmony_ci if (rgb_lns || a_lns) 254cc1dc7a3Sopenharmony_ci { 255cc1dc7a3Sopenharmony_ci data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc))); 256cc1dc7a3Sopenharmony_ci } 257cc1dc7a3Sopenharmony_ci 258cc1dc7a3Sopenharmony_ci blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask); 259cc1dc7a3Sopenharmony_ci 260cc1dc7a3Sopenharmony_ci // Store block metadata 261cc1dc7a3Sopenharmony_ci blk.data_min = data_min; 262cc1dc7a3Sopenharmony_ci blk.data_mean = data_mean; 263cc1dc7a3Sopenharmony_ci blk.data_max = data_max; 
264cc1dc7a3Sopenharmony_ci blk.grayscale = all(grayscalev); 265cc1dc7a3Sopenharmony_ci} 266cc1dc7a3Sopenharmony_ci 267cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 268cc1dc7a3Sopenharmony_civoid load_image_block_fast_ldr( 269cc1dc7a3Sopenharmony_ci astcenc_profile decode_mode, 270cc1dc7a3Sopenharmony_ci const astcenc_image& img, 271cc1dc7a3Sopenharmony_ci image_block& blk, 272cc1dc7a3Sopenharmony_ci const block_size_descriptor& bsd, 273cc1dc7a3Sopenharmony_ci unsigned int xpos, 274cc1dc7a3Sopenharmony_ci unsigned int ypos, 275cc1dc7a3Sopenharmony_ci unsigned int zpos, 276cc1dc7a3Sopenharmony_ci const astcenc_swizzle& swz 277cc1dc7a3Sopenharmony_ci) { 278cc1dc7a3Sopenharmony_ci (void)swz; 279cc1dc7a3Sopenharmony_ci (void)decode_mode; 280cc1dc7a3Sopenharmony_ci 281cc1dc7a3Sopenharmony_ci unsigned int xsize = img.dim_x; 282cc1dc7a3Sopenharmony_ci unsigned int ysize = img.dim_y; 283cc1dc7a3Sopenharmony_ci unsigned int stride = img.dim_stride; 284cc1dc7a3Sopenharmony_ci blk.xpos = xpos; 285cc1dc7a3Sopenharmony_ci blk.ypos = ypos; 286cc1dc7a3Sopenharmony_ci blk.zpos = zpos; 287cc1dc7a3Sopenharmony_ci 288cc1dc7a3Sopenharmony_ci vfloat4 data_min(1e38f); 289cc1dc7a3Sopenharmony_ci vfloat4 data_mean = vfloat4::zero(); 290cc1dc7a3Sopenharmony_ci vfloat4 data_max(-1e38f); 291cc1dc7a3Sopenharmony_ci vmask4 grayscalev(true); 292cc1dc7a3Sopenharmony_ci int idx = 0; 293cc1dc7a3Sopenharmony_ci 294cc1dc7a3Sopenharmony_ci const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]); 295cc1dc7a3Sopenharmony_ci for (unsigned int y = ypos; y < ypos + bsd.ydim; y++) 296cc1dc7a3Sopenharmony_ci { 297cc1dc7a3Sopenharmony_ci unsigned int yi = astc::min(y, ysize - 1); 298cc1dc7a3Sopenharmony_ci 299cc1dc7a3Sopenharmony_ci for (unsigned int x = xpos; x < xpos + bsd.xdim; x++) 300cc1dc7a3Sopenharmony_ci { 301cc1dc7a3Sopenharmony_ci unsigned int xi = astc::min(x, xsize - 1); 302cc1dc7a3Sopenharmony_ci 303cc1dc7a3Sopenharmony_ci vint4 datavi = vint4(plane + (4 * stride * yi) + (4 * 
xi)); 304cc1dc7a3Sopenharmony_ci vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f); 305cc1dc7a3Sopenharmony_ci 306cc1dc7a3Sopenharmony_ci // Compute block metadata 307cc1dc7a3Sopenharmony_ci data_min = min(data_min, datav); 308cc1dc7a3Sopenharmony_ci data_mean += datav; 309cc1dc7a3Sopenharmony_ci data_max = max(data_max, datav); 310cc1dc7a3Sopenharmony_ci 311cc1dc7a3Sopenharmony_ci grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); 312cc1dc7a3Sopenharmony_ci 313cc1dc7a3Sopenharmony_ci blk.data_r[idx] = datav.lane<0>(); 314cc1dc7a3Sopenharmony_ci blk.data_g[idx] = datav.lane<1>(); 315cc1dc7a3Sopenharmony_ci blk.data_b[idx] = datav.lane<2>(); 316cc1dc7a3Sopenharmony_ci blk.data_a[idx] = datav.lane<3>(); 317cc1dc7a3Sopenharmony_ci 318cc1dc7a3Sopenharmony_ci idx++; 319cc1dc7a3Sopenharmony_ci } 320cc1dc7a3Sopenharmony_ci } 321cc1dc7a3Sopenharmony_ci 322cc1dc7a3Sopenharmony_ci // Reverse the encoding so we store origin block in the original format 323cc1dc7a3Sopenharmony_ci blk.origin_texel = blk.texel(0) / 65535.0f; 324cc1dc7a3Sopenharmony_ci 325cc1dc7a3Sopenharmony_ci // Store block metadata 326cc1dc7a3Sopenharmony_ci blk.rgb_lns[0] = 0; 327cc1dc7a3Sopenharmony_ci blk.alpha_lns[0] = 0; 328cc1dc7a3Sopenharmony_ci blk.data_min = data_min; 329cc1dc7a3Sopenharmony_ci blk.data_mean = data_mean / static_cast<float>(bsd.texel_count); 330cc1dc7a3Sopenharmony_ci blk.data_max = data_max; 331cc1dc7a3Sopenharmony_ci blk.grayscale = all(grayscalev); 332cc1dc7a3Sopenharmony_ci} 333cc1dc7a3Sopenharmony_ci 334cc1dc7a3Sopenharmony_ci/* See header for documentation. 
*/
// Writes the texels of blk that fall inside the image back to img at
// (xpos, ypos, zpos), applying the output swizzle. Texels of the block that
// lie beyond the image edge are skipped by advancing the block index past
// them. The write format follows img.data_type (U8, F16, or F32).
//
// NOTE(review): rows are addressed with img.dim_x as the pitch, whereas
// load_image_block_fast_ldr uses img.dim_stride - confirm this is intended.
void store_image_block(
    astcenc_image& img,
    const image_block& blk,
    const block_size_descriptor& bsd,
    unsigned int xpos,
    unsigned int ypos,
    unsigned int zpos,
    const astcenc_swizzle& swz
) {
    // Clip the block footprint against the image on each axis. The "skip"
    // values are how many block texels to step over at the end of a row and
    // at the end of a slice when the block overhangs the image.
    const unsigned int x_size = img.dim_x;
    const unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
    const unsigned int x_count = x_end - xpos;
    const unsigned int x_skip = bsd.xdim - x_count;

    const unsigned int y_size = img.dim_y;
    const unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
    const unsigned int y_count = y_end - ypos;
    const unsigned int y_skip = (bsd.ydim - y_count) * bsd.xdim;

    const unsigned int z_size = img.dim_z;
    const unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);

    // True if any non-identity swizzle
    const bool identity_swz = (swz.r == ASTCENC_SWZ_R) && (swz.g == ASTCENC_SWZ_G) &&
                              (swz.b == ASTCENC_SWZ_B) && (swz.a == ASTCENC_SWZ_A);
    const bool needs_swz = !identity_swz;

    // True if any swizzle uses Z reconstruct
    const bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
                         (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);

    // Scalar swizzle shared by the F16 and F32 paths. Z is rebuilt from the
    // values in table slots 0 and 3 (remapped from [0,1] to [-1,1]) as
    // sqrt(max(0, 1 - x^2 - y^2)) and mapped back to [0,1]. Slots 0 and 3
    // are presumably ASTCENC_SWZ_R and ASTCENC_SWZ_A - the U8 path below
    // uses the R and A channels for the same purpose.
    auto swizzle_scalar = [&swz, needs_z](float r, float g, float b, float a) -> vfloat4
    {
        float table[7];
        table[ASTCENC_SWZ_0] = 0.0f;
        table[ASTCENC_SWZ_1] = 1.0f;
        table[ASTCENC_SWZ_R] = r;
        table[ASTCENC_SWZ_G] = g;
        table[ASTCENC_SWZ_B] = b;
        table[ASTCENC_SWZ_A] = a;

        if (needs_z)
        {
            float xN = (table[0] * 2.0f) - 1.0f;
            float yN = (table[3] * 2.0f) - 1.0f;
            float zN = 1.0f - xN * xN - yN * yN;
            if (zN < 0.0f)
            {
                zN = 0.0f;
            }
            table[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
        }

        return vfloat4(table[swz.r], table[swz.g], table[swz.b], table[swz.a]);
    };

    int idx = 0;

    if (img.data_type == ASTCENC_TYPE_U8)
    {
        const unsigned int simd_width = ASTCENC_SIMD_WIDTH;

        for (unsigned int z = zpos; z < z_end; z++)
        {
            // Fetch the image plane
            uint8_t* plane_out = static_cast<uint8_t*>(img.data[z]);

            for (unsigned int y = ypos; y < y_end; y++)
            {
                uint8_t* row_out = plane_out + (4 * x_size * y) + (4 * xpos);

                for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
                {
                    // The last vector of a row may be only partly used
                    const unsigned int used_texels = astc::min(x_count - x, simd_width);

                    // Unaligned load as rows are not always SIMD_WIDTH long
                    vfloat chan_r(blk.data_r + idx);
                    vfloat chan_g(blk.data_g + idx);
                    vfloat chan_b(blk.data_b + idx);
                    vfloat chan_a(blk.data_a + idx);

                    // Clamp to 1.0 from above, scale to 8 bits, then round
                    vint out_r = float_to_int_rtn(min(chan_r, 1.0f) * 255.0f);
                    vint out_g = float_to_int_rtn(min(chan_g, 1.0f) * 255.0f);
                    vint out_b = float_to_int_rtn(min(chan_b, 1.0f) * 255.0f);
                    vint out_a = float_to_int_rtn(min(chan_a, 1.0f) * 255.0f);

                    if (needs_swz)
                    {
                        vint by_selector[7];
                        by_selector[ASTCENC_SWZ_0] = vint(0);
                        by_selector[ASTCENC_SWZ_1] = vint(255);
                        by_selector[ASTCENC_SWZ_R] = out_r;
                        by_selector[ASTCENC_SWZ_G] = out_g;
                        by_selector[ASTCENC_SWZ_B] = out_b;
                        by_selector[ASTCENC_SWZ_A] = out_a;

                        if (needs_z)
                        {
                            // Same Z reconstruction as the scalar paths, from R and A
                            vfloat norm_x = (chan_r * vfloat(2.0f)) - vfloat(1.0f);
                            vfloat norm_y = (chan_a * vfloat(2.0f)) - vfloat(1.0f);
                            vfloat norm_z = vfloat(1.0f) - (norm_x * norm_x) - (norm_y * norm_y);
                            norm_z = max(norm_z, 0.0f);
                            norm_z = (sqrt(norm_z) * vfloat(0.5f)) + vfloat(0.5f);

                            by_selector[ASTCENC_SWZ_Z] = float_to_int_rtn(min(norm_z, 1.0f) * 255.0f);
                        }

                        out_r = by_selector[swz.r];
                        out_g = by_selector[swz.g];
                        out_b = by_selector[swz.b];
                        out_a = by_selector[swz.a];
                    }

                    // Errors are NaN encoded - convert to magenta error color
                    // Branch is OK here - it is almost never true so predicts well
                    vmask nan_mask = chan_r != chan_r;
                    if (any(nan_mask))
                    {
                        out_r = select(out_r, vint(0xFF), nan_mask);
                        out_g = select(out_g, vint(0x00), nan_mask);
                        out_b = select(out_b, vint(0xFF), nan_mask);
                        out_a = select(out_a, vint(0xFF), nan_mask);
                    }

                    // Only the first used_texels lanes are written out
                    vint packed = interleave_rgba8(out_r, out_g, out_b, out_a);
                    vmask store_mask = vint::lane_id() < vint(used_texels);
                    store_lanes_masked(row_out, packed, store_mask);

                    row_out += ASTCENC_SIMD_WIDTH * 4;
                    idx += used_texels;
                }

                idx += x_skip;
            }

            idx += y_skip;
        }
    }
    else if (img.data_type == ASTCENC_TYPE_F16)
    {
        for (unsigned int z = zpos; z < z_end; z++)
        {
            // Fetch the image plane
            uint16_t* plane_out = static_cast<uint16_t*>(img.data[z]);

            for (unsigned int y = ypos; y < y_end; y++)
            {
                uint16_t* row_out = plane_out + (4 * x_size * y) + (4 * xpos);

                for (unsigned int x = 0; x < x_count; x++)
                {
                    // NaNs are handled inline - no need to special case
                    vfloat4 colorf = needs_swz
                                   ? swizzle_scalar(blk.data_r[idx], blk.data_g[idx],
                                                    blk.data_b[idx], blk.data_a[idx])
                                   : blk.texel(idx);

                    vint4 color = float_to_float16(colorf);

                    // TODO: Vectorize with store N shorts?
                    row_out[0] = static_cast<uint16_t>(color.lane<0>());
                    row_out[1] = static_cast<uint16_t>(color.lane<1>());
                    row_out[2] = static_cast<uint16_t>(color.lane<2>());
                    row_out[3] = static_cast<uint16_t>(color.lane<3>());

                    row_out += 4;
                    idx++;
                }

                idx += x_skip;
            }

            idx += y_skip;
        }
    }
    else // if (img.data_type == ASTCENC_TYPE_F32)
    {
        assert(img.data_type == ASTCENC_TYPE_F32);

        for (unsigned int z = zpos; z < z_end; z++)
        {
            // Fetch the image plane
            float* plane_out = static_cast<float*>(img.data[z]);

            for (unsigned int y = ypos; y < y_end; y++)
            {
                float* row_out = plane_out + (4 * x_size * y) + (4 * xpos);

                for (unsigned int x = 0; x < x_count; x++)
                {
                    vfloat4 color = blk.texel(idx);

                    // NaNs are handled inline - no need to special case
                    if (needs_swz)
                    {
                        color = swizzle_scalar(color.lane<0>(), color.lane<1>(),
                                               color.lane<2>(), color.lane<3>());
                    }

                    store(color, row_out);

                    row_out += 4;
                    idx++;
                }

                idx += x_skip;
            }

            idx += y_skip;
        }
    }
}