1// SPDX-License-Identifier: Apache-2.0 2// ---------------------------------------------------------------------------- 3// Copyright 2011-2024 Arm Limited 4// 5// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6// use this file except in compliance with the License. You may obtain a copy 7// of the License at: 8// 9// http://www.apache.org/licenses/LICENSE-2.0 10// 11// Unless required by applicable law or agreed to in writing, software 12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14// License for the specific language governing permissions and limitations 15// under the License. 16// ---------------------------------------------------------------------------- 17 18/** 19 * @brief Functions for creating in-memory ASTC image structures. 20 */ 21 22#include <cassert> 23#include <cstring> 24 25#include "astcenc_internal.h" 26 27/** 28 * @brief Loader pipeline function type for data fetch from memory. 29 */ 30using pixel_loader = vfloat4(*)(const void*, int); 31 32/** 33 * @brief Loader pipeline function type for swizzling data in a vector. 34 */ 35using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&); 36 37/** 38 * @brief Loader pipeline function type for converting data in a vector to LNS. 39 */ 40using pixel_converter = vfloat4(*)(vfloat4, vmask4); 41 42/** 43 * @brief Load a 8-bit UNORM texel from a data array. 44 * 45 * @param data The data pointer. 46 * @param base_offset The index offset to the start of the pixel. 47 */ 48static vfloat4 load_texel_u8( 49 const void* data, 50 int base_offset 51) { 52 const uint8_t* data8 = static_cast<const uint8_t*>(data); 53 return int_to_float(vint4(data8 + base_offset)) / 255.0f; 54} 55 56/** 57 * @brief Load a 16-bit fp16 texel from a data array. 58 * 59 * @param data The data pointer. 60 * @param base_offset The index offset to the start of the pixel. 
61 */ 62static vfloat4 load_texel_f16( 63 const void* data, 64 int base_offset 65) { 66 const uint16_t* data16 = static_cast<const uint16_t*>(data); 67 int r = data16[base_offset ]; 68 int g = data16[base_offset + 1]; 69 int b = data16[base_offset + 2]; 70 int a = data16[base_offset + 3]; 71 return float16_to_float(vint4(r, g, b, a)); 72} 73 74/** 75 * @brief Load a 32-bit float texel from a data array. 76 * 77 * @param data The data pointer. 78 * @param base_offset The index offset to the start of the pixel. 79 */ 80static vfloat4 load_texel_f32( 81 const void* data, 82 int base_offset 83) { 84 const float* data32 = static_cast<const float*>(data); 85 return vfloat4(data32 + base_offset); 86} 87 88/** 89 * @brief Dummy no-op swizzle function. 90 * 91 * @param data The source RGBA vector to swizzle. 92 * @param swz The swizzle to use. 93 */ 94static vfloat4 swz_texel_skip( 95 vfloat4 data, 96 const astcenc_swizzle& swz 97) { 98 (void)swz; 99 return data; 100} 101 102/** 103 * @brief Swizzle a texel into a new arrangement. 104 * 105 * @param data The source RGBA vector to swizzle. 106 * @param swz The swizzle to use. 107 */ 108static vfloat4 swz_texel( 109 vfloat4 data, 110 const astcenc_swizzle& swz 111) { 112 ASTCENC_ALIGNAS float datas[6]; 113 114 storea(data, datas); 115 datas[ASTCENC_SWZ_0] = 0.0f; 116 datas[ASTCENC_SWZ_1] = 1.0f; 117 118 return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]); 119} 120 121/** 122 * @brief Encode a texel that is entirely LDR linear. 123 * 124 * @param data The RGBA data to encode. 125 * @param lns_mask The mask for the HDR channels than need LNS encoding. 126 */ 127static vfloat4 encode_texel_unorm( 128 vfloat4 data, 129 vmask4 lns_mask 130) { 131 (void)lns_mask; 132 return data * 65535.0f; 133} 134 135/** 136 * @brief Encode a texel that includes at least some HDR LNS texels. 137 * 138 * @param data The RGBA data to encode. 139 * @param lns_mask The mask for the HDR channels than need LNS encoding. 
140 */ 141static vfloat4 encode_texel_lns( 142 vfloat4 data, 143 vmask4 lns_mask 144) { 145 vfloat4 datav_unorm = data * 65535.0f; 146 vfloat4 datav_lns = float_to_lns(data); 147 return select(datav_unorm, datav_lns, lns_mask); 148} 149 150/* See header for documentation. */ 151void load_image_block( 152 astcenc_profile decode_mode, 153 const astcenc_image& img, 154 image_block& blk, 155 const block_size_descriptor& bsd, 156 unsigned int xpos, 157 unsigned int ypos, 158 unsigned int zpos, 159 const astcenc_swizzle& swz 160) { 161 unsigned int xsize = img.dim_x; 162 unsigned int ysize = img.dim_y; 163 unsigned int zsize = img.dim_z; 164 165 blk.xpos = xpos; 166 blk.ypos = ypos; 167 blk.zpos = zpos; 168 169 // True if any non-identity swizzle 170 bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) || 171 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A); 172 173 int idx = 0; 174 175 vfloat4 data_min(1e38f); 176 vfloat4 data_mean(0.0f); 177 vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count)); 178 vfloat4 data_max(-1e38f); 179 vmask4 grayscalev(true); 180 181 // This works because we impose the same choice everywhere during encode 182 uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) || 183 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0; 184 uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 
1 : 0; 185 vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns); 186 vmask4 lns_mask = use_lns != vint4::zero(); 187 188 // Set up the function pointers for loading pipeline as needed 189 pixel_loader loader = load_texel_u8; 190 if (img.data_type == ASTCENC_TYPE_F16) 191 { 192 loader = load_texel_f16; 193 } 194 else if (img.data_type == ASTCENC_TYPE_F32) 195 { 196 loader = load_texel_f32; 197 } 198 199 pixel_swizzler swizzler = swz_texel_skip; 200 if (needs_swz) 201 { 202 swizzler = swz_texel; 203 } 204 205 pixel_converter converter = encode_texel_unorm; 206 if (any(lns_mask)) 207 { 208 converter = encode_texel_lns; 209 } 210 211 for (unsigned int z = 0; z < bsd.zdim; z++) 212 { 213 unsigned int zi = astc::min(zpos + z, zsize - 1); 214 void* plane = img.data[zi]; 215 216 for (unsigned int y = 0; y < bsd.ydim; y++) 217 { 218 unsigned int yi = astc::min(ypos + y, ysize - 1); 219 220 for (unsigned int x = 0; x < bsd.xdim; x++) 221 { 222 unsigned int xi = astc::min(xpos + x, xsize - 1); 223 224 vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi)); 225 datav = swizzler(datav, swz); 226 datav = converter(datav, lns_mask); 227 228 // Compute block metadata 229 data_min = min(data_min, datav); 230 data_mean += datav * data_mean_scale; 231 data_max = max(data_max, datav); 232 233 grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); 234 235 blk.data_r[idx] = datav.lane<0>(); 236 blk.data_g[idx] = datav.lane<1>(); 237 blk.data_b[idx] = datav.lane<2>(); 238 blk.data_a[idx] = datav.lane<3>(); 239 240 blk.rgb_lns[idx] = rgb_lns; 241 blk.alpha_lns[idx] = a_lns; 242 243 idx++; 244 } 245 } 246 } 247 248 // Reverse the encoding so we store origin block in the original format 249 vfloat4 data_enc = blk.texel(0); 250 vfloat4 data_enc_unorm = data_enc / 65535.0f; 251 vfloat4 data_enc_lns = vfloat4::zero(); 252 253 if (rgb_lns || a_lns) 254 { 255 data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc))); 256 } 257 258 blk.origin_texel = 
select(data_enc_unorm, data_enc_lns, lns_mask); 259 260 // Store block metadata 261 blk.data_min = data_min; 262 blk.data_mean = data_mean; 263 blk.data_max = data_max; 264 blk.grayscale = all(grayscalev); 265} 266 267/* See header for documentation. */ 268void load_image_block_fast_ldr( 269 astcenc_profile decode_mode, 270 const astcenc_image& img, 271 image_block& blk, 272 const block_size_descriptor& bsd, 273 unsigned int xpos, 274 unsigned int ypos, 275 unsigned int zpos, 276 const astcenc_swizzle& swz 277) { 278 (void)swz; 279 (void)decode_mode; 280 281 unsigned int xsize = img.dim_x; 282 unsigned int ysize = img.dim_y; 283 unsigned int stride = img.dim_stride; 284 blk.xpos = xpos; 285 blk.ypos = ypos; 286 blk.zpos = zpos; 287 288 vfloat4 data_min(1e38f); 289 vfloat4 data_mean = vfloat4::zero(); 290 vfloat4 data_max(-1e38f); 291 vmask4 grayscalev(true); 292 int idx = 0; 293 294 const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]); 295 for (unsigned int y = ypos; y < ypos + bsd.ydim; y++) 296 { 297 unsigned int yi = astc::min(y, ysize - 1); 298 299 for (unsigned int x = xpos; x < xpos + bsd.xdim; x++) 300 { 301 unsigned int xi = astc::min(x, xsize - 1); 302 303 vint4 datavi = vint4(plane + (4 * stride * yi) + (4 * xi)); 304 vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f); 305 306 // Compute block metadata 307 data_min = min(data_min, datav); 308 data_mean += datav; 309 data_max = max(data_max, datav); 310 311 grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); 312 313 blk.data_r[idx] = datav.lane<0>(); 314 blk.data_g[idx] = datav.lane<1>(); 315 blk.data_b[idx] = datav.lane<2>(); 316 blk.data_a[idx] = datav.lane<3>(); 317 318 idx++; 319 } 320 } 321 322 // Reverse the encoding so we store origin block in the original format 323 blk.origin_texel = blk.texel(0) / 65535.0f; 324 325 // Store block metadata 326 blk.rgb_lns[0] = 0; 327 blk.alpha_lns[0] = 0; 328 blk.data_min = data_min; 329 blk.data_mean = data_mean / 
	                static_cast<float>(bsd.texel_count);
	blk.data_max = data_max;
	blk.grayscale = all(grayscalev);
}

/* See header for documentation. */
void store_image_block(
	astcenc_image& img,
	const image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	// Compute the intersection of the block with the image, and the "nudge"
	// counts of block texels that fall outside the image and must be skipped
	// when walking the block data arrays
	unsigned int x_size = img.dim_x;
	unsigned int x_start = xpos;
	unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
	unsigned int x_count = x_end - x_start;
	unsigned int x_nudge = bsd.xdim - x_count;

	unsigned int y_size = img.dim_y;
	unsigned int y_start = ypos;
	unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
	unsigned int y_count = y_end - y_start;
	unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;

	unsigned int z_size = img.dim_z;
	unsigned int z_start = zpos;
	unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);

	// True if any non-identity swizzle
	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);

	// True if any swizzle uses Z reconstruct
	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);

	int idx = 0;
	if (img.data_type == ASTCENC_TYPE_U8)
	{
		// 8-bit UNORM output: written SIMD_WIDTH texels at a time
		for (unsigned int z = z_start; z < z_end; z++)
		{
			// Fetch the image plane
			uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);

			for (unsigned int y = y_start; y < y_end; y++)
			{
				uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);

				for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
				{
					// Partial vectors at the row end are handled by masking
					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
					unsigned int used_texels = astc::min(x_count - x, max_texels);

					// Unaligned load as rows are not always SIMD_WIDTH long
					vfloat data_r(blk.data_r + idx);
					vfloat data_g(blk.data_g + idx);
					vfloat data_b(blk.data_b + idx);
					vfloat data_a(blk.data_a + idx);

					// Clamp at 1.0 then quantize to 8-bit ("rtn" = round-to-nearest)
					vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
					vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
					vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
					vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);

					if (needs_swz)
					{
						// Table is indexed directly by the swizzle selector enums
						vint swizzle_table[7];
						swizzle_table[ASTCENC_SWZ_0] = vint(0);
						swizzle_table[ASTCENC_SWZ_1] = vint(255);
						swizzle_table[ASTCENC_SWZ_R] = data_ri;
						swizzle_table[ASTCENC_SWZ_G] = data_gi;
						swizzle_table[ASTCENC_SWZ_B] = data_bi;
						swizzle_table[ASTCENC_SWZ_A] = data_ai;

						if (needs_z)
						{
							// Reconstruct Z from X stored in R and Y stored in A:
							// map [0, 1] storage back to [-1, 1], clamp negative
							// radicands from quantization error, then map the
							// sqrt result back into [0, 1] for storage
							vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
							vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
							vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
							data_z = max(data_z, 0.0f);
							data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);

							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
						}

						data_ri = swizzle_table[swz.r];
						data_gi = swizzle_table[swz.g];
						data_bi = swizzle_table[swz.b];
						data_ai = swizzle_table[swz.a];
					}

					// Errors are NaN encoded - convert to magenta error color
					// Branch is OK here - it is almost never true so predicts well
					vmask nan_mask = data_r != data_r;
					if (any(nan_mask))
					{
						data_ri = select(data_ri, vint(0xFF), nan_mask);
						data_gi = select(data_gi, vint(0x00), nan_mask);
						data_bi = select(data_bi, vint(0xFF), nan_mask);
						data_ai = select(data_ai, vint(0xFF), nan_mask);
					}

					// Pack to interleaved RGBA bytes and store only the live lanes
					vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
					vmask store_mask = vint::lane_id() < vint(used_texels);
					store_lanes_masked(data8_row, data_rgbai, store_mask);

					data8_row += ASTCENC_SIMD_WIDTH * 4;
					idx += used_texels;
				}
				idx += x_nudge;
			}
			idx += y_nudge;
		}
	}
	else if (img.data_type == ASTCENC_TYPE_F16)
	{
		// 16-bit float output: written one texel at a time
		for (unsigned int z = z_start; z < z_end; z++)
		{
			// Fetch the image plane
			uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);

			for (unsigned int y = y_start; y < y_end; y++)
			{
				uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);

				for (unsigned int x = 0; x < x_count; x++)
				{
					vint4 color;

					// NaNs are handled inline - no need to special case
					if (needs_swz)
					{
						// Scalar swizzle table, indexed by the selector enums
						float data[7];
						data[ASTCENC_SWZ_0] = 0.0f;
						data[ASTCENC_SWZ_1] = 1.0f;
						data[ASTCENC_SWZ_R] = blk.data_r[idx];
						data[ASTCENC_SWZ_G] = blk.data_g[idx];
						data[ASTCENC_SWZ_B] = blk.data_b[idx];
						data[ASTCENC_SWZ_A] = blk.data_a[idx];

						if (needs_z)
						{
							// Reconstruct Z from X stored in R and Y stored in A
							float xN = (data[0] * 2.0f) - 1.0f;
							float yN = (data[3] * 2.0f) - 1.0f;
							float zN = 1.0f - xN * xN - yN * yN;
							if (zN < 0.0f)
							{
								zN = 0.0f;
							}
							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
						}

						vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
						color = float_to_float16(colorf);
					}
					else
					{
						vfloat4 colorf = blk.texel(idx);
						color = float_to_float16(colorf);
					}

					// TODO: Vectorize with store N shorts?
					data16_row[0] = static_cast<uint16_t>(color.lane<0>());
					data16_row[1] = static_cast<uint16_t>(color.lane<1>());
					data16_row[2] = static_cast<uint16_t>(color.lane<2>());
					data16_row[3] = static_cast<uint16_t>(color.lane<3>());
					data16_row += 4;
					idx++;
				}
				idx += x_nudge;
			}
			idx += y_nudge;
		}
	}
	else // if (img.data_type == ASTCENC_TYPE_F32)
	{
		assert(img.data_type == ASTCENC_TYPE_F32);

		// 32-bit float output: written one texel at a time
		for (unsigned int z = z_start; z < z_end; z++)
		{
			// Fetch the image plane
			float* data32 = static_cast<float*>(img.data[z]);

			for (unsigned int y = y_start; y < y_end; y++)
			{
				float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);

				for (unsigned int x = 0; x < x_count; x++)
				{
					vfloat4 color = blk.texel(idx);

					// NaNs are handled inline - no need to special case
					if (needs_swz)
					{
						// Scalar swizzle table, indexed by the selector enums
						float data[7];
						data[ASTCENC_SWZ_0] = 0.0f;
						data[ASTCENC_SWZ_1] = 1.0f;
						data[ASTCENC_SWZ_R] = color.lane<0>();
						data[ASTCENC_SWZ_G] = color.lane<1>();
						data[ASTCENC_SWZ_B] = color.lane<2>();
						data[ASTCENC_SWZ_A] = color.lane<3>();

						if (needs_z)
						{
							// Reconstruct Z from X stored in R and Y stored in A
							float xN = (data[0] * 2.0f) - 1.0f;
							float yN = (data[3] * 2.0f) - 1.0f;
							float zN = 1.0f - xN * xN - yN * yN;
							if (zN < 0.0f)
							{
								zN = 0.0f;
							}
							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
						}

						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
					}

					store(color, data32_row);
					data32_row += 4;
					idx++;
				}
				idx += x_nudge;
			}
			idx += y_nudge;
		}
	}
}