1// SPDX-License-Identifier: Apache-2.0
2// ----------------------------------------------------------------------------
3// Copyright 2011-2024 Arm Limited
4//
5// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6// use this file except in compliance with the License. You may obtain a copy
7// of the License at:
8//
9//     http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14// License for the specific language governing permissions and limitations
15// under the License.
16// ----------------------------------------------------------------------------
17
18/**
19 * @brief Functions for creating in-memory ASTC image structures.
20 */
21
22#include <cassert>
23#include <cstring>
24
25#include "astcenc_internal.h"
26
/**
 * @brief Loader pipeline function type for data fetch from memory.
 *
 * Implementations receive the base pointer of the source data and an index
 * offset to the first component of the texel, counted in elements of the
 * source data type, and return the loaded texel as a four component vector.
 */
using pixel_loader = vfloat4(*)(const void*, int);

/**
 * @brief Loader pipeline function type for swizzling data in a vector.
 *
 * Implementations receive the loaded texel and the swizzle to apply, and
 * return the rearranged texel.
 */
using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);

/**
 * @brief Loader pipeline function type for converting data in a vector to LNS.
 *
 * Implementations receive the swizzled texel and a per-component mask that
 * selects the components to LNS encode, and return the encoded texel.
 */
using pixel_converter = vfloat4(*)(vfloat4, vmask4);
41
42/**
43 * @brief Load a 8-bit UNORM texel from a data array.
44 *
45 * @param data          The data pointer.
46 * @param base_offset   The index offset to the start of the pixel.
47 */
48static vfloat4 load_texel_u8(
49	const void* data,
50	int base_offset
51) {
52	const uint8_t* data8 = static_cast<const uint8_t*>(data);
53	return int_to_float(vint4(data8 + base_offset)) / 255.0f;
54}
55
56/**
57 * @brief Load a 16-bit fp16 texel from a data array.
58 *
59 * @param data          The data pointer.
60 * @param base_offset   The index offset to the start of the pixel.
61 */
62static vfloat4 load_texel_f16(
63	const void* data,
64	int base_offset
65) {
66	const uint16_t* data16 = static_cast<const uint16_t*>(data);
67	int r = data16[base_offset    ];
68	int g = data16[base_offset + 1];
69	int b = data16[base_offset + 2];
70	int a = data16[base_offset + 3];
71	return float16_to_float(vint4(r, g, b, a));
72}
73
74/**
75 * @brief Load a 32-bit float texel from a data array.
76 *
77 * @param data          The data pointer.
78 * @param base_offset   The index offset to the start of the pixel.
79 */
80static vfloat4 load_texel_f32(
81	const void* data,
82	int base_offset
83) {
84	const float* data32 = static_cast<const float*>(data);
85	return vfloat4(data32 + base_offset);
86}
87
88/**
89 * @brief Dummy no-op swizzle function.
90 *
91 * @param data   The source RGBA vector to swizzle.
92 * @param swz    The swizzle to use.
93 */
94static vfloat4 swz_texel_skip(
95	vfloat4 data,
96	const astcenc_swizzle& swz
97) {
98	(void)swz;
99	return data;
100}
101
102/**
103 * @brief Swizzle a texel into a new arrangement.
104 *
105 * @param data   The source RGBA vector to swizzle.
106 * @param swz    The swizzle to use.
107 */
108static vfloat4 swz_texel(
109	vfloat4 data,
110	const astcenc_swizzle& swz
111) {
112	ASTCENC_ALIGNAS float datas[6];
113
114	storea(data, datas);
115	datas[ASTCENC_SWZ_0] = 0.0f;
116	datas[ASTCENC_SWZ_1] = 1.0f;
117
118	return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
119}
120
121/**
122 * @brief Encode a texel that is entirely LDR linear.
123 *
124 * @param data       The RGBA data to encode.
125 * @param lns_mask   The mask for the HDR channels than need LNS encoding.
126 */
127static vfloat4 encode_texel_unorm(
128	vfloat4 data,
129	vmask4 lns_mask
130) {
131	(void)lns_mask;
132	return data * 65535.0f;
133}
134
135/**
136 * @brief Encode a texel that includes at least some HDR LNS texels.
137 *
138 * @param data       The RGBA data to encode.
139 * @param lns_mask   The mask for the HDR channels than need LNS encoding.
140 */
141static vfloat4 encode_texel_lns(
142	vfloat4 data,
143	vmask4 lns_mask
144) {
145	vfloat4 datav_unorm = data * 65535.0f;
146	vfloat4 datav_lns = float_to_lns(data);
147	return select(datav_unorm, datav_lns, lns_mask);
148}
149
150/* See header for documentation. */
151void load_image_block(
152	astcenc_profile decode_mode,
153	const astcenc_image& img,
154	image_block& blk,
155	const block_size_descriptor& bsd,
156	unsigned int xpos,
157	unsigned int ypos,
158	unsigned int zpos,
159	const astcenc_swizzle& swz
160) {
161	unsigned int xsize = img.dim_x;
162	unsigned int ysize = img.dim_y;
163	unsigned int zsize = img.dim_z;
164
165	blk.xpos = xpos;
166	blk.ypos = ypos;
167	blk.zpos = zpos;
168
169	// True if any non-identity swizzle
170	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
171	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
172
173	int idx = 0;
174
175	vfloat4 data_min(1e38f);
176	vfloat4 data_mean(0.0f);
177	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
178	vfloat4 data_max(-1e38f);
179	vmask4 grayscalev(true);
180
181	// This works because we impose the same choice everywhere during encode
182	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
183	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
184	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
185	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
186	vmask4 lns_mask = use_lns != vint4::zero();
187
188	// Set up the function pointers for loading pipeline as needed
189	pixel_loader loader = load_texel_u8;
190	if (img.data_type == ASTCENC_TYPE_F16)
191	{
192		loader = load_texel_f16;
193	}
194	else if  (img.data_type == ASTCENC_TYPE_F32)
195	{
196		loader = load_texel_f32;
197	}
198
199	pixel_swizzler swizzler = swz_texel_skip;
200	if (needs_swz)
201	{
202		swizzler = swz_texel;
203	}
204
205	pixel_converter converter = encode_texel_unorm;
206	if (any(lns_mask))
207	{
208		converter = encode_texel_lns;
209	}
210
211	for (unsigned int z = 0; z < bsd.zdim; z++)
212	{
213		unsigned int zi = astc::min(zpos + z, zsize - 1);
214		void* plane = img.data[zi];
215
216		for (unsigned int y = 0; y < bsd.ydim; y++)
217		{
218			unsigned int yi = astc::min(ypos + y, ysize - 1);
219
220			for (unsigned int x = 0; x < bsd.xdim; x++)
221			{
222				unsigned int xi = astc::min(xpos + x, xsize - 1);
223
224				vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
225				datav = swizzler(datav, swz);
226				datav = converter(datav, lns_mask);
227
228				// Compute block metadata
229				data_min = min(data_min, datav);
230				data_mean += datav * data_mean_scale;
231				data_max = max(data_max, datav);
232
233				grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
234
235				blk.data_r[idx] = datav.lane<0>();
236				blk.data_g[idx] = datav.lane<1>();
237				blk.data_b[idx] = datav.lane<2>();
238				blk.data_a[idx] = datav.lane<3>();
239
240				blk.rgb_lns[idx] = rgb_lns;
241				blk.alpha_lns[idx] = a_lns;
242
243				idx++;
244			}
245		}
246	}
247
248	// Reverse the encoding so we store origin block in the original format
249	vfloat4 data_enc = blk.texel(0);
250	vfloat4 data_enc_unorm = data_enc / 65535.0f;
251	vfloat4 data_enc_lns = vfloat4::zero();
252
253	if (rgb_lns || a_lns)
254	{
255		data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
256	}
257
258	blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
259
260	// Store block metadata
261	blk.data_min = data_min;
262	blk.data_mean = data_mean;
263	blk.data_max = data_max;
264	blk.grayscale = all(grayscalev);
265}
266
267/* See header for documentation. */
268void load_image_block_fast_ldr(
269	astcenc_profile decode_mode,
270	const astcenc_image& img,
271	image_block& blk,
272	const block_size_descriptor& bsd,
273	unsigned int xpos,
274	unsigned int ypos,
275	unsigned int zpos,
276	const astcenc_swizzle& swz
277) {
278	(void)swz;
279	(void)decode_mode;
280
281	unsigned int xsize = img.dim_x;
282	unsigned int ysize = img.dim_y;
283	unsigned int stride = img.dim_stride;
284	blk.xpos = xpos;
285	blk.ypos = ypos;
286	blk.zpos = zpos;
287
288	vfloat4 data_min(1e38f);
289	vfloat4 data_mean = vfloat4::zero();
290	vfloat4 data_max(-1e38f);
291	vmask4 grayscalev(true);
292	int idx = 0;
293
294	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
295	for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
296	{
297		unsigned int yi = astc::min(y, ysize - 1);
298
299		for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
300		{
301			unsigned int xi = astc::min(x, xsize - 1);
302
303			vint4 datavi = vint4(plane + (4 * stride * yi) + (4 * xi));
304			vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
305
306			// Compute block metadata
307			data_min = min(data_min, datav);
308			data_mean += datav;
309			data_max = max(data_max, datav);
310
311			grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
312
313			blk.data_r[idx] = datav.lane<0>();
314			blk.data_g[idx] = datav.lane<1>();
315			blk.data_b[idx] = datav.lane<2>();
316			blk.data_a[idx] = datav.lane<3>();
317
318			idx++;
319		}
320	}
321
322	// Reverse the encoding so we store origin block in the original format
323	blk.origin_texel = blk.texel(0) / 65535.0f;
324
325	// Store block metadata
326	blk.rgb_lns[0] = 0;
327	blk.alpha_lns[0] = 0;
328	blk.data_min = data_min;
329	blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
330	blk.data_max = data_max;
331	blk.grayscale = all(grayscalev);
332}
333
334/* See header for documentation. */
335void store_image_block(
336	astcenc_image& img,
337	const image_block& blk,
338	const block_size_descriptor& bsd,
339	unsigned int xpos,
340	unsigned int ypos,
341	unsigned int zpos,
342	const astcenc_swizzle& swz
343) {
344	unsigned int x_size = img.dim_x;
345	unsigned int x_start = xpos;
346	unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
347	unsigned int x_count = x_end - x_start;
348	unsigned int x_nudge = bsd.xdim - x_count;
349
350	unsigned int y_size = img.dim_y;
351	unsigned int y_start = ypos;
352	unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
353	unsigned int y_count = y_end - y_start;
354	unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
355
356	unsigned int z_size = img.dim_z;
357	unsigned int z_start = zpos;
358	unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
359
360	// True if any non-identity swizzle
361	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
362	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
363
364	// True if any swizzle uses Z reconstruct
365	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
366	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
367
368	int idx = 0;
369	if (img.data_type == ASTCENC_TYPE_U8)
370	{
371		for (unsigned int z = z_start; z < z_end; z++)
372		{
373			// Fetch the image plane
374			uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
375
376			for (unsigned int y = y_start; y < y_end; y++)
377			{
378				uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
379
380				for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
381				{
382					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
383					unsigned int used_texels = astc::min(x_count - x, max_texels);
384
385					// Unaligned load as rows are not always SIMD_WIDTH long
386					vfloat data_r(blk.data_r + idx);
387					vfloat data_g(blk.data_g + idx);
388					vfloat data_b(blk.data_b + idx);
389					vfloat data_a(blk.data_a + idx);
390
391					vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
392					vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
393					vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
394					vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
395
396					if (needs_swz)
397					{
398						vint swizzle_table[7];
399						swizzle_table[ASTCENC_SWZ_0] = vint(0);
400						swizzle_table[ASTCENC_SWZ_1] = vint(255);
401						swizzle_table[ASTCENC_SWZ_R] = data_ri;
402						swizzle_table[ASTCENC_SWZ_G] = data_gi;
403						swizzle_table[ASTCENC_SWZ_B] = data_bi;
404						swizzle_table[ASTCENC_SWZ_A] = data_ai;
405
406						if (needs_z)
407						{
408							vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
409							vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
410							vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
411							data_z = max(data_z, 0.0f);
412							data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
413
414							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
415						}
416
417						data_ri = swizzle_table[swz.r];
418						data_gi = swizzle_table[swz.g];
419						data_bi = swizzle_table[swz.b];
420						data_ai = swizzle_table[swz.a];
421					}
422
423					// Errors are NaN encoded - convert to magenta error color
424					// Branch is OK here - it is almost never true so predicts well
425					vmask nan_mask = data_r != data_r;
426					if (any(nan_mask))
427					{
428						data_ri = select(data_ri, vint(0xFF), nan_mask);
429						data_gi = select(data_gi, vint(0x00), nan_mask);
430						data_bi = select(data_bi, vint(0xFF), nan_mask);
431						data_ai = select(data_ai, vint(0xFF), nan_mask);
432					}
433
434					vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
435					vmask store_mask = vint::lane_id() < vint(used_texels);
436					store_lanes_masked(data8_row, data_rgbai, store_mask);
437
438					data8_row += ASTCENC_SIMD_WIDTH * 4;
439					idx += used_texels;
440				}
441				idx += x_nudge;
442			}
443			idx += y_nudge;
444		}
445	}
446	else if (img.data_type == ASTCENC_TYPE_F16)
447	{
448		for (unsigned int z = z_start; z < z_end; z++)
449		{
450			// Fetch the image plane
451			uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
452
453			for (unsigned int y = y_start; y < y_end; y++)
454			{
455				uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
456
457				for (unsigned int x = 0; x < x_count; x++)
458				{
459					vint4 color;
460
461					// NaNs are handled inline - no need to special case
462					if (needs_swz)
463					{
464						float data[7];
465						data[ASTCENC_SWZ_0] = 0.0f;
466						data[ASTCENC_SWZ_1] = 1.0f;
467						data[ASTCENC_SWZ_R] = blk.data_r[idx];
468						data[ASTCENC_SWZ_G] = blk.data_g[idx];
469						data[ASTCENC_SWZ_B] = blk.data_b[idx];
470						data[ASTCENC_SWZ_A] = blk.data_a[idx];
471
472						if (needs_z)
473						{
474							float xN = (data[0] * 2.0f) - 1.0f;
475							float yN = (data[3] * 2.0f) - 1.0f;
476							float zN = 1.0f - xN * xN - yN * yN;
477							if (zN < 0.0f)
478							{
479								zN = 0.0f;
480							}
481							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
482						}
483
484						vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
485						color = float_to_float16(colorf);
486					}
487					else
488					{
489						vfloat4 colorf = blk.texel(idx);
490						color = float_to_float16(colorf);
491					}
492
493					// TODO: Vectorize with store N shorts?
494					data16_row[0] = static_cast<uint16_t>(color.lane<0>());
495					data16_row[1] = static_cast<uint16_t>(color.lane<1>());
496					data16_row[2] = static_cast<uint16_t>(color.lane<2>());
497					data16_row[3] = static_cast<uint16_t>(color.lane<3>());
498					data16_row += 4;
499					idx++;
500				}
501				idx += x_nudge;
502			}
503			idx += y_nudge;
504		}
505	}
506	else // if (img.data_type == ASTCENC_TYPE_F32)
507	{
508		assert(img.data_type == ASTCENC_TYPE_F32);
509
510		for (unsigned int z = z_start; z < z_end; z++)
511		{
512			// Fetch the image plane
513			float* data32 = static_cast<float*>(img.data[z]);
514
515			for (unsigned int y = y_start; y < y_end; y++)
516			{
517				float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
518
519				for (unsigned int x = 0; x < x_count; x++)
520				{
521					vfloat4 color = blk.texel(idx);
522
523					// NaNs are handled inline - no need to special case
524					if (needs_swz)
525					{
526						float data[7];
527						data[ASTCENC_SWZ_0] = 0.0f;
528						data[ASTCENC_SWZ_1] = 1.0f;
529						data[ASTCENC_SWZ_R] = color.lane<0>();
530						data[ASTCENC_SWZ_G] = color.lane<1>();
531						data[ASTCENC_SWZ_B] = color.lane<2>();
532						data[ASTCENC_SWZ_A] = color.lane<3>();
533
534						if (needs_z)
535						{
536							float xN = (data[0] * 2.0f) - 1.0f;
537							float yN = (data[3] * 2.0f) - 1.0f;
538							float zN = 1.0f - xN * xN - yN * yN;
539							if (zN < 0.0f)
540							{
541								zN = 0.0f;
542							}
543							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
544						}
545
546						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
547					}
548
549					store(color, data32_row);
550					data32_row += 4;
551					idx++;
552				}
553				idx += x_nudge;
554			}
555			idx += y_nudge;
556		}
557	}
558}
559