// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
17cc1dc7a3Sopenharmony_ci
/**
 * @brief Functions for creating in-memory ASTC image structures.
 */
21cc1dc7a3Sopenharmony_ci
22cc1dc7a3Sopenharmony_ci#include <cassert>
23cc1dc7a3Sopenharmony_ci#include <cstring>
24cc1dc7a3Sopenharmony_ci
25cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h"
26cc1dc7a3Sopenharmony_ci
27cc1dc7a3Sopenharmony_ci/**
28cc1dc7a3Sopenharmony_ci * @brief Loader pipeline function type for data fetch from memory.
29cc1dc7a3Sopenharmony_ci */
30cc1dc7a3Sopenharmony_ciusing pixel_loader = vfloat4(*)(const void*, int);
31cc1dc7a3Sopenharmony_ci
32cc1dc7a3Sopenharmony_ci/**
33cc1dc7a3Sopenharmony_ci * @brief Loader pipeline function type for swizzling data in a vector.
34cc1dc7a3Sopenharmony_ci */
35cc1dc7a3Sopenharmony_ciusing pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
36cc1dc7a3Sopenharmony_ci
37cc1dc7a3Sopenharmony_ci/**
38cc1dc7a3Sopenharmony_ci * @brief Loader pipeline function type for converting data in a vector to LNS.
39cc1dc7a3Sopenharmony_ci */
40cc1dc7a3Sopenharmony_ciusing pixel_converter = vfloat4(*)(vfloat4, vmask4);
41cc1dc7a3Sopenharmony_ci
42cc1dc7a3Sopenharmony_ci/**
43cc1dc7a3Sopenharmony_ci * @brief Load a 8-bit UNORM texel from a data array.
44cc1dc7a3Sopenharmony_ci *
45cc1dc7a3Sopenharmony_ci * @param data          The data pointer.
46cc1dc7a3Sopenharmony_ci * @param base_offset   The index offset to the start of the pixel.
47cc1dc7a3Sopenharmony_ci */
48cc1dc7a3Sopenharmony_cistatic vfloat4 load_texel_u8(
49cc1dc7a3Sopenharmony_ci	const void* data,
50cc1dc7a3Sopenharmony_ci	int base_offset
51cc1dc7a3Sopenharmony_ci) {
52cc1dc7a3Sopenharmony_ci	const uint8_t* data8 = static_cast<const uint8_t*>(data);
53cc1dc7a3Sopenharmony_ci	return int_to_float(vint4(data8 + base_offset)) / 255.0f;
54cc1dc7a3Sopenharmony_ci}
55cc1dc7a3Sopenharmony_ci
56cc1dc7a3Sopenharmony_ci/**
57cc1dc7a3Sopenharmony_ci * @brief Load a 16-bit fp16 texel from a data array.
58cc1dc7a3Sopenharmony_ci *
59cc1dc7a3Sopenharmony_ci * @param data          The data pointer.
60cc1dc7a3Sopenharmony_ci * @param base_offset   The index offset to the start of the pixel.
61cc1dc7a3Sopenharmony_ci */
62cc1dc7a3Sopenharmony_cistatic vfloat4 load_texel_f16(
63cc1dc7a3Sopenharmony_ci	const void* data,
64cc1dc7a3Sopenharmony_ci	int base_offset
65cc1dc7a3Sopenharmony_ci) {
66cc1dc7a3Sopenharmony_ci	const uint16_t* data16 = static_cast<const uint16_t*>(data);
67cc1dc7a3Sopenharmony_ci	int r = data16[base_offset    ];
68cc1dc7a3Sopenharmony_ci	int g = data16[base_offset + 1];
69cc1dc7a3Sopenharmony_ci	int b = data16[base_offset + 2];
70cc1dc7a3Sopenharmony_ci	int a = data16[base_offset + 3];
71cc1dc7a3Sopenharmony_ci	return float16_to_float(vint4(r, g, b, a));
72cc1dc7a3Sopenharmony_ci}
73cc1dc7a3Sopenharmony_ci
74cc1dc7a3Sopenharmony_ci/**
75cc1dc7a3Sopenharmony_ci * @brief Load a 32-bit float texel from a data array.
76cc1dc7a3Sopenharmony_ci *
77cc1dc7a3Sopenharmony_ci * @param data          The data pointer.
78cc1dc7a3Sopenharmony_ci * @param base_offset   The index offset to the start of the pixel.
79cc1dc7a3Sopenharmony_ci */
80cc1dc7a3Sopenharmony_cistatic vfloat4 load_texel_f32(
81cc1dc7a3Sopenharmony_ci	const void* data,
82cc1dc7a3Sopenharmony_ci	int base_offset
83cc1dc7a3Sopenharmony_ci) {
84cc1dc7a3Sopenharmony_ci	const float* data32 = static_cast<const float*>(data);
85cc1dc7a3Sopenharmony_ci	return vfloat4(data32 + base_offset);
86cc1dc7a3Sopenharmony_ci}
87cc1dc7a3Sopenharmony_ci
88cc1dc7a3Sopenharmony_ci/**
89cc1dc7a3Sopenharmony_ci * @brief Dummy no-op swizzle function.
90cc1dc7a3Sopenharmony_ci *
91cc1dc7a3Sopenharmony_ci * @param data   The source RGBA vector to swizzle.
92cc1dc7a3Sopenharmony_ci * @param swz    The swizzle to use.
93cc1dc7a3Sopenharmony_ci */
94cc1dc7a3Sopenharmony_cistatic vfloat4 swz_texel_skip(
95cc1dc7a3Sopenharmony_ci	vfloat4 data,
96cc1dc7a3Sopenharmony_ci	const astcenc_swizzle& swz
97cc1dc7a3Sopenharmony_ci) {
98cc1dc7a3Sopenharmony_ci	(void)swz;
99cc1dc7a3Sopenharmony_ci	return data;
100cc1dc7a3Sopenharmony_ci}
101cc1dc7a3Sopenharmony_ci
102cc1dc7a3Sopenharmony_ci/**
103cc1dc7a3Sopenharmony_ci * @brief Swizzle a texel into a new arrangement.
104cc1dc7a3Sopenharmony_ci *
105cc1dc7a3Sopenharmony_ci * @param data   The source RGBA vector to swizzle.
106cc1dc7a3Sopenharmony_ci * @param swz    The swizzle to use.
107cc1dc7a3Sopenharmony_ci */
108cc1dc7a3Sopenharmony_cistatic vfloat4 swz_texel(
109cc1dc7a3Sopenharmony_ci	vfloat4 data,
110cc1dc7a3Sopenharmony_ci	const astcenc_swizzle& swz
111cc1dc7a3Sopenharmony_ci) {
112cc1dc7a3Sopenharmony_ci	ASTCENC_ALIGNAS float datas[6];
113cc1dc7a3Sopenharmony_ci
114cc1dc7a3Sopenharmony_ci	storea(data, datas);
115cc1dc7a3Sopenharmony_ci	datas[ASTCENC_SWZ_0] = 0.0f;
116cc1dc7a3Sopenharmony_ci	datas[ASTCENC_SWZ_1] = 1.0f;
117cc1dc7a3Sopenharmony_ci
118cc1dc7a3Sopenharmony_ci	return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
119cc1dc7a3Sopenharmony_ci}
120cc1dc7a3Sopenharmony_ci
121cc1dc7a3Sopenharmony_ci/**
122cc1dc7a3Sopenharmony_ci * @brief Encode a texel that is entirely LDR linear.
123cc1dc7a3Sopenharmony_ci *
124cc1dc7a3Sopenharmony_ci * @param data       The RGBA data to encode.
125cc1dc7a3Sopenharmony_ci * @param lns_mask   The mask for the HDR channels than need LNS encoding.
126cc1dc7a3Sopenharmony_ci */
127cc1dc7a3Sopenharmony_cistatic vfloat4 encode_texel_unorm(
128cc1dc7a3Sopenharmony_ci	vfloat4 data,
129cc1dc7a3Sopenharmony_ci	vmask4 lns_mask
130cc1dc7a3Sopenharmony_ci) {
131cc1dc7a3Sopenharmony_ci	(void)lns_mask;
132cc1dc7a3Sopenharmony_ci	return data * 65535.0f;
133cc1dc7a3Sopenharmony_ci}
134cc1dc7a3Sopenharmony_ci
135cc1dc7a3Sopenharmony_ci/**
136cc1dc7a3Sopenharmony_ci * @brief Encode a texel that includes at least some HDR LNS texels.
137cc1dc7a3Sopenharmony_ci *
138cc1dc7a3Sopenharmony_ci * @param data       The RGBA data to encode.
139cc1dc7a3Sopenharmony_ci * @param lns_mask   The mask for the HDR channels than need LNS encoding.
140cc1dc7a3Sopenharmony_ci */
141cc1dc7a3Sopenharmony_cistatic vfloat4 encode_texel_lns(
142cc1dc7a3Sopenharmony_ci	vfloat4 data,
143cc1dc7a3Sopenharmony_ci	vmask4 lns_mask
144cc1dc7a3Sopenharmony_ci) {
145cc1dc7a3Sopenharmony_ci	vfloat4 datav_unorm = data * 65535.0f;
146cc1dc7a3Sopenharmony_ci	vfloat4 datav_lns = float_to_lns(data);
147cc1dc7a3Sopenharmony_ci	return select(datav_unorm, datav_lns, lns_mask);
148cc1dc7a3Sopenharmony_ci}
149cc1dc7a3Sopenharmony_ci
150cc1dc7a3Sopenharmony_ci/* See header for documentation. */
151cc1dc7a3Sopenharmony_civoid load_image_block(
152cc1dc7a3Sopenharmony_ci	astcenc_profile decode_mode,
153cc1dc7a3Sopenharmony_ci	const astcenc_image& img,
154cc1dc7a3Sopenharmony_ci	image_block& blk,
155cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
156cc1dc7a3Sopenharmony_ci	unsigned int xpos,
157cc1dc7a3Sopenharmony_ci	unsigned int ypos,
158cc1dc7a3Sopenharmony_ci	unsigned int zpos,
159cc1dc7a3Sopenharmony_ci	const astcenc_swizzle& swz
160cc1dc7a3Sopenharmony_ci) {
161cc1dc7a3Sopenharmony_ci	unsigned int xsize = img.dim_x;
162cc1dc7a3Sopenharmony_ci	unsigned int ysize = img.dim_y;
163cc1dc7a3Sopenharmony_ci	unsigned int zsize = img.dim_z;
164cc1dc7a3Sopenharmony_ci
165cc1dc7a3Sopenharmony_ci	blk.xpos = xpos;
166cc1dc7a3Sopenharmony_ci	blk.ypos = ypos;
167cc1dc7a3Sopenharmony_ci	blk.zpos = zpos;
168cc1dc7a3Sopenharmony_ci
169cc1dc7a3Sopenharmony_ci	// True if any non-identity swizzle
170cc1dc7a3Sopenharmony_ci	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
171cc1dc7a3Sopenharmony_ci	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
172cc1dc7a3Sopenharmony_ci
173cc1dc7a3Sopenharmony_ci	int idx = 0;
174cc1dc7a3Sopenharmony_ci
175cc1dc7a3Sopenharmony_ci	vfloat4 data_min(1e38f);
176cc1dc7a3Sopenharmony_ci	vfloat4 data_mean(0.0f);
177cc1dc7a3Sopenharmony_ci	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
178cc1dc7a3Sopenharmony_ci	vfloat4 data_max(-1e38f);
179cc1dc7a3Sopenharmony_ci	vmask4 grayscalev(true);
180cc1dc7a3Sopenharmony_ci
181cc1dc7a3Sopenharmony_ci	// This works because we impose the same choice everywhere during encode
182cc1dc7a3Sopenharmony_ci	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
183cc1dc7a3Sopenharmony_ci	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
184cc1dc7a3Sopenharmony_ci	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
185cc1dc7a3Sopenharmony_ci	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
186cc1dc7a3Sopenharmony_ci	vmask4 lns_mask = use_lns != vint4::zero();
187cc1dc7a3Sopenharmony_ci
188cc1dc7a3Sopenharmony_ci	// Set up the function pointers for loading pipeline as needed
189cc1dc7a3Sopenharmony_ci	pixel_loader loader = load_texel_u8;
190cc1dc7a3Sopenharmony_ci	if (img.data_type == ASTCENC_TYPE_F16)
191cc1dc7a3Sopenharmony_ci	{
192cc1dc7a3Sopenharmony_ci		loader = load_texel_f16;
193cc1dc7a3Sopenharmony_ci	}
194cc1dc7a3Sopenharmony_ci	else if  (img.data_type == ASTCENC_TYPE_F32)
195cc1dc7a3Sopenharmony_ci	{
196cc1dc7a3Sopenharmony_ci		loader = load_texel_f32;
197cc1dc7a3Sopenharmony_ci	}
198cc1dc7a3Sopenharmony_ci
199cc1dc7a3Sopenharmony_ci	pixel_swizzler swizzler = swz_texel_skip;
200cc1dc7a3Sopenharmony_ci	if (needs_swz)
201cc1dc7a3Sopenharmony_ci	{
202cc1dc7a3Sopenharmony_ci		swizzler = swz_texel;
203cc1dc7a3Sopenharmony_ci	}
204cc1dc7a3Sopenharmony_ci
205cc1dc7a3Sopenharmony_ci	pixel_converter converter = encode_texel_unorm;
206cc1dc7a3Sopenharmony_ci	if (any(lns_mask))
207cc1dc7a3Sopenharmony_ci	{
208cc1dc7a3Sopenharmony_ci		converter = encode_texel_lns;
209cc1dc7a3Sopenharmony_ci	}
210cc1dc7a3Sopenharmony_ci
211cc1dc7a3Sopenharmony_ci	for (unsigned int z = 0; z < bsd.zdim; z++)
212cc1dc7a3Sopenharmony_ci	{
213cc1dc7a3Sopenharmony_ci		unsigned int zi = astc::min(zpos + z, zsize - 1);
214cc1dc7a3Sopenharmony_ci		void* plane = img.data[zi];
215cc1dc7a3Sopenharmony_ci
216cc1dc7a3Sopenharmony_ci		for (unsigned int y = 0; y < bsd.ydim; y++)
217cc1dc7a3Sopenharmony_ci		{
218cc1dc7a3Sopenharmony_ci			unsigned int yi = astc::min(ypos + y, ysize - 1);
219cc1dc7a3Sopenharmony_ci
220cc1dc7a3Sopenharmony_ci			for (unsigned int x = 0; x < bsd.xdim; x++)
221cc1dc7a3Sopenharmony_ci			{
222cc1dc7a3Sopenharmony_ci				unsigned int xi = astc::min(xpos + x, xsize - 1);
223cc1dc7a3Sopenharmony_ci
224cc1dc7a3Sopenharmony_ci				vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
225cc1dc7a3Sopenharmony_ci				datav = swizzler(datav, swz);
226cc1dc7a3Sopenharmony_ci				datav = converter(datav, lns_mask);
227cc1dc7a3Sopenharmony_ci
228cc1dc7a3Sopenharmony_ci				// Compute block metadata
229cc1dc7a3Sopenharmony_ci				data_min = min(data_min, datav);
230cc1dc7a3Sopenharmony_ci				data_mean += datav * data_mean_scale;
231cc1dc7a3Sopenharmony_ci				data_max = max(data_max, datav);
232cc1dc7a3Sopenharmony_ci
233cc1dc7a3Sopenharmony_ci				grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
234cc1dc7a3Sopenharmony_ci
235cc1dc7a3Sopenharmony_ci				blk.data_r[idx] = datav.lane<0>();
236cc1dc7a3Sopenharmony_ci				blk.data_g[idx] = datav.lane<1>();
237cc1dc7a3Sopenharmony_ci				blk.data_b[idx] = datav.lane<2>();
238cc1dc7a3Sopenharmony_ci				blk.data_a[idx] = datav.lane<3>();
239cc1dc7a3Sopenharmony_ci
240cc1dc7a3Sopenharmony_ci				blk.rgb_lns[idx] = rgb_lns;
241cc1dc7a3Sopenharmony_ci				blk.alpha_lns[idx] = a_lns;
242cc1dc7a3Sopenharmony_ci
243cc1dc7a3Sopenharmony_ci				idx++;
244cc1dc7a3Sopenharmony_ci			}
245cc1dc7a3Sopenharmony_ci		}
246cc1dc7a3Sopenharmony_ci	}
247cc1dc7a3Sopenharmony_ci
248cc1dc7a3Sopenharmony_ci	// Reverse the encoding so we store origin block in the original format
249cc1dc7a3Sopenharmony_ci	vfloat4 data_enc = blk.texel(0);
250cc1dc7a3Sopenharmony_ci	vfloat4 data_enc_unorm = data_enc / 65535.0f;
251cc1dc7a3Sopenharmony_ci	vfloat4 data_enc_lns = vfloat4::zero();
252cc1dc7a3Sopenharmony_ci
253cc1dc7a3Sopenharmony_ci	if (rgb_lns || a_lns)
254cc1dc7a3Sopenharmony_ci	{
255cc1dc7a3Sopenharmony_ci		data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
256cc1dc7a3Sopenharmony_ci	}
257cc1dc7a3Sopenharmony_ci
258cc1dc7a3Sopenharmony_ci	blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
259cc1dc7a3Sopenharmony_ci
260cc1dc7a3Sopenharmony_ci	// Store block metadata
261cc1dc7a3Sopenharmony_ci	blk.data_min = data_min;
262cc1dc7a3Sopenharmony_ci	blk.data_mean = data_mean;
263cc1dc7a3Sopenharmony_ci	blk.data_max = data_max;
264cc1dc7a3Sopenharmony_ci	blk.grayscale = all(grayscalev);
265cc1dc7a3Sopenharmony_ci}
266cc1dc7a3Sopenharmony_ci
267cc1dc7a3Sopenharmony_ci/* See header for documentation. */
268cc1dc7a3Sopenharmony_civoid load_image_block_fast_ldr(
269cc1dc7a3Sopenharmony_ci	astcenc_profile decode_mode,
270cc1dc7a3Sopenharmony_ci	const astcenc_image& img,
271cc1dc7a3Sopenharmony_ci	image_block& blk,
272cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
273cc1dc7a3Sopenharmony_ci	unsigned int xpos,
274cc1dc7a3Sopenharmony_ci	unsigned int ypos,
275cc1dc7a3Sopenharmony_ci	unsigned int zpos,
276cc1dc7a3Sopenharmony_ci	const astcenc_swizzle& swz
277cc1dc7a3Sopenharmony_ci) {
278cc1dc7a3Sopenharmony_ci	(void)swz;
279cc1dc7a3Sopenharmony_ci	(void)decode_mode;
280cc1dc7a3Sopenharmony_ci
281cc1dc7a3Sopenharmony_ci	unsigned int xsize = img.dim_x;
282cc1dc7a3Sopenharmony_ci	unsigned int ysize = img.dim_y;
283cc1dc7a3Sopenharmony_ci	unsigned int stride = img.dim_stride;
284cc1dc7a3Sopenharmony_ci	blk.xpos = xpos;
285cc1dc7a3Sopenharmony_ci	blk.ypos = ypos;
286cc1dc7a3Sopenharmony_ci	blk.zpos = zpos;
287cc1dc7a3Sopenharmony_ci
288cc1dc7a3Sopenharmony_ci	vfloat4 data_min(1e38f);
289cc1dc7a3Sopenharmony_ci	vfloat4 data_mean = vfloat4::zero();
290cc1dc7a3Sopenharmony_ci	vfloat4 data_max(-1e38f);
291cc1dc7a3Sopenharmony_ci	vmask4 grayscalev(true);
292cc1dc7a3Sopenharmony_ci	int idx = 0;
293cc1dc7a3Sopenharmony_ci
294cc1dc7a3Sopenharmony_ci	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
295cc1dc7a3Sopenharmony_ci	for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
296cc1dc7a3Sopenharmony_ci	{
297cc1dc7a3Sopenharmony_ci		unsigned int yi = astc::min(y, ysize - 1);
298cc1dc7a3Sopenharmony_ci
299cc1dc7a3Sopenharmony_ci		for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
300cc1dc7a3Sopenharmony_ci		{
301cc1dc7a3Sopenharmony_ci			unsigned int xi = astc::min(x, xsize - 1);
302cc1dc7a3Sopenharmony_ci
303cc1dc7a3Sopenharmony_ci			vint4 datavi = vint4(plane + (4 * stride * yi) + (4 * xi));
304cc1dc7a3Sopenharmony_ci			vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
305cc1dc7a3Sopenharmony_ci
306cc1dc7a3Sopenharmony_ci			// Compute block metadata
307cc1dc7a3Sopenharmony_ci			data_min = min(data_min, datav);
308cc1dc7a3Sopenharmony_ci			data_mean += datav;
309cc1dc7a3Sopenharmony_ci			data_max = max(data_max, datav);
310cc1dc7a3Sopenharmony_ci
311cc1dc7a3Sopenharmony_ci			grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
312cc1dc7a3Sopenharmony_ci
313cc1dc7a3Sopenharmony_ci			blk.data_r[idx] = datav.lane<0>();
314cc1dc7a3Sopenharmony_ci			blk.data_g[idx] = datav.lane<1>();
315cc1dc7a3Sopenharmony_ci			blk.data_b[idx] = datav.lane<2>();
316cc1dc7a3Sopenharmony_ci			blk.data_a[idx] = datav.lane<3>();
317cc1dc7a3Sopenharmony_ci
318cc1dc7a3Sopenharmony_ci			idx++;
319cc1dc7a3Sopenharmony_ci		}
320cc1dc7a3Sopenharmony_ci	}
321cc1dc7a3Sopenharmony_ci
322cc1dc7a3Sopenharmony_ci	// Reverse the encoding so we store origin block in the original format
323cc1dc7a3Sopenharmony_ci	blk.origin_texel = blk.texel(0) / 65535.0f;
324cc1dc7a3Sopenharmony_ci
325cc1dc7a3Sopenharmony_ci	// Store block metadata
326cc1dc7a3Sopenharmony_ci	blk.rgb_lns[0] = 0;
327cc1dc7a3Sopenharmony_ci	blk.alpha_lns[0] = 0;
328cc1dc7a3Sopenharmony_ci	blk.data_min = data_min;
329cc1dc7a3Sopenharmony_ci	blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
330cc1dc7a3Sopenharmony_ci	blk.data_max = data_max;
331cc1dc7a3Sopenharmony_ci	blk.grayscale = all(grayscalev);
332cc1dc7a3Sopenharmony_ci}
333cc1dc7a3Sopenharmony_ci
334cc1dc7a3Sopenharmony_ci/* See header for documentation. */
335cc1dc7a3Sopenharmony_civoid store_image_block(
336cc1dc7a3Sopenharmony_ci	astcenc_image& img,
337cc1dc7a3Sopenharmony_ci	const image_block& blk,
338cc1dc7a3Sopenharmony_ci	const block_size_descriptor& bsd,
339cc1dc7a3Sopenharmony_ci	unsigned int xpos,
340cc1dc7a3Sopenharmony_ci	unsigned int ypos,
341cc1dc7a3Sopenharmony_ci	unsigned int zpos,
342cc1dc7a3Sopenharmony_ci	const astcenc_swizzle& swz
343cc1dc7a3Sopenharmony_ci) {
344cc1dc7a3Sopenharmony_ci	unsigned int x_size = img.dim_x;
345cc1dc7a3Sopenharmony_ci	unsigned int x_start = xpos;
346cc1dc7a3Sopenharmony_ci	unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
347cc1dc7a3Sopenharmony_ci	unsigned int x_count = x_end - x_start;
348cc1dc7a3Sopenharmony_ci	unsigned int x_nudge = bsd.xdim - x_count;
349cc1dc7a3Sopenharmony_ci
350cc1dc7a3Sopenharmony_ci	unsigned int y_size = img.dim_y;
351cc1dc7a3Sopenharmony_ci	unsigned int y_start = ypos;
352cc1dc7a3Sopenharmony_ci	unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
353cc1dc7a3Sopenharmony_ci	unsigned int y_count = y_end - y_start;
354cc1dc7a3Sopenharmony_ci	unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
355cc1dc7a3Sopenharmony_ci
356cc1dc7a3Sopenharmony_ci	unsigned int z_size = img.dim_z;
357cc1dc7a3Sopenharmony_ci	unsigned int z_start = zpos;
358cc1dc7a3Sopenharmony_ci	unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
359cc1dc7a3Sopenharmony_ci
360cc1dc7a3Sopenharmony_ci	// True if any non-identity swizzle
361cc1dc7a3Sopenharmony_ci	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
362cc1dc7a3Sopenharmony_ci	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
363cc1dc7a3Sopenharmony_ci
364cc1dc7a3Sopenharmony_ci	// True if any swizzle uses Z reconstruct
365cc1dc7a3Sopenharmony_ci	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
366cc1dc7a3Sopenharmony_ci	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
367cc1dc7a3Sopenharmony_ci
368cc1dc7a3Sopenharmony_ci	int idx = 0;
369cc1dc7a3Sopenharmony_ci	if (img.data_type == ASTCENC_TYPE_U8)
370cc1dc7a3Sopenharmony_ci	{
371cc1dc7a3Sopenharmony_ci		for (unsigned int z = z_start; z < z_end; z++)
372cc1dc7a3Sopenharmony_ci		{
373cc1dc7a3Sopenharmony_ci			// Fetch the image plane
374cc1dc7a3Sopenharmony_ci			uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
375cc1dc7a3Sopenharmony_ci
376cc1dc7a3Sopenharmony_ci			for (unsigned int y = y_start; y < y_end; y++)
377cc1dc7a3Sopenharmony_ci			{
378cc1dc7a3Sopenharmony_ci				uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
379cc1dc7a3Sopenharmony_ci
380cc1dc7a3Sopenharmony_ci				for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
381cc1dc7a3Sopenharmony_ci				{
382cc1dc7a3Sopenharmony_ci					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
383cc1dc7a3Sopenharmony_ci					unsigned int used_texels = astc::min(x_count - x, max_texels);
384cc1dc7a3Sopenharmony_ci
385cc1dc7a3Sopenharmony_ci					// Unaligned load as rows are not always SIMD_WIDTH long
386cc1dc7a3Sopenharmony_ci					vfloat data_r(blk.data_r + idx);
387cc1dc7a3Sopenharmony_ci					vfloat data_g(blk.data_g + idx);
388cc1dc7a3Sopenharmony_ci					vfloat data_b(blk.data_b + idx);
389cc1dc7a3Sopenharmony_ci					vfloat data_a(blk.data_a + idx);
390cc1dc7a3Sopenharmony_ci
391cc1dc7a3Sopenharmony_ci					vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
392cc1dc7a3Sopenharmony_ci					vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
393cc1dc7a3Sopenharmony_ci					vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
394cc1dc7a3Sopenharmony_ci					vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
395cc1dc7a3Sopenharmony_ci
396cc1dc7a3Sopenharmony_ci					if (needs_swz)
397cc1dc7a3Sopenharmony_ci					{
398cc1dc7a3Sopenharmony_ci						vint swizzle_table[7];
399cc1dc7a3Sopenharmony_ci						swizzle_table[ASTCENC_SWZ_0] = vint(0);
400cc1dc7a3Sopenharmony_ci						swizzle_table[ASTCENC_SWZ_1] = vint(255);
401cc1dc7a3Sopenharmony_ci						swizzle_table[ASTCENC_SWZ_R] = data_ri;
402cc1dc7a3Sopenharmony_ci						swizzle_table[ASTCENC_SWZ_G] = data_gi;
403cc1dc7a3Sopenharmony_ci						swizzle_table[ASTCENC_SWZ_B] = data_bi;
404cc1dc7a3Sopenharmony_ci						swizzle_table[ASTCENC_SWZ_A] = data_ai;
405cc1dc7a3Sopenharmony_ci
406cc1dc7a3Sopenharmony_ci						if (needs_z)
407cc1dc7a3Sopenharmony_ci						{
408cc1dc7a3Sopenharmony_ci							vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
409cc1dc7a3Sopenharmony_ci							vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
410cc1dc7a3Sopenharmony_ci							vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
411cc1dc7a3Sopenharmony_ci							data_z = max(data_z, 0.0f);
412cc1dc7a3Sopenharmony_ci							data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
413cc1dc7a3Sopenharmony_ci
414cc1dc7a3Sopenharmony_ci							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
415cc1dc7a3Sopenharmony_ci						}
416cc1dc7a3Sopenharmony_ci
417cc1dc7a3Sopenharmony_ci						data_ri = swizzle_table[swz.r];
418cc1dc7a3Sopenharmony_ci						data_gi = swizzle_table[swz.g];
419cc1dc7a3Sopenharmony_ci						data_bi = swizzle_table[swz.b];
420cc1dc7a3Sopenharmony_ci						data_ai = swizzle_table[swz.a];
421cc1dc7a3Sopenharmony_ci					}
422cc1dc7a3Sopenharmony_ci
423cc1dc7a3Sopenharmony_ci					// Errors are NaN encoded - convert to magenta error color
424cc1dc7a3Sopenharmony_ci					// Branch is OK here - it is almost never true so predicts well
425cc1dc7a3Sopenharmony_ci					vmask nan_mask = data_r != data_r;
426cc1dc7a3Sopenharmony_ci					if (any(nan_mask))
427cc1dc7a3Sopenharmony_ci					{
428cc1dc7a3Sopenharmony_ci						data_ri = select(data_ri, vint(0xFF), nan_mask);
429cc1dc7a3Sopenharmony_ci						data_gi = select(data_gi, vint(0x00), nan_mask);
430cc1dc7a3Sopenharmony_ci						data_bi = select(data_bi, vint(0xFF), nan_mask);
431cc1dc7a3Sopenharmony_ci						data_ai = select(data_ai, vint(0xFF), nan_mask);
432cc1dc7a3Sopenharmony_ci					}
433cc1dc7a3Sopenharmony_ci
434cc1dc7a3Sopenharmony_ci					vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
435cc1dc7a3Sopenharmony_ci					vmask store_mask = vint::lane_id() < vint(used_texels);
436cc1dc7a3Sopenharmony_ci					store_lanes_masked(data8_row, data_rgbai, store_mask);
437cc1dc7a3Sopenharmony_ci
438cc1dc7a3Sopenharmony_ci					data8_row += ASTCENC_SIMD_WIDTH * 4;
439cc1dc7a3Sopenharmony_ci					idx += used_texels;
440cc1dc7a3Sopenharmony_ci				}
441cc1dc7a3Sopenharmony_ci				idx += x_nudge;
442cc1dc7a3Sopenharmony_ci			}
443cc1dc7a3Sopenharmony_ci			idx += y_nudge;
444cc1dc7a3Sopenharmony_ci		}
445cc1dc7a3Sopenharmony_ci	}
446cc1dc7a3Sopenharmony_ci	else if (img.data_type == ASTCENC_TYPE_F16)
447cc1dc7a3Sopenharmony_ci	{
448cc1dc7a3Sopenharmony_ci		for (unsigned int z = z_start; z < z_end; z++)
449cc1dc7a3Sopenharmony_ci		{
450cc1dc7a3Sopenharmony_ci			// Fetch the image plane
451cc1dc7a3Sopenharmony_ci			uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
452cc1dc7a3Sopenharmony_ci
453cc1dc7a3Sopenharmony_ci			for (unsigned int y = y_start; y < y_end; y++)
454cc1dc7a3Sopenharmony_ci			{
455cc1dc7a3Sopenharmony_ci				uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
456cc1dc7a3Sopenharmony_ci
457cc1dc7a3Sopenharmony_ci				for (unsigned int x = 0; x < x_count; x++)
458cc1dc7a3Sopenharmony_ci				{
459cc1dc7a3Sopenharmony_ci					vint4 color;
460cc1dc7a3Sopenharmony_ci
461cc1dc7a3Sopenharmony_ci					// NaNs are handled inline - no need to special case
462cc1dc7a3Sopenharmony_ci					if (needs_swz)
463cc1dc7a3Sopenharmony_ci					{
464cc1dc7a3Sopenharmony_ci						float data[7];
465cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_0] = 0.0f;
466cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_1] = 1.0f;
467cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_R] = blk.data_r[idx];
468cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_G] = blk.data_g[idx];
469cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_B] = blk.data_b[idx];
470cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_A] = blk.data_a[idx];
471cc1dc7a3Sopenharmony_ci
472cc1dc7a3Sopenharmony_ci						if (needs_z)
473cc1dc7a3Sopenharmony_ci						{
474cc1dc7a3Sopenharmony_ci							float xN = (data[0] * 2.0f) - 1.0f;
475cc1dc7a3Sopenharmony_ci							float yN = (data[3] * 2.0f) - 1.0f;
476cc1dc7a3Sopenharmony_ci							float zN = 1.0f - xN * xN - yN * yN;
477cc1dc7a3Sopenharmony_ci							if (zN < 0.0f)
478cc1dc7a3Sopenharmony_ci							{
479cc1dc7a3Sopenharmony_ci								zN = 0.0f;
480cc1dc7a3Sopenharmony_ci							}
481cc1dc7a3Sopenharmony_ci							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
482cc1dc7a3Sopenharmony_ci						}
483cc1dc7a3Sopenharmony_ci
484cc1dc7a3Sopenharmony_ci						vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
485cc1dc7a3Sopenharmony_ci						color = float_to_float16(colorf);
486cc1dc7a3Sopenharmony_ci					}
487cc1dc7a3Sopenharmony_ci					else
488cc1dc7a3Sopenharmony_ci					{
489cc1dc7a3Sopenharmony_ci						vfloat4 colorf = blk.texel(idx);
490cc1dc7a3Sopenharmony_ci						color = float_to_float16(colorf);
491cc1dc7a3Sopenharmony_ci					}
492cc1dc7a3Sopenharmony_ci
493cc1dc7a3Sopenharmony_ci					// TODO: Vectorize with store N shorts?
494cc1dc7a3Sopenharmony_ci					data16_row[0] = static_cast<uint16_t>(color.lane<0>());
495cc1dc7a3Sopenharmony_ci					data16_row[1] = static_cast<uint16_t>(color.lane<1>());
496cc1dc7a3Sopenharmony_ci					data16_row[2] = static_cast<uint16_t>(color.lane<2>());
497cc1dc7a3Sopenharmony_ci					data16_row[3] = static_cast<uint16_t>(color.lane<3>());
498cc1dc7a3Sopenharmony_ci					data16_row += 4;
499cc1dc7a3Sopenharmony_ci					idx++;
500cc1dc7a3Sopenharmony_ci				}
501cc1dc7a3Sopenharmony_ci				idx += x_nudge;
502cc1dc7a3Sopenharmony_ci			}
503cc1dc7a3Sopenharmony_ci			idx += y_nudge;
504cc1dc7a3Sopenharmony_ci		}
505cc1dc7a3Sopenharmony_ci	}
506cc1dc7a3Sopenharmony_ci	else // if (img.data_type == ASTCENC_TYPE_F32)
507cc1dc7a3Sopenharmony_ci	{
508cc1dc7a3Sopenharmony_ci		assert(img.data_type == ASTCENC_TYPE_F32);
509cc1dc7a3Sopenharmony_ci
510cc1dc7a3Sopenharmony_ci		for (unsigned int z = z_start; z < z_end; z++)
511cc1dc7a3Sopenharmony_ci		{
512cc1dc7a3Sopenharmony_ci			// Fetch the image plane
513cc1dc7a3Sopenharmony_ci			float* data32 = static_cast<float*>(img.data[z]);
514cc1dc7a3Sopenharmony_ci
515cc1dc7a3Sopenharmony_ci			for (unsigned int y = y_start; y < y_end; y++)
516cc1dc7a3Sopenharmony_ci			{
517cc1dc7a3Sopenharmony_ci				float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
518cc1dc7a3Sopenharmony_ci
519cc1dc7a3Sopenharmony_ci				for (unsigned int x = 0; x < x_count; x++)
520cc1dc7a3Sopenharmony_ci				{
521cc1dc7a3Sopenharmony_ci					vfloat4 color = blk.texel(idx);
522cc1dc7a3Sopenharmony_ci
523cc1dc7a3Sopenharmony_ci					// NaNs are handled inline - no need to special case
524cc1dc7a3Sopenharmony_ci					if (needs_swz)
525cc1dc7a3Sopenharmony_ci					{
526cc1dc7a3Sopenharmony_ci						float data[7];
527cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_0] = 0.0f;
528cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_1] = 1.0f;
529cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_R] = color.lane<0>();
530cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_G] = color.lane<1>();
531cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_B] = color.lane<2>();
532cc1dc7a3Sopenharmony_ci						data[ASTCENC_SWZ_A] = color.lane<3>();
533cc1dc7a3Sopenharmony_ci
534cc1dc7a3Sopenharmony_ci						if (needs_z)
535cc1dc7a3Sopenharmony_ci						{
536cc1dc7a3Sopenharmony_ci							float xN = (data[0] * 2.0f) - 1.0f;
537cc1dc7a3Sopenharmony_ci							float yN = (data[3] * 2.0f) - 1.0f;
538cc1dc7a3Sopenharmony_ci							float zN = 1.0f - xN * xN - yN * yN;
539cc1dc7a3Sopenharmony_ci							if (zN < 0.0f)
540cc1dc7a3Sopenharmony_ci							{
541cc1dc7a3Sopenharmony_ci								zN = 0.0f;
542cc1dc7a3Sopenharmony_ci							}
543cc1dc7a3Sopenharmony_ci							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
544cc1dc7a3Sopenharmony_ci						}
545cc1dc7a3Sopenharmony_ci
546cc1dc7a3Sopenharmony_ci						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
547cc1dc7a3Sopenharmony_ci					}
548cc1dc7a3Sopenharmony_ci
549cc1dc7a3Sopenharmony_ci					store(color, data32_row);
550cc1dc7a3Sopenharmony_ci					data32_row += 4;
551cc1dc7a3Sopenharmony_ci					idx++;
552cc1dc7a3Sopenharmony_ci				}
553cc1dc7a3Sopenharmony_ci				idx += x_nudge;
554cc1dc7a3Sopenharmony_ci			}
555cc1dc7a3Sopenharmony_ci			idx += y_nudge;
556cc1dc7a3Sopenharmony_ci		}
557cc1dc7a3Sopenharmony_ci	}
558cc1dc7a3Sopenharmony_ci}
559