1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0
2cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
3cc1dc7a3Sopenharmony_ci// Copyright 2011-2022 Arm Limited
4cc1dc7a3Sopenharmony_ci//
5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy
7cc1dc7a3Sopenharmony_ci// of the License at:
8cc1dc7a3Sopenharmony_ci//
9cc1dc7a3Sopenharmony_ci//     http://www.apache.org/licenses/LICENSE-2.0
10cc1dc7a3Sopenharmony_ci//
11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software
12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations
15cc1dc7a3Sopenharmony_ci// under the License.
16cc1dc7a3Sopenharmony_ci// ----------------------------------------------------------------------------
17cc1dc7a3Sopenharmony_ci
18cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY)
19cc1dc7a3Sopenharmony_ci
20cc1dc7a3Sopenharmony_ci/**
21cc1dc7a3Sopenharmony_ci * @brief Functions to calculate variance per component in a NxN footprint.
22cc1dc7a3Sopenharmony_ci *
23cc1dc7a3Sopenharmony_ci * We need N to be parametric, so the routine below uses summed area tables in order to execute in
24cc1dc7a3Sopenharmony_ci * O(1) time independent of how big N is.
25cc1dc7a3Sopenharmony_ci *
26cc1dc7a3Sopenharmony_ci * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
27cc1dc7a3Sopenharmony_ci * perform a binary reduction, and then distributes the results. This method means that there is no
28cc1dc7a3Sopenharmony_ci * serial dependency between a given element and the next one, and also significantly improves
29cc1dc7a3Sopenharmony_ci * numerical stability allowing us to use floats rather than doubles.
30cc1dc7a3Sopenharmony_ci */
31cc1dc7a3Sopenharmony_ci
32cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h"
33cc1dc7a3Sopenharmony_ci
34cc1dc7a3Sopenharmony_ci#include <cassert>
35cc1dc7a3Sopenharmony_ci
36cc1dc7a3Sopenharmony_ci/**
37cc1dc7a3Sopenharmony_ci * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
38cc1dc7a3Sopenharmony_ci *
39cc1dc7a3Sopenharmony_ci * This will take an input array of the form:
40cc1dc7a3Sopenharmony_ci *     v0, v1, v2, ...
41cc1dc7a3Sopenharmony_ci * ... and modify in-place to turn it into a prefix-sum array of the form:
42cc1dc7a3Sopenharmony_ci *     v0, v0+v1, v0+v1+v2, ...
43cc1dc7a3Sopenharmony_ci *
44cc1dc7a3Sopenharmony_ci * @param d      The array to prefix-sum.
45cc1dc7a3Sopenharmony_ci * @param items  The number of items in the array.
46cc1dc7a3Sopenharmony_ci * @param stride The item spacing in the array; i.e. dense arrays should use 1.
47cc1dc7a3Sopenharmony_ci */
48cc1dc7a3Sopenharmony_cistatic void brent_kung_prefix_sum(
49cc1dc7a3Sopenharmony_ci	vfloat4* d,
50cc1dc7a3Sopenharmony_ci	size_t items,
51cc1dc7a3Sopenharmony_ci	int stride
52cc1dc7a3Sopenharmony_ci) {
53cc1dc7a3Sopenharmony_ci	if (items < 2)
54cc1dc7a3Sopenharmony_ci		return;
55cc1dc7a3Sopenharmony_ci
56cc1dc7a3Sopenharmony_ci	size_t lc_stride = 2;
57cc1dc7a3Sopenharmony_ci	size_t log2_stride = 1;
58cc1dc7a3Sopenharmony_ci
59cc1dc7a3Sopenharmony_ci	// The reduction-tree loop
60cc1dc7a3Sopenharmony_ci	do {
61cc1dc7a3Sopenharmony_ci		size_t step = lc_stride >> 1;
62cc1dc7a3Sopenharmony_ci		size_t start = lc_stride - 1;
63cc1dc7a3Sopenharmony_ci		size_t iters = items >> log2_stride;
64cc1dc7a3Sopenharmony_ci
65cc1dc7a3Sopenharmony_ci		vfloat4 *da = d + (start * stride);
66cc1dc7a3Sopenharmony_ci		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
67cc1dc7a3Sopenharmony_ci		size_t ofs_stride = stride << log2_stride;
68cc1dc7a3Sopenharmony_ci
69cc1dc7a3Sopenharmony_ci		while (iters)
70cc1dc7a3Sopenharmony_ci		{
71cc1dc7a3Sopenharmony_ci			*da = *da + da[ofs];
72cc1dc7a3Sopenharmony_ci			da += ofs_stride;
73cc1dc7a3Sopenharmony_ci			iters--;
74cc1dc7a3Sopenharmony_ci		}
75cc1dc7a3Sopenharmony_ci
76cc1dc7a3Sopenharmony_ci		log2_stride += 1;
77cc1dc7a3Sopenharmony_ci		lc_stride <<= 1;
78cc1dc7a3Sopenharmony_ci	} while (lc_stride <= items);
79cc1dc7a3Sopenharmony_ci
80cc1dc7a3Sopenharmony_ci	// The expansion-tree loop
81cc1dc7a3Sopenharmony_ci	do {
82cc1dc7a3Sopenharmony_ci		log2_stride -= 1;
83cc1dc7a3Sopenharmony_ci		lc_stride >>= 1;
84cc1dc7a3Sopenharmony_ci
85cc1dc7a3Sopenharmony_ci		size_t step = lc_stride >> 1;
86cc1dc7a3Sopenharmony_ci		size_t start = step + lc_stride - 1;
87cc1dc7a3Sopenharmony_ci		size_t iters = (items - step) >> log2_stride;
88cc1dc7a3Sopenharmony_ci
89cc1dc7a3Sopenharmony_ci		vfloat4 *da = d + (start * stride);
90cc1dc7a3Sopenharmony_ci		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
91cc1dc7a3Sopenharmony_ci		size_t ofs_stride = stride << log2_stride;
92cc1dc7a3Sopenharmony_ci
93cc1dc7a3Sopenharmony_ci		while (iters)
94cc1dc7a3Sopenharmony_ci		{
95cc1dc7a3Sopenharmony_ci			*da = *da + da[ofs];
96cc1dc7a3Sopenharmony_ci			da += ofs_stride;
97cc1dc7a3Sopenharmony_ci			iters--;
98cc1dc7a3Sopenharmony_ci		}
99cc1dc7a3Sopenharmony_ci	} while (lc_stride > 2);
100cc1dc7a3Sopenharmony_ci}
101cc1dc7a3Sopenharmony_ci
102cc1dc7a3Sopenharmony_ci/* See header for documentation. */
103cc1dc7a3Sopenharmony_civoid compute_pixel_region_variance(
104cc1dc7a3Sopenharmony_ci	astcenc_contexti& ctx,
105cc1dc7a3Sopenharmony_ci	const pixel_region_args& arg
106cc1dc7a3Sopenharmony_ci) {
107cc1dc7a3Sopenharmony_ci	// Unpack the memory structure into local variables
108cc1dc7a3Sopenharmony_ci	const astcenc_image* img = arg.img;
109cc1dc7a3Sopenharmony_ci	astcenc_swizzle swz = arg.swz;
110cc1dc7a3Sopenharmony_ci	bool have_z = arg.have_z;
111cc1dc7a3Sopenharmony_ci
112cc1dc7a3Sopenharmony_ci	int size_x = arg.size_x;
113cc1dc7a3Sopenharmony_ci	int size_y = arg.size_y;
114cc1dc7a3Sopenharmony_ci	int size_z = arg.size_z;
115cc1dc7a3Sopenharmony_ci
116cc1dc7a3Sopenharmony_ci	int offset_x = arg.offset_x;
117cc1dc7a3Sopenharmony_ci	int offset_y = arg.offset_y;
118cc1dc7a3Sopenharmony_ci	int offset_z = arg.offset_z;
119cc1dc7a3Sopenharmony_ci
120cc1dc7a3Sopenharmony_ci	int alpha_kernel_radius = arg.alpha_kernel_radius;
121cc1dc7a3Sopenharmony_ci
122cc1dc7a3Sopenharmony_ci	float*   input_alpha_averages = ctx.input_alpha_averages;
123cc1dc7a3Sopenharmony_ci	vfloat4* work_memory = arg.work_memory;
124cc1dc7a3Sopenharmony_ci
125cc1dc7a3Sopenharmony_ci	// Compute memory sizes and dimensions that we need
126cc1dc7a3Sopenharmony_ci	int kernel_radius = alpha_kernel_radius;
127cc1dc7a3Sopenharmony_ci	int kerneldim = 2 * kernel_radius + 1;
128cc1dc7a3Sopenharmony_ci	int kernel_radius_xy = kernel_radius;
129cc1dc7a3Sopenharmony_ci	int kernel_radius_z = have_z ? kernel_radius : 0;
130cc1dc7a3Sopenharmony_ci
131cc1dc7a3Sopenharmony_ci	int padsize_x = size_x + kerneldim;
132cc1dc7a3Sopenharmony_ci	int padsize_y = size_y + kerneldim;
133cc1dc7a3Sopenharmony_ci	int padsize_z = size_z + (have_z ? kerneldim : 0);
134cc1dc7a3Sopenharmony_ci	int sizeprod = padsize_x * padsize_y * padsize_z;
135cc1dc7a3Sopenharmony_ci
136cc1dc7a3Sopenharmony_ci	int zd_start = have_z ? 1 : 0;
137cc1dc7a3Sopenharmony_ci
138cc1dc7a3Sopenharmony_ci	vfloat4 *varbuf1 = work_memory;
139cc1dc7a3Sopenharmony_ci	vfloat4 *varbuf2 = work_memory + sizeprod;
140cc1dc7a3Sopenharmony_ci
141cc1dc7a3Sopenharmony_ci	// Scaling factors to apply to Y and Z for accesses into the work buffers
142cc1dc7a3Sopenharmony_ci	int yst = padsize_x;
143cc1dc7a3Sopenharmony_ci	int zst = padsize_x * padsize_y;
144cc1dc7a3Sopenharmony_ci
145cc1dc7a3Sopenharmony_ci	// Scaling factors to apply to Y and Z for accesses into result buffers
146cc1dc7a3Sopenharmony_ci	int ydt = img->dim_x;
147cc1dc7a3Sopenharmony_ci	int zdt = img->dim_x * img->dim_y;
148cc1dc7a3Sopenharmony_ci
149cc1dc7a3Sopenharmony_ci	// Macros to act as accessor functions for the work-memory
150cc1dc7a3Sopenharmony_ci	#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
151cc1dc7a3Sopenharmony_ci	#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
152cc1dc7a3Sopenharmony_ci
153cc1dc7a3Sopenharmony_ci	// Load N and N^2 values into the work buffers
154cc1dc7a3Sopenharmony_ci	if (img->data_type == ASTCENC_TYPE_U8)
155cc1dc7a3Sopenharmony_ci	{
156cc1dc7a3Sopenharmony_ci		// Swizzle data structure 4 = ZERO, 5 = ONE
157cc1dc7a3Sopenharmony_ci		uint8_t data[6];
158cc1dc7a3Sopenharmony_ci		data[ASTCENC_SWZ_0] = 0;
159cc1dc7a3Sopenharmony_ci		data[ASTCENC_SWZ_1] = 255;
160cc1dc7a3Sopenharmony_ci
161cc1dc7a3Sopenharmony_ci		for (int z = zd_start; z < padsize_z; z++)
162cc1dc7a3Sopenharmony_ci		{
163cc1dc7a3Sopenharmony_ci			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
164cc1dc7a3Sopenharmony_ci			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
165cc1dc7a3Sopenharmony_ci			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
166cc1dc7a3Sopenharmony_ci
167cc1dc7a3Sopenharmony_ci			for (int y = 1; y < padsize_y; y++)
168cc1dc7a3Sopenharmony_ci			{
169cc1dc7a3Sopenharmony_ci				int y_src = (y - 1) + offset_y - kernel_radius_xy;
170cc1dc7a3Sopenharmony_ci				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
171cc1dc7a3Sopenharmony_ci
172cc1dc7a3Sopenharmony_ci				for (int x = 1; x < padsize_x; x++)
173cc1dc7a3Sopenharmony_ci				{
174cc1dc7a3Sopenharmony_ci					int x_src = (x - 1) + offset_x - kernel_radius_xy;
175cc1dc7a3Sopenharmony_ci					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
176cc1dc7a3Sopenharmony_ci
177cc1dc7a3Sopenharmony_ci					data[0] = data8[(4 * img->dim_stride * y_src) + (4 * x_src    )];
178cc1dc7a3Sopenharmony_ci					data[1] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 1)];
179cc1dc7a3Sopenharmony_ci					data[2] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 2)];
180cc1dc7a3Sopenharmony_ci					data[3] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 3)];
181cc1dc7a3Sopenharmony_ci
182cc1dc7a3Sopenharmony_ci					uint8_t r = data[swz.r];
183cc1dc7a3Sopenharmony_ci					uint8_t g = data[swz.g];
184cc1dc7a3Sopenharmony_ci					uint8_t b = data[swz.b];
185cc1dc7a3Sopenharmony_ci					uint8_t a = data[swz.a];
186cc1dc7a3Sopenharmony_ci
187cc1dc7a3Sopenharmony_ci					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
188cc1dc7a3Sopenharmony_ci					                     g * (1.0f / 255.0f),
189cc1dc7a3Sopenharmony_ci					                     b * (1.0f / 255.0f),
190cc1dc7a3Sopenharmony_ci					                     a * (1.0f / 255.0f));
191cc1dc7a3Sopenharmony_ci
192cc1dc7a3Sopenharmony_ci					VARBUF1(z, y, x) = d;
193cc1dc7a3Sopenharmony_ci					VARBUF2(z, y, x) = d * d;
194cc1dc7a3Sopenharmony_ci				}
195cc1dc7a3Sopenharmony_ci			}
196cc1dc7a3Sopenharmony_ci		}
197cc1dc7a3Sopenharmony_ci	}
198cc1dc7a3Sopenharmony_ci	else if (img->data_type == ASTCENC_TYPE_F16)
199cc1dc7a3Sopenharmony_ci	{
200cc1dc7a3Sopenharmony_ci		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
201cc1dc7a3Sopenharmony_ci		uint16_t data[6];
202cc1dc7a3Sopenharmony_ci		data[ASTCENC_SWZ_0] = 0;
203cc1dc7a3Sopenharmony_ci		data[ASTCENC_SWZ_1] = 0x3C00;
204cc1dc7a3Sopenharmony_ci
205cc1dc7a3Sopenharmony_ci		for (int z = zd_start; z < padsize_z; z++)
206cc1dc7a3Sopenharmony_ci		{
207cc1dc7a3Sopenharmony_ci			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
208cc1dc7a3Sopenharmony_ci			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
209cc1dc7a3Sopenharmony_ci			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
210cc1dc7a3Sopenharmony_ci
211cc1dc7a3Sopenharmony_ci			for (int y = 1; y < padsize_y; y++)
212cc1dc7a3Sopenharmony_ci			{
213cc1dc7a3Sopenharmony_ci				int y_src = (y - 1) + offset_y - kernel_radius_xy;
214cc1dc7a3Sopenharmony_ci				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
215cc1dc7a3Sopenharmony_ci
216cc1dc7a3Sopenharmony_ci				for (int x = 1; x < padsize_x; x++)
217cc1dc7a3Sopenharmony_ci				{
218cc1dc7a3Sopenharmony_ci					int x_src = (x - 1) + offset_x - kernel_radius_xy;
219cc1dc7a3Sopenharmony_ci					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
220cc1dc7a3Sopenharmony_ci
221cc1dc7a3Sopenharmony_ci					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
222cc1dc7a3Sopenharmony_ci					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
223cc1dc7a3Sopenharmony_ci					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
224cc1dc7a3Sopenharmony_ci					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
225cc1dc7a3Sopenharmony_ci
226cc1dc7a3Sopenharmony_ci					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
227cc1dc7a3Sopenharmony_ci					vfloat4 d = float16_to_float(di);
228cc1dc7a3Sopenharmony_ci
229cc1dc7a3Sopenharmony_ci					VARBUF1(z, y, x) = d;
230cc1dc7a3Sopenharmony_ci					VARBUF2(z, y, x) = d * d;
231cc1dc7a3Sopenharmony_ci				}
232cc1dc7a3Sopenharmony_ci			}
233cc1dc7a3Sopenharmony_ci		}
234cc1dc7a3Sopenharmony_ci	}
235cc1dc7a3Sopenharmony_ci	else // if (img->data_type == ASTCENC_TYPE_F32)
236cc1dc7a3Sopenharmony_ci	{
237cc1dc7a3Sopenharmony_ci		assert(img->data_type == ASTCENC_TYPE_F32);
238cc1dc7a3Sopenharmony_ci
239cc1dc7a3Sopenharmony_ci		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
240cc1dc7a3Sopenharmony_ci		float data[6];
241cc1dc7a3Sopenharmony_ci		data[ASTCENC_SWZ_0] = 0.0f;
242cc1dc7a3Sopenharmony_ci		data[ASTCENC_SWZ_1] = 1.0f;
243cc1dc7a3Sopenharmony_ci
244cc1dc7a3Sopenharmony_ci		for (int z = zd_start; z < padsize_z; z++)
245cc1dc7a3Sopenharmony_ci		{
246cc1dc7a3Sopenharmony_ci			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
247cc1dc7a3Sopenharmony_ci			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
248cc1dc7a3Sopenharmony_ci			float* data32 = static_cast<float*>(img->data[z_src]);
249cc1dc7a3Sopenharmony_ci
250cc1dc7a3Sopenharmony_ci			for (int y = 1; y < padsize_y; y++)
251cc1dc7a3Sopenharmony_ci			{
252cc1dc7a3Sopenharmony_ci				int y_src = (y - 1) + offset_y - kernel_radius_xy;
253cc1dc7a3Sopenharmony_ci				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
254cc1dc7a3Sopenharmony_ci
255cc1dc7a3Sopenharmony_ci				for (int x = 1; x < padsize_x; x++)
256cc1dc7a3Sopenharmony_ci				{
257cc1dc7a3Sopenharmony_ci					int x_src = (x - 1) + offset_x - kernel_radius_xy;
258cc1dc7a3Sopenharmony_ci					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
259cc1dc7a3Sopenharmony_ci
260cc1dc7a3Sopenharmony_ci					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
261cc1dc7a3Sopenharmony_ci					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
262cc1dc7a3Sopenharmony_ci					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
263cc1dc7a3Sopenharmony_ci					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
264cc1dc7a3Sopenharmony_ci
265cc1dc7a3Sopenharmony_ci					float r = data[swz.r];
266cc1dc7a3Sopenharmony_ci					float g = data[swz.g];
267cc1dc7a3Sopenharmony_ci					float b = data[swz.b];
268cc1dc7a3Sopenharmony_ci					float a = data[swz.a];
269cc1dc7a3Sopenharmony_ci
270cc1dc7a3Sopenharmony_ci					vfloat4 d(r, g, b, a);
271cc1dc7a3Sopenharmony_ci
272cc1dc7a3Sopenharmony_ci					VARBUF1(z, y, x) = d;
273cc1dc7a3Sopenharmony_ci					VARBUF2(z, y, x) = d * d;
274cc1dc7a3Sopenharmony_ci				}
275cc1dc7a3Sopenharmony_ci			}
276cc1dc7a3Sopenharmony_ci		}
277cc1dc7a3Sopenharmony_ci	}
278cc1dc7a3Sopenharmony_ci
279cc1dc7a3Sopenharmony_ci	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
280cc1dc7a3Sopenharmony_ci	vfloat4 vbz = vfloat4::zero();
281cc1dc7a3Sopenharmony_ci	for (int z = 0; z < padsize_z; z++)
282cc1dc7a3Sopenharmony_ci	{
283cc1dc7a3Sopenharmony_ci		for (int y = 0; y < padsize_y; y++)
284cc1dc7a3Sopenharmony_ci		{
285cc1dc7a3Sopenharmony_ci			VARBUF1(z, y, 0) = vbz;
286cc1dc7a3Sopenharmony_ci			VARBUF2(z, y, 0) = vbz;
287cc1dc7a3Sopenharmony_ci		}
288cc1dc7a3Sopenharmony_ci
289cc1dc7a3Sopenharmony_ci		for (int x = 0; x < padsize_x; x++)
290cc1dc7a3Sopenharmony_ci		{
291cc1dc7a3Sopenharmony_ci			VARBUF1(z, 0, x) = vbz;
292cc1dc7a3Sopenharmony_ci			VARBUF2(z, 0, x) = vbz;
293cc1dc7a3Sopenharmony_ci		}
294cc1dc7a3Sopenharmony_ci	}
295cc1dc7a3Sopenharmony_ci
296cc1dc7a3Sopenharmony_ci	if (have_z)
297cc1dc7a3Sopenharmony_ci	{
298cc1dc7a3Sopenharmony_ci		for (int y = 0; y < padsize_y; y++)
299cc1dc7a3Sopenharmony_ci		{
300cc1dc7a3Sopenharmony_ci			for (int x = 0; x < padsize_x; x++)
301cc1dc7a3Sopenharmony_ci			{
302cc1dc7a3Sopenharmony_ci				VARBUF1(0, y, x) = vbz;
303cc1dc7a3Sopenharmony_ci				VARBUF2(0, y, x) = vbz;
304cc1dc7a3Sopenharmony_ci			}
305cc1dc7a3Sopenharmony_ci		}
306cc1dc7a3Sopenharmony_ci	}
307cc1dc7a3Sopenharmony_ci
308cc1dc7a3Sopenharmony_ci	// Generate summed-area tables for N and N^2; this is done in-place, using
309cc1dc7a3Sopenharmony_ci	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
310cc1dc7a3Sopenharmony_ci	for (int z = zd_start; z < padsize_z; z++)
311cc1dc7a3Sopenharmony_ci	{
312cc1dc7a3Sopenharmony_ci		for (int y = 1; y < padsize_y; y++)
313cc1dc7a3Sopenharmony_ci		{
314cc1dc7a3Sopenharmony_ci			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
315cc1dc7a3Sopenharmony_ci			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
316cc1dc7a3Sopenharmony_ci		}
317cc1dc7a3Sopenharmony_ci	}
318cc1dc7a3Sopenharmony_ci
319cc1dc7a3Sopenharmony_ci	for (int z = zd_start; z < padsize_z; z++)
320cc1dc7a3Sopenharmony_ci	{
321cc1dc7a3Sopenharmony_ci		for (int x = 1; x < padsize_x; x++)
322cc1dc7a3Sopenharmony_ci		{
323cc1dc7a3Sopenharmony_ci			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
324cc1dc7a3Sopenharmony_ci			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
325cc1dc7a3Sopenharmony_ci		}
326cc1dc7a3Sopenharmony_ci	}
327cc1dc7a3Sopenharmony_ci
328cc1dc7a3Sopenharmony_ci	if (have_z)
329cc1dc7a3Sopenharmony_ci	{
330cc1dc7a3Sopenharmony_ci		for (int y = 1; y < padsize_y; y++)
331cc1dc7a3Sopenharmony_ci		{
332cc1dc7a3Sopenharmony_ci			for (int x = 1; x < padsize_x; x++)
333cc1dc7a3Sopenharmony_ci			{
334cc1dc7a3Sopenharmony_ci				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
335cc1dc7a3Sopenharmony_ci				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
336cc1dc7a3Sopenharmony_ci			}
337cc1dc7a3Sopenharmony_ci		}
338cc1dc7a3Sopenharmony_ci	}
339cc1dc7a3Sopenharmony_ci
340cc1dc7a3Sopenharmony_ci	// Compute a few constants used in the variance-calculation.
341cc1dc7a3Sopenharmony_ci	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
342cc1dc7a3Sopenharmony_ci	float alpha_rsamples;
343cc1dc7a3Sopenharmony_ci
344cc1dc7a3Sopenharmony_ci	if (have_z)
345cc1dc7a3Sopenharmony_ci	{
346cc1dc7a3Sopenharmony_ci		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
347cc1dc7a3Sopenharmony_ci	}
348cc1dc7a3Sopenharmony_ci	else
349cc1dc7a3Sopenharmony_ci	{
350cc1dc7a3Sopenharmony_ci		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
351cc1dc7a3Sopenharmony_ci	}
352cc1dc7a3Sopenharmony_ci
353cc1dc7a3Sopenharmony_ci	// Use the summed-area tables to compute variance for each neighborhood
354cc1dc7a3Sopenharmony_ci	if (have_z)
355cc1dc7a3Sopenharmony_ci	{
356cc1dc7a3Sopenharmony_ci		for (int z = 0; z < size_z; z++)
357cc1dc7a3Sopenharmony_ci		{
358cc1dc7a3Sopenharmony_ci			int z_src = z + kernel_radius_z;
359cc1dc7a3Sopenharmony_ci			int z_dst = z + offset_z;
360cc1dc7a3Sopenharmony_ci			int z_low  = z_src - alpha_kernel_radius;
361cc1dc7a3Sopenharmony_ci			int z_high = z_src + alpha_kernel_radius + 1;
362cc1dc7a3Sopenharmony_ci
363cc1dc7a3Sopenharmony_ci			for (int y = 0; y < size_y; y++)
364cc1dc7a3Sopenharmony_ci			{
365cc1dc7a3Sopenharmony_ci				int y_src = y + kernel_radius_xy;
366cc1dc7a3Sopenharmony_ci				int y_dst = y + offset_y;
367cc1dc7a3Sopenharmony_ci				int y_low  = y_src - alpha_kernel_radius;
368cc1dc7a3Sopenharmony_ci				int y_high = y_src + alpha_kernel_radius + 1;
369cc1dc7a3Sopenharmony_ci
370cc1dc7a3Sopenharmony_ci				for (int x = 0; x < size_x; x++)
371cc1dc7a3Sopenharmony_ci				{
372cc1dc7a3Sopenharmony_ci					int x_src = x + kernel_radius_xy;
373cc1dc7a3Sopenharmony_ci					int x_dst = x + offset_x;
374cc1dc7a3Sopenharmony_ci					int x_low  = x_src - alpha_kernel_radius;
375cc1dc7a3Sopenharmony_ci					int x_high = x_src + alpha_kernel_radius + 1;
376cc1dc7a3Sopenharmony_ci
377cc1dc7a3Sopenharmony_ci					// Summed-area table lookups for alpha average
378cc1dc7a3Sopenharmony_ci					float vasum = (  VARBUF1(z_high, y_low,  x_low).lane<3>()
379cc1dc7a3Sopenharmony_ci					               - VARBUF1(z_high, y_low,  x_high).lane<3>()
380cc1dc7a3Sopenharmony_ci					               - VARBUF1(z_high, y_high, x_low).lane<3>()
381cc1dc7a3Sopenharmony_ci					               + VARBUF1(z_high, y_high, x_high).lane<3>()) -
382cc1dc7a3Sopenharmony_ci					              (  VARBUF1(z_low,  y_low,  x_low).lane<3>()
383cc1dc7a3Sopenharmony_ci					               - VARBUF1(z_low,  y_low,  x_high).lane<3>()
384cc1dc7a3Sopenharmony_ci					               - VARBUF1(z_low,  y_high, x_low).lane<3>()
385cc1dc7a3Sopenharmony_ci					               + VARBUF1(z_low,  y_high, x_high).lane<3>());
386cc1dc7a3Sopenharmony_ci
387cc1dc7a3Sopenharmony_ci					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
388cc1dc7a3Sopenharmony_ci					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
389cc1dc7a3Sopenharmony_ci				}
390cc1dc7a3Sopenharmony_ci			}
391cc1dc7a3Sopenharmony_ci		}
392cc1dc7a3Sopenharmony_ci	}
393cc1dc7a3Sopenharmony_ci	else
394cc1dc7a3Sopenharmony_ci	{
395cc1dc7a3Sopenharmony_ci		for (int y = 0; y < size_y; y++)
396cc1dc7a3Sopenharmony_ci		{
397cc1dc7a3Sopenharmony_ci			int y_src = y + kernel_radius_xy;
398cc1dc7a3Sopenharmony_ci			int y_dst = y + offset_y;
399cc1dc7a3Sopenharmony_ci			int y_low  = y_src - alpha_kernel_radius;
400cc1dc7a3Sopenharmony_ci			int y_high = y_src + alpha_kernel_radius + 1;
401cc1dc7a3Sopenharmony_ci
402cc1dc7a3Sopenharmony_ci			for (int x = 0; x < size_x; x++)
403cc1dc7a3Sopenharmony_ci			{
404cc1dc7a3Sopenharmony_ci				int x_src = x + kernel_radius_xy;
405cc1dc7a3Sopenharmony_ci				int x_dst = x + offset_x;
406cc1dc7a3Sopenharmony_ci				int x_low  = x_src - alpha_kernel_radius;
407cc1dc7a3Sopenharmony_ci				int x_high = x_src + alpha_kernel_radius + 1;
408cc1dc7a3Sopenharmony_ci
409cc1dc7a3Sopenharmony_ci				// Summed-area table lookups for alpha average
410cc1dc7a3Sopenharmony_ci				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
411cc1dc7a3Sopenharmony_ci				            - VARBUF1(0, y_low,  x_high).lane<3>()
412cc1dc7a3Sopenharmony_ci				            - VARBUF1(0, y_high, x_low).lane<3>()
413cc1dc7a3Sopenharmony_ci				            + VARBUF1(0, y_high, x_high).lane<3>();
414cc1dc7a3Sopenharmony_ci
415cc1dc7a3Sopenharmony_ci				int out_index = y_dst * ydt + x_dst;
416cc1dc7a3Sopenharmony_ci				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
417cc1dc7a3Sopenharmony_ci			}
418cc1dc7a3Sopenharmony_ci		}
419cc1dc7a3Sopenharmony_ci	}
420cc1dc7a3Sopenharmony_ci}
421cc1dc7a3Sopenharmony_ci
422cc1dc7a3Sopenharmony_ci/* See header for documentation. */
423cc1dc7a3Sopenharmony_ciunsigned int init_compute_averages(
424cc1dc7a3Sopenharmony_ci	const astcenc_image& img,
425cc1dc7a3Sopenharmony_ci	unsigned int alpha_kernel_radius,
426cc1dc7a3Sopenharmony_ci	const astcenc_swizzle& swz,
427cc1dc7a3Sopenharmony_ci	avg_args& ag
428cc1dc7a3Sopenharmony_ci) {
429cc1dc7a3Sopenharmony_ci	unsigned int size_x = img.dim_x;
430cc1dc7a3Sopenharmony_ci	unsigned int size_y = img.dim_y;
431cc1dc7a3Sopenharmony_ci	unsigned int size_z = img.dim_z;
432cc1dc7a3Sopenharmony_ci
433cc1dc7a3Sopenharmony_ci	// Compute maximum block size and from that the working memory buffer size
434cc1dc7a3Sopenharmony_ci	unsigned int kernel_radius = alpha_kernel_radius;
435cc1dc7a3Sopenharmony_ci	unsigned int kerneldim = 2 * kernel_radius + 1;
436cc1dc7a3Sopenharmony_ci
437cc1dc7a3Sopenharmony_ci	bool have_z = (size_z > 1);
438cc1dc7a3Sopenharmony_ci	unsigned int max_blk_size_xy = have_z ? 16 : 32;
439cc1dc7a3Sopenharmony_ci	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);
440cc1dc7a3Sopenharmony_ci
441cc1dc7a3Sopenharmony_ci	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
442cc1dc7a3Sopenharmony_ci	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);
443cc1dc7a3Sopenharmony_ci
444cc1dc7a3Sopenharmony_ci	// Perform block-wise averages calculations across the image
445cc1dc7a3Sopenharmony_ci	// Initialize fields which are not populated until later
446cc1dc7a3Sopenharmony_ci	ag.arg.size_x = 0;
447cc1dc7a3Sopenharmony_ci	ag.arg.size_y = 0;
448cc1dc7a3Sopenharmony_ci	ag.arg.size_z = 0;
449cc1dc7a3Sopenharmony_ci	ag.arg.offset_x = 0;
450cc1dc7a3Sopenharmony_ci	ag.arg.offset_y = 0;
451cc1dc7a3Sopenharmony_ci	ag.arg.offset_z = 0;
452cc1dc7a3Sopenharmony_ci	ag.arg.work_memory = nullptr;
453cc1dc7a3Sopenharmony_ci
454cc1dc7a3Sopenharmony_ci	ag.arg.img = &img;
455cc1dc7a3Sopenharmony_ci	ag.arg.swz = swz;
456cc1dc7a3Sopenharmony_ci	ag.arg.have_z = have_z;
457cc1dc7a3Sopenharmony_ci	ag.arg.alpha_kernel_radius = alpha_kernel_radius;
458cc1dc7a3Sopenharmony_ci
459cc1dc7a3Sopenharmony_ci	ag.img_size_x = size_x;
460cc1dc7a3Sopenharmony_ci	ag.img_size_y = size_y;
461cc1dc7a3Sopenharmony_ci	ag.img_size_z = size_z;
462cc1dc7a3Sopenharmony_ci	ag.blk_size_xy = max_blk_size_xy;
463cc1dc7a3Sopenharmony_ci	ag.blk_size_z = max_blk_size_z;
464cc1dc7a3Sopenharmony_ci	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
465cc1dc7a3Sopenharmony_ci
466cc1dc7a3Sopenharmony_ci	// The parallel task count
467cc1dc7a3Sopenharmony_ci	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
468cc1dc7a3Sopenharmony_ci	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
469cc1dc7a3Sopenharmony_ci	return z_tasks * y_tasks;
470cc1dc7a3Sopenharmony_ci}
471cc1dc7a3Sopenharmony_ci
472cc1dc7a3Sopenharmony_ci#endif
473