1cc1dc7a3Sopenharmony_ci// SPDX-License-Identifier: Apache-2.0 2cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 3cc1dc7a3Sopenharmony_ci// Copyright 2011-2022 Arm Limited 4cc1dc7a3Sopenharmony_ci// 5cc1dc7a3Sopenharmony_ci// Licensed under the Apache License, Version 2.0 (the "License"); you may not 6cc1dc7a3Sopenharmony_ci// use this file except in compliance with the License. You may obtain a copy 7cc1dc7a3Sopenharmony_ci// of the License at: 8cc1dc7a3Sopenharmony_ci// 9cc1dc7a3Sopenharmony_ci// http://www.apache.org/licenses/LICENSE-2.0 10cc1dc7a3Sopenharmony_ci// 11cc1dc7a3Sopenharmony_ci// Unless required by applicable law or agreed to in writing, software 12cc1dc7a3Sopenharmony_ci// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13cc1dc7a3Sopenharmony_ci// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14cc1dc7a3Sopenharmony_ci// License for the specific language governing permissions and limitations 15cc1dc7a3Sopenharmony_ci// under the License. 16cc1dc7a3Sopenharmony_ci// ---------------------------------------------------------------------------- 17cc1dc7a3Sopenharmony_ci 18cc1dc7a3Sopenharmony_ci#if !defined(ASTCENC_DECOMPRESS_ONLY) 19cc1dc7a3Sopenharmony_ci 20cc1dc7a3Sopenharmony_ci/** 21cc1dc7a3Sopenharmony_ci * @brief Functions to calculate variance per component in a NxN footprint. 22cc1dc7a3Sopenharmony_ci * 23cc1dc7a3Sopenharmony_ci * We need N to be parametric, so the routine below uses summed area tables in order to execute in 24cc1dc7a3Sopenharmony_ci * O(1) time independent of how big N is. 25cc1dc7a3Sopenharmony_ci * 26cc1dc7a3Sopenharmony_ci * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first 27cc1dc7a3Sopenharmony_ci * perform a binary reduction, and then distributes the results. This method means that there is no 28cc1dc7a3Sopenharmony_ci * serial dependency between a given element and the next one, and also significantly improves 29cc1dc7a3Sopenharmony_ci * numerical stability allowing us to use floats rather than doubles. 30cc1dc7a3Sopenharmony_ci */ 31cc1dc7a3Sopenharmony_ci 32cc1dc7a3Sopenharmony_ci#include "astcenc_internal.h" 33cc1dc7a3Sopenharmony_ci 34cc1dc7a3Sopenharmony_ci#include <cassert> 35cc1dc7a3Sopenharmony_ci 36cc1dc7a3Sopenharmony_ci/** 37cc1dc7a3Sopenharmony_ci * @brief Generate a prefix-sum array using the Brent-Kung algorithm. 38cc1dc7a3Sopenharmony_ci * 39cc1dc7a3Sopenharmony_ci * This will take an input array of the form: 40cc1dc7a3Sopenharmony_ci * v0, v1, v2, ... 41cc1dc7a3Sopenharmony_ci * ... and modify in-place to turn it into a prefix-sum array of the form: 42cc1dc7a3Sopenharmony_ci * v0, v0+v1, v0+v1+v2, ... 43cc1dc7a3Sopenharmony_ci * 44cc1dc7a3Sopenharmony_ci * @param d The array to prefix-sum. 45cc1dc7a3Sopenharmony_ci * @param items The number of items in the array. 46cc1dc7a3Sopenharmony_ci * @param stride The item spacing in the array; i.e. dense arrays should use 1. 47cc1dc7a3Sopenharmony_ci */ 48cc1dc7a3Sopenharmony_cistatic void brent_kung_prefix_sum( 49cc1dc7a3Sopenharmony_ci vfloat4* d, 50cc1dc7a3Sopenharmony_ci size_t items, 51cc1dc7a3Sopenharmony_ci int stride 52cc1dc7a3Sopenharmony_ci) { 53cc1dc7a3Sopenharmony_ci if (items < 2) 54cc1dc7a3Sopenharmony_ci return; 55cc1dc7a3Sopenharmony_ci 56cc1dc7a3Sopenharmony_ci size_t lc_stride = 2; 57cc1dc7a3Sopenharmony_ci size_t log2_stride = 1; 58cc1dc7a3Sopenharmony_ci 59cc1dc7a3Sopenharmony_ci // The reduction-tree loop 60cc1dc7a3Sopenharmony_ci do { 61cc1dc7a3Sopenharmony_ci size_t step = lc_stride >> 1; 62cc1dc7a3Sopenharmony_ci size_t start = lc_stride - 1; 63cc1dc7a3Sopenharmony_ci size_t iters = items >> log2_stride; 64cc1dc7a3Sopenharmony_ci 65cc1dc7a3Sopenharmony_ci vfloat4 *da = d + (start * stride); 66cc1dc7a3Sopenharmony_ci ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride); 67cc1dc7a3Sopenharmony_ci size_t ofs_stride = stride << log2_stride; 68cc1dc7a3Sopenharmony_ci 69cc1dc7a3Sopenharmony_ci while (iters) 70cc1dc7a3Sopenharmony_ci { 71cc1dc7a3Sopenharmony_ci *da = *da + da[ofs]; 72cc1dc7a3Sopenharmony_ci da += ofs_stride; 73cc1dc7a3Sopenharmony_ci iters--; 74cc1dc7a3Sopenharmony_ci } 75cc1dc7a3Sopenharmony_ci 76cc1dc7a3Sopenharmony_ci log2_stride += 1; 77cc1dc7a3Sopenharmony_ci lc_stride <<= 1; 78cc1dc7a3Sopenharmony_ci } while (lc_stride <= items); 79cc1dc7a3Sopenharmony_ci 80cc1dc7a3Sopenharmony_ci // The expansion-tree loop 81cc1dc7a3Sopenharmony_ci do { 82cc1dc7a3Sopenharmony_ci log2_stride -= 1; 83cc1dc7a3Sopenharmony_ci lc_stride >>= 1; 84cc1dc7a3Sopenharmony_ci 85cc1dc7a3Sopenharmony_ci size_t step = lc_stride >> 1; 86cc1dc7a3Sopenharmony_ci size_t start = step + lc_stride - 1; 87cc1dc7a3Sopenharmony_ci size_t iters = (items - step) >> log2_stride; 88cc1dc7a3Sopenharmony_ci 89cc1dc7a3Sopenharmony_ci vfloat4 *da = d + (start * stride); 90cc1dc7a3Sopenharmony_ci ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride); 91cc1dc7a3Sopenharmony_ci size_t ofs_stride = stride << log2_stride; 92cc1dc7a3Sopenharmony_ci 93cc1dc7a3Sopenharmony_ci while (iters) 94cc1dc7a3Sopenharmony_ci { 95cc1dc7a3Sopenharmony_ci *da = *da + da[ofs]; 96cc1dc7a3Sopenharmony_ci da += ofs_stride; 97cc1dc7a3Sopenharmony_ci iters--; 98cc1dc7a3Sopenharmony_ci } 99cc1dc7a3Sopenharmony_ci } while (lc_stride > 2); 100cc1dc7a3Sopenharmony_ci} 101cc1dc7a3Sopenharmony_ci 102cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 103cc1dc7a3Sopenharmony_civoid compute_pixel_region_variance( 104cc1dc7a3Sopenharmony_ci astcenc_contexti& ctx, 105cc1dc7a3Sopenharmony_ci const pixel_region_args& arg 106cc1dc7a3Sopenharmony_ci) { 107cc1dc7a3Sopenharmony_ci // Unpack the memory structure into local variables 108cc1dc7a3Sopenharmony_ci const astcenc_image* img = arg.img; 109cc1dc7a3Sopenharmony_ci astcenc_swizzle swz = arg.swz; 110cc1dc7a3Sopenharmony_ci bool have_z = arg.have_z; 111cc1dc7a3Sopenharmony_ci 112cc1dc7a3Sopenharmony_ci int size_x = arg.size_x; 113cc1dc7a3Sopenharmony_ci int size_y = arg.size_y; 114cc1dc7a3Sopenharmony_ci int size_z = arg.size_z; 115cc1dc7a3Sopenharmony_ci 116cc1dc7a3Sopenharmony_ci int offset_x = arg.offset_x; 117cc1dc7a3Sopenharmony_ci int offset_y = arg.offset_y; 118cc1dc7a3Sopenharmony_ci int offset_z = arg.offset_z; 119cc1dc7a3Sopenharmony_ci 120cc1dc7a3Sopenharmony_ci int alpha_kernel_radius = arg.alpha_kernel_radius; 121cc1dc7a3Sopenharmony_ci 122cc1dc7a3Sopenharmony_ci float* input_alpha_averages = ctx.input_alpha_averages; 123cc1dc7a3Sopenharmony_ci vfloat4* work_memory = arg.work_memory; 124cc1dc7a3Sopenharmony_ci 125cc1dc7a3Sopenharmony_ci // Compute memory sizes and dimensions that we need 126cc1dc7a3Sopenharmony_ci int kernel_radius = alpha_kernel_radius; 127cc1dc7a3Sopenharmony_ci int kerneldim = 2 * kernel_radius + 1; 128cc1dc7a3Sopenharmony_ci int kernel_radius_xy = kernel_radius; 129cc1dc7a3Sopenharmony_ci int kernel_radius_z = have_z ? kernel_radius : 0; 130cc1dc7a3Sopenharmony_ci 131cc1dc7a3Sopenharmony_ci int padsize_x = size_x + kerneldim; 132cc1dc7a3Sopenharmony_ci int padsize_y = size_y + kerneldim; 133cc1dc7a3Sopenharmony_ci int padsize_z = size_z + (have_z ? kerneldim : 0); 134cc1dc7a3Sopenharmony_ci int sizeprod = padsize_x * padsize_y * padsize_z; 135cc1dc7a3Sopenharmony_ci 136cc1dc7a3Sopenharmony_ci int zd_start = have_z ? 1 : 0; 137cc1dc7a3Sopenharmony_ci 138cc1dc7a3Sopenharmony_ci vfloat4 *varbuf1 = work_memory; 139cc1dc7a3Sopenharmony_ci vfloat4 *varbuf2 = work_memory + sizeprod; 140cc1dc7a3Sopenharmony_ci 141cc1dc7a3Sopenharmony_ci // Scaling factors to apply to Y and Z for accesses into the work buffers 142cc1dc7a3Sopenharmony_ci int yst = padsize_x; 143cc1dc7a3Sopenharmony_ci int zst = padsize_x * padsize_y; 144cc1dc7a3Sopenharmony_ci 145cc1dc7a3Sopenharmony_ci // Scaling factors to apply to Y and Z for accesses into result buffers 146cc1dc7a3Sopenharmony_ci int ydt = img->dim_x; 147cc1dc7a3Sopenharmony_ci int zdt = img->dim_x * img->dim_y; 148cc1dc7a3Sopenharmony_ci 149cc1dc7a3Sopenharmony_ci // Macros to act as accessor functions for the work-memory 150cc1dc7a3Sopenharmony_ci #define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x] 151cc1dc7a3Sopenharmony_ci #define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x] 152cc1dc7a3Sopenharmony_ci 153cc1dc7a3Sopenharmony_ci // Load N and N^2 values into the work buffers 154cc1dc7a3Sopenharmony_ci if (img->data_type == ASTCENC_TYPE_U8) 155cc1dc7a3Sopenharmony_ci { 156cc1dc7a3Sopenharmony_ci // Swizzle data structure 4 = ZERO, 5 = ONE 157cc1dc7a3Sopenharmony_ci uint8_t data[6]; 158cc1dc7a3Sopenharmony_ci data[ASTCENC_SWZ_0] = 0; 159cc1dc7a3Sopenharmony_ci data[ASTCENC_SWZ_1] = 255; 160cc1dc7a3Sopenharmony_ci 161cc1dc7a3Sopenharmony_ci for (int z = zd_start; z < padsize_z; z++) 162cc1dc7a3Sopenharmony_ci { 163cc1dc7a3Sopenharmony_ci int z_src = (z - zd_start) + offset_z - kernel_radius_z; 164cc1dc7a3Sopenharmony_ci z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1)); 165cc1dc7a3Sopenharmony_ci uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]); 166cc1dc7a3Sopenharmony_ci 167cc1dc7a3Sopenharmony_ci for (int y = 1; y < padsize_y; y++) 168cc1dc7a3Sopenharmony_ci { 169cc1dc7a3Sopenharmony_ci int y_src = (y - 1) + offset_y - kernel_radius_xy; 170cc1dc7a3Sopenharmony_ci y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1)); 171cc1dc7a3Sopenharmony_ci 172cc1dc7a3Sopenharmony_ci for (int x = 1; x < padsize_x; x++) 173cc1dc7a3Sopenharmony_ci { 174cc1dc7a3Sopenharmony_ci int x_src = (x - 1) + offset_x - kernel_radius_xy; 175cc1dc7a3Sopenharmony_ci x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1)); 176cc1dc7a3Sopenharmony_ci 177cc1dc7a3Sopenharmony_ci data[0] = data8[(4 * img->dim_stride * y_src) + (4 * x_src )]; 178cc1dc7a3Sopenharmony_ci data[1] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 1)]; 179cc1dc7a3Sopenharmony_ci data[2] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 2)]; 180cc1dc7a3Sopenharmony_ci data[3] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 3)]; 181cc1dc7a3Sopenharmony_ci 182cc1dc7a3Sopenharmony_ci uint8_t r = data[swz.r]; 183cc1dc7a3Sopenharmony_ci uint8_t g = data[swz.g]; 184cc1dc7a3Sopenharmony_ci uint8_t b = data[swz.b]; 185cc1dc7a3Sopenharmony_ci uint8_t a = data[swz.a]; 186cc1dc7a3Sopenharmony_ci 187cc1dc7a3Sopenharmony_ci vfloat4 d = vfloat4 (r * (1.0f / 255.0f), 188cc1dc7a3Sopenharmony_ci g * (1.0f / 255.0f), 189cc1dc7a3Sopenharmony_ci b * (1.0f / 255.0f), 190cc1dc7a3Sopenharmony_ci a * (1.0f / 255.0f)); 191cc1dc7a3Sopenharmony_ci 192cc1dc7a3Sopenharmony_ci VARBUF1(z, y, x) = d; 193cc1dc7a3Sopenharmony_ci VARBUF2(z, y, x) = d * d; 194cc1dc7a3Sopenharmony_ci } 195cc1dc7a3Sopenharmony_ci } 196cc1dc7a3Sopenharmony_ci } 197cc1dc7a3Sopenharmony_ci } 198cc1dc7a3Sopenharmony_ci else if (img->data_type == ASTCENC_TYPE_F16) 199cc1dc7a3Sopenharmony_ci { 200cc1dc7a3Sopenharmony_ci // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16) 201cc1dc7a3Sopenharmony_ci uint16_t data[6]; 202cc1dc7a3Sopenharmony_ci data[ASTCENC_SWZ_0] = 0; 203cc1dc7a3Sopenharmony_ci data[ASTCENC_SWZ_1] = 0x3C00; 204cc1dc7a3Sopenharmony_ci 205cc1dc7a3Sopenharmony_ci for (int z = zd_start; z < padsize_z; z++) 206cc1dc7a3Sopenharmony_ci { 207cc1dc7a3Sopenharmony_ci int z_src = (z - zd_start) + offset_z - kernel_radius_z; 208cc1dc7a3Sopenharmony_ci z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1)); 209cc1dc7a3Sopenharmony_ci uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]); 210cc1dc7a3Sopenharmony_ci 211cc1dc7a3Sopenharmony_ci for (int y = 1; y < padsize_y; y++) 212cc1dc7a3Sopenharmony_ci { 213cc1dc7a3Sopenharmony_ci int y_src = (y - 1) + offset_y - kernel_radius_xy; 214cc1dc7a3Sopenharmony_ci y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1)); 215cc1dc7a3Sopenharmony_ci 216cc1dc7a3Sopenharmony_ci for (int x = 1; x < padsize_x; x++) 217cc1dc7a3Sopenharmony_ci { 218cc1dc7a3Sopenharmony_ci int x_src = (x - 1) + offset_x - kernel_radius_xy; 219cc1dc7a3Sopenharmony_ci x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1)); 220cc1dc7a3Sopenharmony_ci 221cc1dc7a3Sopenharmony_ci data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )]; 222cc1dc7a3Sopenharmony_ci data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; 223cc1dc7a3Sopenharmony_ci data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)]; 224cc1dc7a3Sopenharmony_ci data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)]; 225cc1dc7a3Sopenharmony_ci 226cc1dc7a3Sopenharmony_ci vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); 227cc1dc7a3Sopenharmony_ci vfloat4 d = float16_to_float(di); 228cc1dc7a3Sopenharmony_ci 229cc1dc7a3Sopenharmony_ci VARBUF1(z, y, x) = d; 230cc1dc7a3Sopenharmony_ci VARBUF2(z, y, x) = d * d; 231cc1dc7a3Sopenharmony_ci } 232cc1dc7a3Sopenharmony_ci } 233cc1dc7a3Sopenharmony_ci } 234cc1dc7a3Sopenharmony_ci } 235cc1dc7a3Sopenharmony_ci else // if (img->data_type == ASTCENC_TYPE_F32) 236cc1dc7a3Sopenharmony_ci { 237cc1dc7a3Sopenharmony_ci assert(img->data_type == ASTCENC_TYPE_F32); 238cc1dc7a3Sopenharmony_ci 239cc1dc7a3Sopenharmony_ci // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16) 240cc1dc7a3Sopenharmony_ci float data[6]; 241cc1dc7a3Sopenharmony_ci data[ASTCENC_SWZ_0] = 0.0f; 242cc1dc7a3Sopenharmony_ci data[ASTCENC_SWZ_1] = 1.0f; 243cc1dc7a3Sopenharmony_ci 244cc1dc7a3Sopenharmony_ci for (int z = zd_start; z < padsize_z; z++) 245cc1dc7a3Sopenharmony_ci { 246cc1dc7a3Sopenharmony_ci int z_src = (z - zd_start) + offset_z - kernel_radius_z; 247cc1dc7a3Sopenharmony_ci z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1)); 248cc1dc7a3Sopenharmony_ci float* data32 = static_cast<float*>(img->data[z_src]); 249cc1dc7a3Sopenharmony_ci 250cc1dc7a3Sopenharmony_ci for (int y = 1; y < padsize_y; y++) 251cc1dc7a3Sopenharmony_ci { 252cc1dc7a3Sopenharmony_ci int y_src = (y - 1) + offset_y - kernel_radius_xy; 253cc1dc7a3Sopenharmony_ci y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1)); 254cc1dc7a3Sopenharmony_ci 255cc1dc7a3Sopenharmony_ci for (int x = 1; x < padsize_x; x++) 256cc1dc7a3Sopenharmony_ci { 257cc1dc7a3Sopenharmony_ci int x_src = (x - 1) + offset_x - kernel_radius_xy; 258cc1dc7a3Sopenharmony_ci x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1)); 259cc1dc7a3Sopenharmony_ci 260cc1dc7a3Sopenharmony_ci data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )]; 261cc1dc7a3Sopenharmony_ci data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; 262cc1dc7a3Sopenharmony_ci data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)]; 263cc1dc7a3Sopenharmony_ci data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)]; 264cc1dc7a3Sopenharmony_ci 265cc1dc7a3Sopenharmony_ci float r = data[swz.r]; 266cc1dc7a3Sopenharmony_ci float g = data[swz.g]; 267cc1dc7a3Sopenharmony_ci float b = data[swz.b]; 268cc1dc7a3Sopenharmony_ci float a = data[swz.a]; 269cc1dc7a3Sopenharmony_ci 270cc1dc7a3Sopenharmony_ci vfloat4 d(r, g, b, a); 271cc1dc7a3Sopenharmony_ci 272cc1dc7a3Sopenharmony_ci VARBUF1(z, y, x) = d; 273cc1dc7a3Sopenharmony_ci VARBUF2(z, y, x) = d * d; 274cc1dc7a3Sopenharmony_ci } 275cc1dc7a3Sopenharmony_ci } 276cc1dc7a3Sopenharmony_ci } 277cc1dc7a3Sopenharmony_ci } 278cc1dc7a3Sopenharmony_ci 279cc1dc7a3Sopenharmony_ci // Pad with an extra layer of 0s; this forms the edge of the SAT tables 280cc1dc7a3Sopenharmony_ci vfloat4 vbz = vfloat4::zero(); 281cc1dc7a3Sopenharmony_ci for (int z = 0; z < padsize_z; z++) 282cc1dc7a3Sopenharmony_ci { 283cc1dc7a3Sopenharmony_ci for (int y = 0; y < padsize_y; y++) 284cc1dc7a3Sopenharmony_ci { 285cc1dc7a3Sopenharmony_ci VARBUF1(z, y, 0) = vbz; 286cc1dc7a3Sopenharmony_ci VARBUF2(z, y, 0) = vbz; 287cc1dc7a3Sopenharmony_ci } 288cc1dc7a3Sopenharmony_ci 289cc1dc7a3Sopenharmony_ci for (int x = 0; x < padsize_x; x++) 290cc1dc7a3Sopenharmony_ci { 291cc1dc7a3Sopenharmony_ci VARBUF1(z, 0, x) = vbz; 292cc1dc7a3Sopenharmony_ci VARBUF2(z, 0, x) = vbz; 293cc1dc7a3Sopenharmony_ci } 294cc1dc7a3Sopenharmony_ci } 295cc1dc7a3Sopenharmony_ci 296cc1dc7a3Sopenharmony_ci if (have_z) 297cc1dc7a3Sopenharmony_ci { 298cc1dc7a3Sopenharmony_ci for (int y = 0; y < padsize_y; y++) 299cc1dc7a3Sopenharmony_ci { 300cc1dc7a3Sopenharmony_ci for (int x = 0; x < padsize_x; x++) 301cc1dc7a3Sopenharmony_ci { 302cc1dc7a3Sopenharmony_ci VARBUF1(0, y, x) = vbz; 303cc1dc7a3Sopenharmony_ci VARBUF2(0, y, x) = vbz; 304cc1dc7a3Sopenharmony_ci } 305cc1dc7a3Sopenharmony_ci } 306cc1dc7a3Sopenharmony_ci } 307cc1dc7a3Sopenharmony_ci 308cc1dc7a3Sopenharmony_ci // Generate summed-area tables for N and N^2; this is done in-place, using 309cc1dc7a3Sopenharmony_ci // a Brent-Kung parallel-prefix based algorithm to minimize precision loss 310cc1dc7a3Sopenharmony_ci for (int z = zd_start; z < padsize_z; z++) 311cc1dc7a3Sopenharmony_ci { 312cc1dc7a3Sopenharmony_ci for (int y = 1; y < padsize_y; y++) 313cc1dc7a3Sopenharmony_ci { 314cc1dc7a3Sopenharmony_ci brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1); 315cc1dc7a3Sopenharmony_ci brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1); 316cc1dc7a3Sopenharmony_ci } 317cc1dc7a3Sopenharmony_ci } 318cc1dc7a3Sopenharmony_ci 319cc1dc7a3Sopenharmony_ci for (int z = zd_start; z < padsize_z; z++) 320cc1dc7a3Sopenharmony_ci { 321cc1dc7a3Sopenharmony_ci for (int x = 1; x < padsize_x; x++) 322cc1dc7a3Sopenharmony_ci { 323cc1dc7a3Sopenharmony_ci brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst); 324cc1dc7a3Sopenharmony_ci brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst); 325cc1dc7a3Sopenharmony_ci } 326cc1dc7a3Sopenharmony_ci } 327cc1dc7a3Sopenharmony_ci 328cc1dc7a3Sopenharmony_ci if (have_z) 329cc1dc7a3Sopenharmony_ci { 330cc1dc7a3Sopenharmony_ci for (int y = 1; y < padsize_y; y++) 331cc1dc7a3Sopenharmony_ci { 332cc1dc7a3Sopenharmony_ci for (int x = 1; x < padsize_x; x++) 333cc1dc7a3Sopenharmony_ci { 334cc1dc7a3Sopenharmony_ci brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst); 335cc1dc7a3Sopenharmony_ci brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst); 336cc1dc7a3Sopenharmony_ci } 337cc1dc7a3Sopenharmony_ci } 338cc1dc7a3Sopenharmony_ci } 339cc1dc7a3Sopenharmony_ci 340cc1dc7a3Sopenharmony_ci // Compute a few constants used in the variance-calculation. 341cc1dc7a3Sopenharmony_ci float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1); 342cc1dc7a3Sopenharmony_ci float alpha_rsamples; 343cc1dc7a3Sopenharmony_ci 344cc1dc7a3Sopenharmony_ci if (have_z) 345cc1dc7a3Sopenharmony_ci { 346cc1dc7a3Sopenharmony_ci alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim); 347cc1dc7a3Sopenharmony_ci } 348cc1dc7a3Sopenharmony_ci else 349cc1dc7a3Sopenharmony_ci { 350cc1dc7a3Sopenharmony_ci alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim); 351cc1dc7a3Sopenharmony_ci } 352cc1dc7a3Sopenharmony_ci 353cc1dc7a3Sopenharmony_ci // Use the summed-area tables to compute variance for each neighborhood 354cc1dc7a3Sopenharmony_ci if (have_z) 355cc1dc7a3Sopenharmony_ci { 356cc1dc7a3Sopenharmony_ci for (int z = 0; z < size_z; z++) 357cc1dc7a3Sopenharmony_ci { 358cc1dc7a3Sopenharmony_ci int z_src = z + kernel_radius_z; 359cc1dc7a3Sopenharmony_ci int z_dst = z + offset_z; 360cc1dc7a3Sopenharmony_ci int z_low = z_src - alpha_kernel_radius; 361cc1dc7a3Sopenharmony_ci int z_high = z_src + alpha_kernel_radius + 1; 362cc1dc7a3Sopenharmony_ci 363cc1dc7a3Sopenharmony_ci for (int y = 0; y < size_y; y++) 364cc1dc7a3Sopenharmony_ci { 365cc1dc7a3Sopenharmony_ci int y_src = y + kernel_radius_xy; 366cc1dc7a3Sopenharmony_ci int y_dst = y + offset_y; 367cc1dc7a3Sopenharmony_ci int y_low = y_src - alpha_kernel_radius; 368cc1dc7a3Sopenharmony_ci int y_high = y_src + alpha_kernel_radius + 1; 369cc1dc7a3Sopenharmony_ci 370cc1dc7a3Sopenharmony_ci for (int x = 0; x < size_x; x++) 371cc1dc7a3Sopenharmony_ci { 372cc1dc7a3Sopenharmony_ci int x_src = x + kernel_radius_xy; 373cc1dc7a3Sopenharmony_ci int x_dst = x + offset_x; 374cc1dc7a3Sopenharmony_ci int x_low = x_src - alpha_kernel_radius; 375cc1dc7a3Sopenharmony_ci int x_high = x_src + alpha_kernel_radius + 1; 376cc1dc7a3Sopenharmony_ci 377cc1dc7a3Sopenharmony_ci // Summed-area table lookups for alpha average 378cc1dc7a3Sopenharmony_ci float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>() 379cc1dc7a3Sopenharmony_ci - VARBUF1(z_high, y_low, x_high).lane<3>() 380cc1dc7a3Sopenharmony_ci - VARBUF1(z_high, y_high, x_low).lane<3>() 381cc1dc7a3Sopenharmony_ci + VARBUF1(z_high, y_high, x_high).lane<3>()) - 382cc1dc7a3Sopenharmony_ci ( VARBUF1(z_low, y_low, x_low).lane<3>() 383cc1dc7a3Sopenharmony_ci - VARBUF1(z_low, y_low, x_high).lane<3>() 384cc1dc7a3Sopenharmony_ci - VARBUF1(z_low, y_high, x_low).lane<3>() 385cc1dc7a3Sopenharmony_ci + VARBUF1(z_low, y_high, x_high).lane<3>()); 386cc1dc7a3Sopenharmony_ci 387cc1dc7a3Sopenharmony_ci int out_index = z_dst * zdt + y_dst * ydt + x_dst; 388cc1dc7a3Sopenharmony_ci input_alpha_averages[out_index] = (vasum * alpha_rsamples); 389cc1dc7a3Sopenharmony_ci } 390cc1dc7a3Sopenharmony_ci } 391cc1dc7a3Sopenharmony_ci } 392cc1dc7a3Sopenharmony_ci } 393cc1dc7a3Sopenharmony_ci else 394cc1dc7a3Sopenharmony_ci { 395cc1dc7a3Sopenharmony_ci for (int y = 0; y < size_y; y++) 396cc1dc7a3Sopenharmony_ci { 397cc1dc7a3Sopenharmony_ci int y_src = y + kernel_radius_xy; 398cc1dc7a3Sopenharmony_ci int y_dst = y + offset_y; 399cc1dc7a3Sopenharmony_ci int y_low = y_src - alpha_kernel_radius; 400cc1dc7a3Sopenharmony_ci int y_high = y_src + alpha_kernel_radius + 1; 401cc1dc7a3Sopenharmony_ci 402cc1dc7a3Sopenharmony_ci for (int x = 0; x < size_x; x++) 403cc1dc7a3Sopenharmony_ci { 404cc1dc7a3Sopenharmony_ci int x_src = x + kernel_radius_xy; 405cc1dc7a3Sopenharmony_ci int x_dst = x + offset_x; 406cc1dc7a3Sopenharmony_ci int x_low = x_src - alpha_kernel_radius; 407cc1dc7a3Sopenharmony_ci int x_high = x_src + alpha_kernel_radius + 1; 408cc1dc7a3Sopenharmony_ci 409cc1dc7a3Sopenharmony_ci // Summed-area table lookups for alpha average 410cc1dc7a3Sopenharmony_ci float vasum = VARBUF1(0, y_low, x_low).lane<3>() 411cc1dc7a3Sopenharmony_ci - VARBUF1(0, y_low, x_high).lane<3>() 412cc1dc7a3Sopenharmony_ci - VARBUF1(0, y_high, x_low).lane<3>() 413cc1dc7a3Sopenharmony_ci + VARBUF1(0, y_high, x_high).lane<3>(); 414cc1dc7a3Sopenharmony_ci 415cc1dc7a3Sopenharmony_ci int out_index = y_dst * ydt + x_dst; 416cc1dc7a3Sopenharmony_ci input_alpha_averages[out_index] = (vasum * alpha_rsamples); 417cc1dc7a3Sopenharmony_ci } 418cc1dc7a3Sopenharmony_ci } 419cc1dc7a3Sopenharmony_ci } 420cc1dc7a3Sopenharmony_ci} 421cc1dc7a3Sopenharmony_ci 422cc1dc7a3Sopenharmony_ci/* See header for documentation. */ 423cc1dc7a3Sopenharmony_ciunsigned int init_compute_averages( 424cc1dc7a3Sopenharmony_ci const astcenc_image& img, 425cc1dc7a3Sopenharmony_ci unsigned int alpha_kernel_radius, 426cc1dc7a3Sopenharmony_ci const astcenc_swizzle& swz, 427cc1dc7a3Sopenharmony_ci avg_args& ag 428cc1dc7a3Sopenharmony_ci) { 429cc1dc7a3Sopenharmony_ci unsigned int size_x = img.dim_x; 430cc1dc7a3Sopenharmony_ci unsigned int size_y = img.dim_y; 431cc1dc7a3Sopenharmony_ci unsigned int size_z = img.dim_z; 432cc1dc7a3Sopenharmony_ci 433cc1dc7a3Sopenharmony_ci // Compute maximum block size and from that the working memory buffer size 434cc1dc7a3Sopenharmony_ci unsigned int kernel_radius = alpha_kernel_radius; 435cc1dc7a3Sopenharmony_ci unsigned int kerneldim = 2 * kernel_radius + 1; 436cc1dc7a3Sopenharmony_ci 437cc1dc7a3Sopenharmony_ci bool have_z = (size_z > 1); 438cc1dc7a3Sopenharmony_ci unsigned int max_blk_size_xy = have_z ? 16 : 32; 439cc1dc7a3Sopenharmony_ci unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u); 440cc1dc7a3Sopenharmony_ci 441cc1dc7a3Sopenharmony_ci unsigned int max_padsize_xy = max_blk_size_xy + kerneldim; 442cc1dc7a3Sopenharmony_ci unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0); 443cc1dc7a3Sopenharmony_ci 444cc1dc7a3Sopenharmony_ci // Perform block-wise averages calculations across the image 445cc1dc7a3Sopenharmony_ci // Initialize fields which are not populated until later 446cc1dc7a3Sopenharmony_ci ag.arg.size_x = 0; 447cc1dc7a3Sopenharmony_ci ag.arg.size_y = 0; 448cc1dc7a3Sopenharmony_ci ag.arg.size_z = 0; 449cc1dc7a3Sopenharmony_ci ag.arg.offset_x = 0; 450cc1dc7a3Sopenharmony_ci ag.arg.offset_y = 0; 451cc1dc7a3Sopenharmony_ci ag.arg.offset_z = 0; 452cc1dc7a3Sopenharmony_ci ag.arg.work_memory = nullptr; 453cc1dc7a3Sopenharmony_ci 454cc1dc7a3Sopenharmony_ci ag.arg.img = &img; 455cc1dc7a3Sopenharmony_ci ag.arg.swz = swz; 456cc1dc7a3Sopenharmony_ci ag.arg.have_z = have_z; 457cc1dc7a3Sopenharmony_ci ag.arg.alpha_kernel_radius = alpha_kernel_radius; 458cc1dc7a3Sopenharmony_ci 459cc1dc7a3Sopenharmony_ci ag.img_size_x = size_x; 460cc1dc7a3Sopenharmony_ci ag.img_size_y = size_y; 461cc1dc7a3Sopenharmony_ci ag.img_size_z = size_z; 462cc1dc7a3Sopenharmony_ci ag.blk_size_xy = max_blk_size_xy; 463cc1dc7a3Sopenharmony_ci ag.blk_size_z = max_blk_size_z; 464cc1dc7a3Sopenharmony_ci ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z; 465cc1dc7a3Sopenharmony_ci 466cc1dc7a3Sopenharmony_ci // The parallel task count 467cc1dc7a3Sopenharmony_ci unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z; 468cc1dc7a3Sopenharmony_ci unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy; 469cc1dc7a3Sopenharmony_ci return z_tasks * y_tasks; 470cc1dc7a3Sopenharmony_ci} 471cc1dc7a3Sopenharmony_ci 472cc1dc7a3Sopenharmony_ci#endif 473