/*
 * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
 * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "pan_tiling.h"
#include <stdbool.h>
#include "util/macros.h"
#include "util/bitscan.h"

/*
 * This file implements software encode/decode of u-interleaved textures.
 * See docs/drivers/panfrost.rst for details on the format.
 *
 * The tricky bit is ordering along the space-filling curve:
 *
 *    | y3 | (y3 ^ x3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * While interleaving bits is trivial in hardware, it is nontrivial in
 * software. The trick is to divide the pattern up:
 *
 *    | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 *  ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with its bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting+ANDing the
 * mask pattern (abusing carry bits).
 */

/* Given the lower 4 bits of the Y coordinate, we would like to
 * duplicate every bit over. So instead of 0b1010, we would like
 * 0b11001100. The idea is that the bits in the pure-Y places get a copy of
 * Y, and the bits in the XOR places *also* get a copy of Y. */

const uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space out the bits of a 4-bit nibble */

const unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

/* The scheme uses 16x16 tiles */

#define TILE_WIDTH 16
#define TILE_HEIGHT 16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)
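
/* Worked example, for illustration only (not used by the code below): the
 * pixel at tile-local coordinates (x, y) = (5, 3) lands at within-tile index
 *
 *    bit_duplication[3] ^ space_4[5] = 0b00001111 ^ 0b0010001 = 30,
 *
 * which matches interleaving along the curve directly:
 * y3 (y3^x3) y2 (y2^x2) y1 (y1^x1) y0 (y0^x0) = 0b00011110 = 30
 * for x = 0b0101, y = 0b0011. */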

/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * need only support copies and sizeof, so emulating it with a packed
 * structure works well enough, but if there's a native 128-bit type we may
 * as well prefer that. */

#ifdef __SIZEOF_INT128__
typedef __uint128_t pan_uint128_t;
#else
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#endif

typedef struct {
   uint16_t lo;
   uint8_t hi;
} __attribute__((packed)) pan_uint24_t;

typedef struct {
   uint32_t lo;
   uint16_t hi;
} __attribute__((packed)) pan_uint48_t;

typedef struct {
   uint64_t lo;
   uint32_t hi;
} __attribute__((packed)) pan_uint96_t;

/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then multiply
 * by the bytes per tile.
 *
 * We iterate across the pixels we're trying to store in source order. For
 * each row in the destination image, we figure out which row of 16x16 blocks
 * we're in by slicing off the lower 4 bits (block_y).
 *
 * dest then precomputes the location of the top-left corner of the block the
 * row starts in. In pixel coordinates (where the origin is the top-left),
 * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
 * pixels are reordered within a block, the blocks themselves are stored
 * linearly, so multiplying block_y by the pixel stride of the destination
 * image gives the byte offset of the top-left corner of the block this row
 * is in.
 *
 * On the other hand, the source is linear, so we compute the locations of the
 * start and end of the row in the source by simple linear addressing.
 *
 * For indexing within the tile, we need to XOR with the
 * [y3 y3 y2 y2 y1 y1 y0 y0] value. Since this is constant across a row, we
 * look it up per row and store it in expanded_y.
 *
 * Finally, we iterate each row in source order. In the outer loop, we iterate
 * over each 16-pixel tile. Within each tile, we iterate over the 16 pixels
 * (this should be unrolled), calculating the index within the tile and
 * writing.
 */
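
/* As a concrete sketch (illustrative numbers only, following the macro
 * below): take pixel_t = uint32_t, so shift = 2 and a tile occupies
 * PIXELS_PER_TILE << 2 = 1024 bytes. With sx = 32, dest_start skips
 * (32 >> 4) = 2 tiles, i.e. 2048 bytes. For a row with y = 19,
 * expanded_y = bit_duplication[19 & 0xF] << 2 = bit_duplication[3] << 2 = 60,
 * and each pixel is accessed at byte offset expanded_y ^ (space_4[i] << 2)
 * within its 1024-byte tile. */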

#define TILED_ACCESS_TYPE(pixel_t, shift) \
static ALWAYS_INLINE void \
panfrost_access_tiled_image_##pixel_t \
                              (void *dst, void *src, \
                               uint16_t sx, uint16_t sy, \
                               uint16_t w, uint16_t h, \
                               uint32_t dst_stride, \
                               uint32_t src_stride, \
                               bool is_store) \
{ \
   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      uint8_t *dest = (uint8_t *) (dest_start + ((y >> 4) * dst_stride)); \
      pixel_t *source = src + (src_y * src_stride); \
      pixel_t *source_end = source + w; \
      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
         for (uint8_t i = 0; i < 16; ++i) { \
            unsigned index = expanded_y ^ (space_4[i] << shift); \
            if (is_store) \
               *((pixel_t *) (dest + index)) = *(source++); \
            else \
               *(source++) = *((pixel_t *) (dest + index)); \
         } \
      } \
   } \
}

TILED_ACCESS_TYPE(uint8_t, 0);
TILED_ACCESS_TYPE(uint16_t, 1);
TILED_ACCESS_TYPE(uint32_t, 2);
TILED_ACCESS_TYPE(uint64_t, 3);
TILED_ACCESS_TYPE(pan_uint128_t, 4);

#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
   const unsigned mask = (1 << tile_shift) - 1; \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      unsigned block_start_s = (y >> tile_shift) * dst_stride; \
      unsigned source_start = src_y * src_stride; \
      unsigned expanded_y = bit_duplication[y & mask]; \
      \
      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
         unsigned index = expanded_y ^ space_4[x & mask]; \
         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
         \
         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
         *outp = *inp; \
      } \
   } \
}

#define TILED_UNALIGNED_TYPES(store, shift) { \
   if (bpp == 8) \
      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
   else if (bpp == 16) \
      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
   else if (bpp == 24) \
      TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
   else if (bpp == 32) \
      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
   else if (bpp == 48) \
      TILED_UNALIGNED_TYPE(pan_uint48_t, store, shift) \
   else if (bpp == 64) \
      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
   else if (bpp == 96) \
      TILED_UNALIGNED_TYPE(pan_uint96_t, store, shift) \
   else if (bpp == 128) \
      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
}
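
/* Reader's note (illustrative, mirroring the macros above): the unaligned
 * path copies one texel per iteration as a single pixel_t, so formats whose
 * bpp is not a power of two (24, 48, 96) are handled through the packed
 * helper structs. For bpp == 24, for instance, the dispatch effectively
 * expands to:
 *
 *    pan_uint24_t *outp = ..., *inp = ...;
 *    *outp = *inp;   // copies exactly 3 bytes thanks to the packed struct
 */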

/*
 * Perform a generic access to a tiled image with a given format. This works
 * even for block-compressed images, operating on entire blocks at a time.
 * sx/sy/w/h are specified in pixels, not blocks, but our internal routines
 * work in blocks, so we divide here. Alignment is assumed.
 */
static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                                    unsigned sx, unsigned sy,
                                    unsigned w, unsigned h,
                                    uint32_t dst_stride,
                                    uint32_t src_stride,
                                    const struct util_format_description *desc,
                                    bool _is_store)
{
   unsigned bpp = desc->block.bits;

   /* Convert units from pixels to blocks */
   sx /= desc->block.width;
   sy /= desc->block.height;
   w = DIV_ROUND_UP(w, desc->block.width);
   h = DIV_ROUND_UP(h, desc->block.height);

   if (desc->block.width > 1) {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 2)
      else
         TILED_UNALIGNED_TYPES(false, 2)
   } else {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 4)
      else
         TILED_UNALIGNED_TYPES(false, 4)
   }
}

#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))

static ALWAYS_INLINE void
panfrost_access_tiled_image(void *dst, void *src,
                            unsigned x, unsigned y,
                            unsigned w, unsigned h,
                            uint32_t dst_stride,
                            uint32_t src_stride,
                            enum pipe_format format,
                            bool is_store)
{
   const struct util_format_description *desc = util_format_description(format);
   unsigned bpp = desc->block.bits;

   /* Our optimized routines cannot handle unaligned blocks (without depending
    * on platform-specific behaviour), and there is no good reason to do so.
    * If these assertions fail, there is either a driver bug or a non-portable
    * unit test.
    */
   assert((dst_stride % (bpp / 8)) == 0 && "unaligned destination stride");
   assert((src_stride % (bpp / 8)) == 0 && "unaligned source stride");

   if (desc->block.width > 1 ||
       !util_is_power_of_two_nonzero(desc->block.bits)) {
      panfrost_access_tiled_image_generic(dst, (void *) src,
                                          x, y, w, h,
                                          dst_stride, src_stride, desc, is_store);

      return;
   }

   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

   /* First, tile the top portion */

   unsigned orig_x = x, orig_y = y;

   if (first_full_tile_y != y) {
      unsigned dist = MIN2(first_full_tile_y - y, h);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == h)
         return;

      y += dist;
      h -= dist;
   }

   /* Next, the bottom portion */
   if (last_full_tile_y != (y + h)) {
      unsigned dist = (y + h) - last_full_tile_y;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
                                          x, last_full_tile_y, w, dist,
                                          dst_stride, src_stride, desc, is_store);

      h -= dist;
   }

   /* The left portion */
   if (first_full_tile_x != x) {
      unsigned dist = MIN2(first_full_tile_x - x, w);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
                                          x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      if (dist == w)
         return;

      x += dist;
      w -= dist;
   }

   /* Finally, the right portion */
   if (last_full_tile_x != (x + w)) {
      unsigned dist = (x + w) - last_full_tile_x;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
                                          last_full_tile_x, y, dist, h,
                                          dst_stride, src_stride, desc, is_store);

      w -= dist;
   }
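
   /* What remains is a region aligned to 16x16 tiles on every side, so hand
    * it to the optimized routine for this bpp. */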
   if (bpp == 8)
      panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h,
                                          dst_stride, src_stride, is_store);
   else if (bpp == 16)
      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h,
                                           dst_stride, src_stride, is_store);
   else if (bpp == 32)
      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h,
                                           dst_stride, src_stride, is_store);
   else if (bpp == 64)
      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h,
                                           dst_stride, src_stride, is_store);
   else if (bpp == 128)
      panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h,
                                                dst_stride, src_stride, is_store);
}

/**
 * Access a tiled image (load or store). Note: the region of interest (x, y,
 * w, h) is specified in pixels, not blocks. It is expected that these
 * quantities are aligned to the block size.
 */
void
panfrost_store_tiled_image(void *dst, const void *src,
                           unsigned x, unsigned y,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           enum pipe_format format)
{
   panfrost_access_tiled_image(dst, (void *) src,
                               x, y, w, h,
                               dst_stride, src_stride, format, true);
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          unsigned x, unsigned y,
                          unsigned w, unsigned h,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          enum pipe_format format)
{
   panfrost_access_tiled_image((void *) src, dst,
                               x, y, w, h,
                               src_stride, dst_stride, format, false);
}
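
/* Example usage (illustrative sketch only; the sizes, strides, and buffers
 * are hypothetical). Per the access routines above, the stride of the tiled
 * side is the byte distance between consecutive rows of 16x16 tiles:
 *
 *    // Upload a 64x64 RGBA8 region from a linear staging buffer into a
 *    // tiled image, then read it back.
 *    panfrost_store_tiled_image(tiled, linear, 0, 0, 64, 64,
 *                               tiled_row_stride, 64 * 4,
 *                               PIPE_FORMAT_R8G8B8A8_UNORM);
 *    panfrost_load_tiled_image(linear, tiled, 0, 0, 64, 64,
 *                              64 * 4, tiled_row_stride,
 *                              PIPE_FORMAT_R8G8B8A8_UNORM);
 */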