1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Mesa 3-D graphics library 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Copyright 2012 Intel Corporation 5bf215546Sopenharmony_ci * Copyright 2013 Google 6bf215546Sopenharmony_ci * 7bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 8bf215546Sopenharmony_ci * copy of this software and associated documentation files (the 9bf215546Sopenharmony_ci * "Software"), to deal in the Software without restriction, including 10bf215546Sopenharmony_ci * without limitation the rights to use, copy, modify, merge, publish, 11bf215546Sopenharmony_ci * distribute, sublicense, and/or sell copies of the Software, and to 12bf215546Sopenharmony_ci * permit persons to whom the Software is furnished to do so, subject to 13bf215546Sopenharmony_ci * the following conditions: 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the 16bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions 17bf215546Sopenharmony_ci * of the Software. 18bf215546Sopenharmony_ci * 19bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20bf215546Sopenharmony_ci * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21bf215546Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 22bf215546Sopenharmony_ci * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 23bf215546Sopenharmony_ci * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 24bf215546Sopenharmony_ci * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 25bf215546Sopenharmony_ci * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
26bf215546Sopenharmony_ci * 27bf215546Sopenharmony_ci * Authors: 28bf215546Sopenharmony_ci * Chad Versace <chad.versace@linux.intel.com> 29bf215546Sopenharmony_ci * Frank Henigman <fjhenigman@google.com> 30bf215546Sopenharmony_ci */ 31bf215546Sopenharmony_ci 32bf215546Sopenharmony_ci#include <string.h> 33bf215546Sopenharmony_ci 34bf215546Sopenharmony_ci#include "util/macros.h" 35bf215546Sopenharmony_ci#include "util/u_math.h" 36bf215546Sopenharmony_ci#include "util/rounding.h" 37bf215546Sopenharmony_ci#include "isl_priv.h" 38bf215546Sopenharmony_ci 39bf215546Sopenharmony_ci#if defined(__SSSE3__) 40bf215546Sopenharmony_ci#include <tmmintrin.h> 41bf215546Sopenharmony_ci#elif defined(__SSE2__) 42bf215546Sopenharmony_ci#include <emmintrin.h> 43bf215546Sopenharmony_ci#endif 44bf215546Sopenharmony_ci 45bf215546Sopenharmony_ci#define FILE_DEBUG_FLAG DEBUG_TEXTURE 46bf215546Sopenharmony_ci 47bf215546Sopenharmony_ci#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b) 48bf215546Sopenharmony_ci#define ALIGN_UP(a, b) ALIGN(a, b) 49bf215546Sopenharmony_ci 50bf215546Sopenharmony_ci/* Tile dimensions. Width and span are in bytes, height is in pixels (i.e. 51bf215546Sopenharmony_ci * unitless). A "span" is the most number of bytes we can copy from linear 52bf215546Sopenharmony_ci * to tiled without needing to calculate a new destination address. 
/* Tile dimensions.  Width and span are in bytes, height is in pixels (i.e.
 * unitless).  A "span" is the largest number of bytes we can copy from
 * linear to tiled without needing to calculate a new destination address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;

/**
 * Rotate the 32-bit value 'n' right by 'd' bits.
 *
 * Both shift counts are reduced modulo 32, so d == 0 (or any multiple of 32)
 * returns 'n' unchanged instead of shifting by the full width of the type,
 * which is undefined behaviour in C.
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> (d & 31)) | (n << ((32 - d) & 31));
}

// bswap32 already exists as a macro on some platforms (FreeBSD)
#ifndef bswap32
/**
 * Reverse the byte order of the 32-bit value 'n'.
 */
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * Copies 'bytes' bytes (must be a multiple of 4) from 'src' to 'dst' one
 * 32-bit pixel at a time, exchanging the lowest byte of each 32-bit value
 * with the third one (on a little-endian host: memory bytes 0 and 2).
 *
 * Neither pointer needs any particular alignment: pixels are moved through
 * a local variable with memcpy() rather than by dereferencing a uint32_t
 * pointer, which would be undefined for misaligned addresses.
 *
 * \return 'dst', like memcpy().
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   const uint8_t *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      uint32_t pixel;

      memcpy(&pixel, s, sizeof(pixel));
      pixel = ror(bswap32(pixel), 8);
      memcpy(d, &pixel, sizeof(pixel));

      d += 4;
      s += 4;
      bytes -= 4;
   }
   return dst;
}

#if defined(__SSSE3__)
/* Byte shuffle control for _mm_shuffle_epi8: within each group of four
 * bytes, byte 0 and byte 2 trade places.
 */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/**
 * Swap R and B in the four pixels held in 'pixels'.
 *
 * The table is read with an unaligned load because a uint8_t array carries
 * no 16-byte alignment guarantee.
 */
static inline __m128i
rgba8_swap_rb_128(__m128i pixels)
{
   const __m128i control =
      _mm_loadu_si128((const __m128i *)rgba8_permutation);

   return _mm_shuffle_epi8(pixels, control);
}

#elif defined(__SSE2__)
/**
 * Swap R and B in the four pixels held in 'pixels'.
 *
 * The A/G bytes are masked off and kept in place; the remaining R/B bytes
 * sit one per 16-bit lane, so swapping adjacent 16-bit lanes exchanges them.
 */
static inline __m128i
rgba8_swap_rb_128(__m128i pixels)
{
   const __m128i agmask = _mm_set1_epi32(0xFF00FF00);
   const __m128i rb = _mm_andnot_si128(agmask, pixels);
   const __m128i ag = _mm_and_si128(agmask, pixels);
   const __m128i br =
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                          _MM_SHUFFLE(2, 3, 0, 1));

   return _mm_or_si128(ag, br);
}
#endif

#if defined(__SSSE3__) || defined(__SSE2__)
/**
 * Copy 16 bytes (four pixels) swapping R and B.  'dst' must be 16-byte
 * aligned (aligned store); 'src' may have any alignment (unaligned load).
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128((__m128i *)dst,
                   rgba8_swap_rb_128(_mm_loadu_si128((const __m128i *)src)));
}

/**
 * Copy 16 bytes (four pixels) swapping R and B.  'src' must be 16-byte
 * aligned (aligned load); 'dst' may have any alignment (unaligned store).
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128((__m128i *)dst,
                    rgba8_swap_rb_128(_mm_load_si128((const __m128i *)src)));
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * When SSE2 or SSSE3 is available the data is moved 16 bytes at a time with
 * aligned stores (exactly 64 bytes takes a fully unrolled path); whatever is
 * left over, or everything when no SSE is available, goes through
 * rgba8_copy().
 *
 * \param dst    destination; must be 16-byte aligned unless 'bytes' is 0
 * \param src    source; any alignment
 * \param bytes  number of bytes to copy
 * \return the original 'dst' on every path, like memcpy()
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   /* Walk with char pointers: arithmetic on void pointers is a GNU
    * extension, and 'dst' itself has to survive to be returned.
    */
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(d +  0, s +  0);
      rgba8_copy_16_aligned_dst(d + 16, s + 16);
      rgba8_copy_16_aligned_dst(d + 32, s + 32);
      rgba8_copy_16_aligned_dst(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Remainder shorter than 16 bytes (or the whole copy without SSE). */
   rgba8_copy(d, s, bytes);

   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 *
 * When SSE2 or SSSE3 is available the data is moved 16 bytes at a time with
 * aligned loads (exactly 64 bytes takes a fully unrolled path); whatever is
 * left over, or everything when no SSE is available, goes through
 * rgba8_copy().
 *
 * \param dst    destination; any alignment
 * \param src    source; must be 16-byte aligned unless 'bytes' is 0
 * \param bytes  number of bytes to copy
 * \return the original 'dst' on every path, like memcpy()
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   /* Walk with char pointers: arithmetic on void pointers is a GNU
    * extension, and 'dst' itself has to survive to be returned.
    */
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(d +  0, s +  0);
      rgba8_copy_16_aligned_src(d + 16, s + 16);
      rgba8_copy_16_aligned_src(d + 32, s + 32);
      rgba8_copy_16_aligned_src(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Remainder shorter than 16 bytes (or the whole copy without SSE). */
   rgba8_copy(d, s, bytes);

   return dst;
}

225bf215546Sopenharmony_ci * 'dst' is the start of the tile and 'src' is the corresponding 226bf215546Sopenharmony_ci * address to copy from, though copying begins at (x0, y0). 227bf215546Sopenharmony_ci * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero. 228bf215546Sopenharmony_ci * Swizzling flips bit 6 in the copy destination offset, when certain other 229bf215546Sopenharmony_ci * bits are set in it. 230bf215546Sopenharmony_ci */ 231bf215546Sopenharmony_citypedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 232bf215546Sopenharmony_ci uint32_t y0, uint32_t y1, 233bf215546Sopenharmony_ci char *dst, const char *src, 234bf215546Sopenharmony_ci int32_t linear_pitch, 235bf215546Sopenharmony_ci uint32_t swizzle_bit, 236bf215546Sopenharmony_ci isl_memcpy_type copy_type); 237bf215546Sopenharmony_ci 238bf215546Sopenharmony_ci/** 239bf215546Sopenharmony_ci * Copy texture data from linear to X tile layout. 240bf215546Sopenharmony_ci * 241bf215546Sopenharmony_ci * \copydoc tile_copy_fn 242bf215546Sopenharmony_ci * 243bf215546Sopenharmony_ci * The mem_copy parameters allow the user to specify an alternative mem_copy 244bf215546Sopenharmony_ci * function that, for instance, may do RGBA -> BGRA swizzling. The first 245bf215546Sopenharmony_ci * function must handle any memory alignment while the second function must 246bf215546Sopenharmony_ci * only handle 16-byte alignment in whichever side (source or destination) is 247bf215546Sopenharmony_ci * tiled. 
248bf215546Sopenharmony_ci */ 249bf215546Sopenharmony_cistatic inline void 250bf215546Sopenharmony_cilinear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 251bf215546Sopenharmony_ci uint32_t y0, uint32_t y1, 252bf215546Sopenharmony_ci char *dst, const char *src, 253bf215546Sopenharmony_ci int32_t src_pitch, 254bf215546Sopenharmony_ci uint32_t swizzle_bit, 255bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy, 256bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy_align16) 257bf215546Sopenharmony_ci{ 258bf215546Sopenharmony_ci /* The copy destination offset for each range copied is the sum of 259bf215546Sopenharmony_ci * an X offset 'x0' or 'xo' and a Y offset 'yo.' 260bf215546Sopenharmony_ci */ 261bf215546Sopenharmony_ci uint32_t xo, yo; 262bf215546Sopenharmony_ci 263bf215546Sopenharmony_ci src += (ptrdiff_t)y0 * src_pitch; 264bf215546Sopenharmony_ci 265bf215546Sopenharmony_ci for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) { 266bf215546Sopenharmony_ci /* Bits 9 and 10 of the copy destination offset control swizzling. 267bf215546Sopenharmony_ci * Only 'yo' contributes to those bits in the total offset, 268bf215546Sopenharmony_ci * so calculate 'swizzle' just once per row. 269bf215546Sopenharmony_ci * Move bits 9 and 10 three and four places respectively down 270bf215546Sopenharmony_ci * to bit 6 and xor them. 
271bf215546Sopenharmony_ci */ 272bf215546Sopenharmony_ci uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit; 273bf215546Sopenharmony_ci 274bf215546Sopenharmony_ci mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0); 275bf215546Sopenharmony_ci 276bf215546Sopenharmony_ci for (xo = x1; xo < x2; xo += xtile_span) { 277bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span); 278bf215546Sopenharmony_ci } 279bf215546Sopenharmony_ci 280bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); 281bf215546Sopenharmony_ci 282bf215546Sopenharmony_ci src += src_pitch; 283bf215546Sopenharmony_ci } 284bf215546Sopenharmony_ci} 285bf215546Sopenharmony_ci 286bf215546Sopenharmony_ci/** 287bf215546Sopenharmony_ci * Copy texture data from linear to Y tile layout. 288bf215546Sopenharmony_ci * 289bf215546Sopenharmony_ci * \copydoc tile_copy_fn 290bf215546Sopenharmony_ci */ 291bf215546Sopenharmony_cistatic inline void 292bf215546Sopenharmony_cilinear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 293bf215546Sopenharmony_ci uint32_t y0, uint32_t y3, 294bf215546Sopenharmony_ci char *dst, const char *src, 295bf215546Sopenharmony_ci int32_t src_pitch, 296bf215546Sopenharmony_ci uint32_t swizzle_bit, 297bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy, 298bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy_align16) 299bf215546Sopenharmony_ci{ 300bf215546Sopenharmony_ci /* Y tiles consist of columns that are 'ytile_span' wide (and the same height 301bf215546Sopenharmony_ci * as the tile). 
Thus the destination offset for (x,y) is the sum of: 302bf215546Sopenharmony_ci * (x % column_width) // position within column 303bf215546Sopenharmony_ci * (x / column_width) * bytes_per_column // column number * bytes per column 304bf215546Sopenharmony_ci * y * column_width 305bf215546Sopenharmony_ci * 306bf215546Sopenharmony_ci * The copy destination offset for each range copied is the sum of 307bf215546Sopenharmony_ci * an X offset 'xo0' or 'xo' and a Y offset 'yo.' 308bf215546Sopenharmony_ci */ 309bf215546Sopenharmony_ci const uint32_t column_width = ytile_span; 310bf215546Sopenharmony_ci const uint32_t bytes_per_column = column_width * ytile_height; 311bf215546Sopenharmony_ci 312bf215546Sopenharmony_ci uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4)); 313bf215546Sopenharmony_ci uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4)); 314bf215546Sopenharmony_ci 315bf215546Sopenharmony_ci uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column; 316bf215546Sopenharmony_ci uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column; 317bf215546Sopenharmony_ci 318bf215546Sopenharmony_ci /* Bit 9 of the destination offset control swizzling. 319bf215546Sopenharmony_ci * Only the X offset contributes to bit 9 of the total offset, 320bf215546Sopenharmony_ci * so swizzle can be calculated in advance for these X positions. 321bf215546Sopenharmony_ci * Move bit 9 three places down to bit 6. 
322bf215546Sopenharmony_ci */ 323bf215546Sopenharmony_ci uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit; 324bf215546Sopenharmony_ci uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit; 325bf215546Sopenharmony_ci 326bf215546Sopenharmony_ci uint32_t x, yo; 327bf215546Sopenharmony_ci 328bf215546Sopenharmony_ci src += (ptrdiff_t)y0 * src_pitch; 329bf215546Sopenharmony_ci 330bf215546Sopenharmony_ci if (y0 != y1) { 331bf215546Sopenharmony_ci for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) { 332bf215546Sopenharmony_ci uint32_t xo = xo1; 333bf215546Sopenharmony_ci uint32_t swizzle = swizzle1; 334bf215546Sopenharmony_ci 335bf215546Sopenharmony_ci mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0); 336bf215546Sopenharmony_ci 337bf215546Sopenharmony_ci /* Step by spans/columns. As it happens, the swizzle bit flips 338bf215546Sopenharmony_ci * at each step so we don't need to calculate it explicitly. 339bf215546Sopenharmony_ci */ 340bf215546Sopenharmony_ci for (x = x1; x < x2; x += ytile_span) { 341bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span); 342bf215546Sopenharmony_ci xo += bytes_per_column; 343bf215546Sopenharmony_ci swizzle ^= swizzle_bit; 344bf215546Sopenharmony_ci } 345bf215546Sopenharmony_ci 346bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); 347bf215546Sopenharmony_ci 348bf215546Sopenharmony_ci src += src_pitch; 349bf215546Sopenharmony_ci } 350bf215546Sopenharmony_ci } 351bf215546Sopenharmony_ci 352bf215546Sopenharmony_ci for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) { 353bf215546Sopenharmony_ci uint32_t xo = xo1; 354bf215546Sopenharmony_ci uint32_t swizzle = swizzle1; 355bf215546Sopenharmony_ci 356bf215546Sopenharmony_ci if (x0 != x1) { 357bf215546Sopenharmony_ci mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0); 358bf215546Sopenharmony_ci mem_copy(dst + ((xo0 + yo + 1 * column_width) 
^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0); 359bf215546Sopenharmony_ci mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0); 360bf215546Sopenharmony_ci mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0); 361bf215546Sopenharmony_ci } 362bf215546Sopenharmony_ci 363bf215546Sopenharmony_ci /* Step by spans/columns. As it happens, the swizzle bit flips 364bf215546Sopenharmony_ci * at each step so we don't need to calculate it explicitly. 365bf215546Sopenharmony_ci */ 366bf215546Sopenharmony_ci for (x = x1; x < x2; x += ytile_span) { 367bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span); 368bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span); 369bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span); 370bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span); 371bf215546Sopenharmony_ci xo += bytes_per_column; 372bf215546Sopenharmony_ci swizzle ^= swizzle_bit; 373bf215546Sopenharmony_ci } 374bf215546Sopenharmony_ci 375bf215546Sopenharmony_ci if (x2 != x3) { 376bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2); 377bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2); 378bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2); 379bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2); 380bf215546Sopenharmony_ci } 381bf215546Sopenharmony_ci 382bf215546Sopenharmony_ci src += 4 * src_pitch; 383bf215546Sopenharmony_ci } 
384bf215546Sopenharmony_ci 385bf215546Sopenharmony_ci if (y2 != y3) { 386bf215546Sopenharmony_ci for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) { 387bf215546Sopenharmony_ci uint32_t xo = xo1; 388bf215546Sopenharmony_ci uint32_t swizzle = swizzle1; 389bf215546Sopenharmony_ci 390bf215546Sopenharmony_ci mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0); 391bf215546Sopenharmony_ci 392bf215546Sopenharmony_ci /* Step by spans/columns. As it happens, the swizzle bit flips 393bf215546Sopenharmony_ci * at each step so we don't need to calculate it explicitly. 394bf215546Sopenharmony_ci */ 395bf215546Sopenharmony_ci for (x = x1; x < x2; x += ytile_span) { 396bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span); 397bf215546Sopenharmony_ci xo += bytes_per_column; 398bf215546Sopenharmony_ci swizzle ^= swizzle_bit; 399bf215546Sopenharmony_ci } 400bf215546Sopenharmony_ci 401bf215546Sopenharmony_ci mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); 402bf215546Sopenharmony_ci 403bf215546Sopenharmony_ci src += src_pitch; 404bf215546Sopenharmony_ci } 405bf215546Sopenharmony_ci } 406bf215546Sopenharmony_ci} 407bf215546Sopenharmony_ci 408bf215546Sopenharmony_ci/** 409bf215546Sopenharmony_ci * Copy texture data from X tile layout to linear. 
410bf215546Sopenharmony_ci * 411bf215546Sopenharmony_ci * \copydoc tile_copy_fn 412bf215546Sopenharmony_ci */ 413bf215546Sopenharmony_cistatic inline void 414bf215546Sopenharmony_cixtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 415bf215546Sopenharmony_ci uint32_t y0, uint32_t y1, 416bf215546Sopenharmony_ci char *dst, const char *src, 417bf215546Sopenharmony_ci int32_t dst_pitch, 418bf215546Sopenharmony_ci uint32_t swizzle_bit, 419bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy, 420bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy_align16) 421bf215546Sopenharmony_ci{ 422bf215546Sopenharmony_ci /* The copy destination offset for each range copied is the sum of 423bf215546Sopenharmony_ci * an X offset 'x0' or 'xo' and a Y offset 'yo.' 424bf215546Sopenharmony_ci */ 425bf215546Sopenharmony_ci uint32_t xo, yo; 426bf215546Sopenharmony_ci 427bf215546Sopenharmony_ci dst += (ptrdiff_t)y0 * dst_pitch; 428bf215546Sopenharmony_ci 429bf215546Sopenharmony_ci for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) { 430bf215546Sopenharmony_ci /* Bits 9 and 10 of the copy destination offset control swizzling. 431bf215546Sopenharmony_ci * Only 'yo' contributes to those bits in the total offset, 432bf215546Sopenharmony_ci * so calculate 'swizzle' just once per row. 433bf215546Sopenharmony_ci * Move bits 9 and 10 three and four places respectively down 434bf215546Sopenharmony_ci * to bit 6 and xor them. 
435bf215546Sopenharmony_ci */ 436bf215546Sopenharmony_ci uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit; 437bf215546Sopenharmony_ci 438bf215546Sopenharmony_ci mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0); 439bf215546Sopenharmony_ci 440bf215546Sopenharmony_ci for (xo = x1; xo < x2; xo += xtile_span) { 441bf215546Sopenharmony_ci mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span); 442bf215546Sopenharmony_ci } 443bf215546Sopenharmony_ci 444bf215546Sopenharmony_ci mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); 445bf215546Sopenharmony_ci 446bf215546Sopenharmony_ci dst += dst_pitch; 447bf215546Sopenharmony_ci } 448bf215546Sopenharmony_ci} 449bf215546Sopenharmony_ci 450bf215546Sopenharmony_ci /** 451bf215546Sopenharmony_ci * Copy texture data from Y tile layout to linear. 452bf215546Sopenharmony_ci * 453bf215546Sopenharmony_ci * \copydoc tile_copy_fn 454bf215546Sopenharmony_ci */ 455bf215546Sopenharmony_cistatic inline void 456bf215546Sopenharmony_ciytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 457bf215546Sopenharmony_ci uint32_t y0, uint32_t y3, 458bf215546Sopenharmony_ci char *dst, const char *src, 459bf215546Sopenharmony_ci int32_t dst_pitch, 460bf215546Sopenharmony_ci uint32_t swizzle_bit, 461bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy, 462bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy_align16) 463bf215546Sopenharmony_ci{ 464bf215546Sopenharmony_ci /* Y tiles consist of columns that are 'ytile_span' wide (and the same height 465bf215546Sopenharmony_ci * as the tile). 
Thus the destination offset for (x,y) is the sum of: 466bf215546Sopenharmony_ci * (x % column_width) // position within column 467bf215546Sopenharmony_ci * (x / column_width) * bytes_per_column // column number * bytes per column 468bf215546Sopenharmony_ci * y * column_width 469bf215546Sopenharmony_ci * 470bf215546Sopenharmony_ci * The copy destination offset for each range copied is the sum of 471bf215546Sopenharmony_ci * an X offset 'xo0' or 'xo' and a Y offset 'yo.' 472bf215546Sopenharmony_ci */ 473bf215546Sopenharmony_ci const uint32_t column_width = ytile_span; 474bf215546Sopenharmony_ci const uint32_t bytes_per_column = column_width * ytile_height; 475bf215546Sopenharmony_ci 476bf215546Sopenharmony_ci uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4)); 477bf215546Sopenharmony_ci uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4)); 478bf215546Sopenharmony_ci 479bf215546Sopenharmony_ci uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column; 480bf215546Sopenharmony_ci uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column; 481bf215546Sopenharmony_ci 482bf215546Sopenharmony_ci /* Bit 9 of the destination offset control swizzling. 483bf215546Sopenharmony_ci * Only the X offset contributes to bit 9 of the total offset, 484bf215546Sopenharmony_ci * so swizzle can be calculated in advance for these X positions. 485bf215546Sopenharmony_ci * Move bit 9 three places down to bit 6. 
486bf215546Sopenharmony_ci */ 487bf215546Sopenharmony_ci uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit; 488bf215546Sopenharmony_ci uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit; 489bf215546Sopenharmony_ci 490bf215546Sopenharmony_ci uint32_t x, yo; 491bf215546Sopenharmony_ci 492bf215546Sopenharmony_ci dst += (ptrdiff_t)y0 * dst_pitch; 493bf215546Sopenharmony_ci 494bf215546Sopenharmony_ci if (y0 != y1) { 495bf215546Sopenharmony_ci for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) { 496bf215546Sopenharmony_ci uint32_t xo = xo1; 497bf215546Sopenharmony_ci uint32_t swizzle = swizzle1; 498bf215546Sopenharmony_ci 499bf215546Sopenharmony_ci mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0); 500bf215546Sopenharmony_ci 501bf215546Sopenharmony_ci /* Step by spans/columns. As it happens, the swizzle bit flips 502bf215546Sopenharmony_ci * at each step so we don't need to calculate it explicitly. 503bf215546Sopenharmony_ci */ 504bf215546Sopenharmony_ci for (x = x1; x < x2; x += ytile_span) { 505bf215546Sopenharmony_ci mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span); 506bf215546Sopenharmony_ci xo += bytes_per_column; 507bf215546Sopenharmony_ci swizzle ^= swizzle_bit; 508bf215546Sopenharmony_ci } 509bf215546Sopenharmony_ci 510bf215546Sopenharmony_ci mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); 511bf215546Sopenharmony_ci 512bf215546Sopenharmony_ci dst += dst_pitch; 513bf215546Sopenharmony_ci } 514bf215546Sopenharmony_ci } 515bf215546Sopenharmony_ci 516bf215546Sopenharmony_ci for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) { 517bf215546Sopenharmony_ci uint32_t xo = xo1; 518bf215546Sopenharmony_ci uint32_t swizzle = swizzle1; 519bf215546Sopenharmony_ci 520bf215546Sopenharmony_ci if (x0 != x1) { 521bf215546Sopenharmony_ci mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0); 522bf215546Sopenharmony_ci mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 
+ yo + 1 * column_width) ^ swizzle0), x1 - x0); 523bf215546Sopenharmony_ci mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0); 524bf215546Sopenharmony_ci mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0); 525bf215546Sopenharmony_ci } 526bf215546Sopenharmony_ci 527bf215546Sopenharmony_ci /* Step by spans/columns. As it happens, the swizzle bit flips 528bf215546Sopenharmony_ci * at each step so we don't need to calculate it explicitly. 529bf215546Sopenharmony_ci */ 530bf215546Sopenharmony_ci for (x = x1; x < x2; x += ytile_span) { 531bf215546Sopenharmony_ci mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span); 532bf215546Sopenharmony_ci mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span); 533bf215546Sopenharmony_ci mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span); 534bf215546Sopenharmony_ci mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span); 535bf215546Sopenharmony_ci xo += bytes_per_column; 536bf215546Sopenharmony_ci swizzle ^= swizzle_bit; 537bf215546Sopenharmony_ci } 538bf215546Sopenharmony_ci 539bf215546Sopenharmony_ci if (x2 != x3) { 540bf215546Sopenharmony_ci mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2); 541bf215546Sopenharmony_ci mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2); 542bf215546Sopenharmony_ci mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2); 543bf215546Sopenharmony_ci mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2); 544bf215546Sopenharmony_ci } 545bf215546Sopenharmony_ci 546bf215546Sopenharmony_ci dst += 4 * dst_pitch; 547bf215546Sopenharmony_ci } 
548bf215546Sopenharmony_ci 549bf215546Sopenharmony_ci if (y2 != y3) { 550bf215546Sopenharmony_ci for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) { 551bf215546Sopenharmony_ci uint32_t xo = xo1; 552bf215546Sopenharmony_ci uint32_t swizzle = swizzle1; 553bf215546Sopenharmony_ci 554bf215546Sopenharmony_ci mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0); 555bf215546Sopenharmony_ci 556bf215546Sopenharmony_ci /* Step by spans/columns. As it happens, the swizzle bit flips 557bf215546Sopenharmony_ci * at each step so we don't need to calculate it explicitly. 558bf215546Sopenharmony_ci */ 559bf215546Sopenharmony_ci for (x = x1; x < x2; x += ytile_span) { 560bf215546Sopenharmony_ci mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span); 561bf215546Sopenharmony_ci xo += bytes_per_column; 562bf215546Sopenharmony_ci swizzle ^= swizzle_bit; 563bf215546Sopenharmony_ci } 564bf215546Sopenharmony_ci 565bf215546Sopenharmony_ci mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); 566bf215546Sopenharmony_ci 567bf215546Sopenharmony_ci dst += dst_pitch; 568bf215546Sopenharmony_ci } 569bf215546Sopenharmony_ci } 570bf215546Sopenharmony_ci} 571bf215546Sopenharmony_ci 572bf215546Sopenharmony_ci#if defined(INLINE_SSE41) 573bf215546Sopenharmony_cistatic ALWAYS_INLINE void * 574bf215546Sopenharmony_ci_memcpy_streaming_load(void *dest, const void *src, size_t count) 575bf215546Sopenharmony_ci{ 576bf215546Sopenharmony_ci if (count == 16) { 577bf215546Sopenharmony_ci __m128i val = _mm_stream_load_si128((__m128i *)src); 578bf215546Sopenharmony_ci _mm_storeu_si128((__m128i *)dest, val); 579bf215546Sopenharmony_ci return dest; 580bf215546Sopenharmony_ci } else if (count == 64) { 581bf215546Sopenharmony_ci __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0); 582bf215546Sopenharmony_ci __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1); 583bf215546Sopenharmony_ci __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2); 
584bf215546Sopenharmony_ci __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3); 585bf215546Sopenharmony_ci _mm_storeu_si128(((__m128i *)dest) + 0, val0); 586bf215546Sopenharmony_ci _mm_storeu_si128(((__m128i *)dest) + 1, val1); 587bf215546Sopenharmony_ci _mm_storeu_si128(((__m128i *)dest) + 2, val2); 588bf215546Sopenharmony_ci _mm_storeu_si128(((__m128i *)dest) + 3, val3); 589bf215546Sopenharmony_ci return dest; 590bf215546Sopenharmony_ci } else { 591bf215546Sopenharmony_ci assert(count < 64); /* and (count < 16) for ytiled */ 592bf215546Sopenharmony_ci return memcpy(dest, src, count); 593bf215546Sopenharmony_ci } 594bf215546Sopenharmony_ci} 595bf215546Sopenharmony_ci#endif 596bf215546Sopenharmony_ci 597bf215546Sopenharmony_cistatic isl_mem_copy_fn 598bf215546Sopenharmony_cichoose_copy_function(isl_memcpy_type copy_type) 599bf215546Sopenharmony_ci{ 600bf215546Sopenharmony_ci switch(copy_type) { 601bf215546Sopenharmony_ci case ISL_MEMCPY: 602bf215546Sopenharmony_ci return memcpy; 603bf215546Sopenharmony_ci case ISL_MEMCPY_BGRA8: 604bf215546Sopenharmony_ci return rgba8_copy; 605bf215546Sopenharmony_ci case ISL_MEMCPY_STREAMING_LOAD: 606bf215546Sopenharmony_ci#if defined(INLINE_SSE41) 607bf215546Sopenharmony_ci return _memcpy_streaming_load; 608bf215546Sopenharmony_ci#else 609bf215546Sopenharmony_ci unreachable("ISL_MEMCOPY_STREAMING_LOAD requires sse4.1"); 610bf215546Sopenharmony_ci#endif 611bf215546Sopenharmony_ci case ISL_MEMCPY_INVALID: 612bf215546Sopenharmony_ci unreachable("invalid copy_type"); 613bf215546Sopenharmony_ci } 614bf215546Sopenharmony_ci unreachable("unhandled copy_type"); 615bf215546Sopenharmony_ci return NULL; 616bf215546Sopenharmony_ci} 617bf215546Sopenharmony_ci 618bf215546Sopenharmony_ci/** 619bf215546Sopenharmony_ci * Copy texture data from linear to X tile layout, faster. 
620bf215546Sopenharmony_ci * 621bf215546Sopenharmony_ci * Same as \ref linear_to_xtiled but faster, because it passes constant 622bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code 623bf215546Sopenharmony_ci * optimized for those cases. 624bf215546Sopenharmony_ci * 625bf215546Sopenharmony_ci * \copydoc tile_copy_fn 626bf215546Sopenharmony_ci */ 627bf215546Sopenharmony_cistatic FLATTEN void 628bf215546Sopenharmony_cilinear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 629bf215546Sopenharmony_ci uint32_t y0, uint32_t y1, 630bf215546Sopenharmony_ci char *dst, const char *src, 631bf215546Sopenharmony_ci int32_t src_pitch, 632bf215546Sopenharmony_ci uint32_t swizzle_bit, 633bf215546Sopenharmony_ci isl_memcpy_type copy_type) 634bf215546Sopenharmony_ci{ 635bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); 636bf215546Sopenharmony_ci 637bf215546Sopenharmony_ci if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) { 638bf215546Sopenharmony_ci if (mem_copy == memcpy) 639bf215546Sopenharmony_ci return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height, 640bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, memcpy, memcpy); 641bf215546Sopenharmony_ci else if (mem_copy == rgba8_copy) 642bf215546Sopenharmony_ci return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height, 643bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, 644bf215546Sopenharmony_ci rgba8_copy, rgba8_copy_aligned_dst); 645bf215546Sopenharmony_ci else 646bf215546Sopenharmony_ci unreachable("not reached"); 647bf215546Sopenharmony_ci } else { 648bf215546Sopenharmony_ci if (mem_copy == memcpy) 649bf215546Sopenharmony_ci return linear_to_xtiled(x0, x1, x2, x3, y0, y1, 650bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, 651bf215546Sopenharmony_ci memcpy, memcpy); 652bf215546Sopenharmony_ci else if (mem_copy == rgba8_copy) 653bf215546Sopenharmony_ci return 
linear_to_xtiled(x0, x1, x2, x3, y0, y1, 654bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, 655bf215546Sopenharmony_ci rgba8_copy, rgba8_copy_aligned_dst); 656bf215546Sopenharmony_ci else 657bf215546Sopenharmony_ci unreachable("not reached"); 658bf215546Sopenharmony_ci } 659bf215546Sopenharmony_ci linear_to_xtiled(x0, x1, x2, x3, y0, y1, 660bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); 661bf215546Sopenharmony_ci} 662bf215546Sopenharmony_ci 663bf215546Sopenharmony_ci/** 664bf215546Sopenharmony_ci * Copy texture data from linear to Y tile layout, faster. 665bf215546Sopenharmony_ci * 666bf215546Sopenharmony_ci * Same as \ref linear_to_ytiled but faster, because it passes constant 667bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code 668bf215546Sopenharmony_ci * optimized for those cases. 669bf215546Sopenharmony_ci * 670bf215546Sopenharmony_ci * \copydoc tile_copy_fn 671bf215546Sopenharmony_ci */ 672bf215546Sopenharmony_cistatic FLATTEN void 673bf215546Sopenharmony_cilinear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 674bf215546Sopenharmony_ci uint32_t y0, uint32_t y1, 675bf215546Sopenharmony_ci char *dst, const char *src, 676bf215546Sopenharmony_ci int32_t src_pitch, 677bf215546Sopenharmony_ci uint32_t swizzle_bit, 678bf215546Sopenharmony_ci isl_memcpy_type copy_type) 679bf215546Sopenharmony_ci{ 680bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); 681bf215546Sopenharmony_ci 682bf215546Sopenharmony_ci if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { 683bf215546Sopenharmony_ci if (mem_copy == memcpy) 684bf215546Sopenharmony_ci return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, 685bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, memcpy, memcpy); 686bf215546Sopenharmony_ci else if (mem_copy == rgba8_copy) 687bf215546Sopenharmony_ci return linear_to_ytiled(0, 0, ytile_width, 
ytile_width, 0, ytile_height, 688bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, 689bf215546Sopenharmony_ci rgba8_copy, rgba8_copy_aligned_dst); 690bf215546Sopenharmony_ci else 691bf215546Sopenharmony_ci unreachable("not reached"); 692bf215546Sopenharmony_ci } else { 693bf215546Sopenharmony_ci if (mem_copy == memcpy) 694bf215546Sopenharmony_ci return linear_to_ytiled(x0, x1, x2, x3, y0, y1, 695bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, memcpy, memcpy); 696bf215546Sopenharmony_ci else if (mem_copy == rgba8_copy) 697bf215546Sopenharmony_ci return linear_to_ytiled(x0, x1, x2, x3, y0, y1, 698bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, 699bf215546Sopenharmony_ci rgba8_copy, rgba8_copy_aligned_dst); 700bf215546Sopenharmony_ci else 701bf215546Sopenharmony_ci unreachable("not reached"); 702bf215546Sopenharmony_ci } 703bf215546Sopenharmony_ci linear_to_ytiled(x0, x1, x2, x3, y0, y1, 704bf215546Sopenharmony_ci dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); 705bf215546Sopenharmony_ci} 706bf215546Sopenharmony_ci 707bf215546Sopenharmony_ci/** 708bf215546Sopenharmony_ci * Copy texture data from X tile layout to linear, faster. 709bf215546Sopenharmony_ci * 710bf215546Sopenharmony_ci * Same as \ref xtile_to_linear but faster, because it passes constant 711bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code 712bf215546Sopenharmony_ci * optimized for those cases. 
713bf215546Sopenharmony_ci * 714bf215546Sopenharmony_ci * \copydoc tile_copy_fn 715bf215546Sopenharmony_ci */ 716bf215546Sopenharmony_cistatic FLATTEN void 717bf215546Sopenharmony_cixtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 718bf215546Sopenharmony_ci uint32_t y0, uint32_t y1, 719bf215546Sopenharmony_ci char *dst, const char *src, 720bf215546Sopenharmony_ci int32_t dst_pitch, 721bf215546Sopenharmony_ci uint32_t swizzle_bit, 722bf215546Sopenharmony_ci isl_memcpy_type copy_type) 723bf215546Sopenharmony_ci{ 724bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); 725bf215546Sopenharmony_ci 726bf215546Sopenharmony_ci if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) { 727bf215546Sopenharmony_ci if (mem_copy == memcpy) 728bf215546Sopenharmony_ci return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, 729bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); 730bf215546Sopenharmony_ci else if (mem_copy == rgba8_copy) 731bf215546Sopenharmony_ci return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, 732bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, 733bf215546Sopenharmony_ci rgba8_copy, rgba8_copy_aligned_src); 734bf215546Sopenharmony_ci#if defined(INLINE_SSE41) 735bf215546Sopenharmony_ci else if (mem_copy == _memcpy_streaming_load) 736bf215546Sopenharmony_ci return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, 737bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, 738bf215546Sopenharmony_ci memcpy, _memcpy_streaming_load); 739bf215546Sopenharmony_ci#endif 740bf215546Sopenharmony_ci else 741bf215546Sopenharmony_ci unreachable("not reached"); 742bf215546Sopenharmony_ci } else { 743bf215546Sopenharmony_ci if (mem_copy == memcpy) 744bf215546Sopenharmony_ci return xtiled_to_linear(x0, x1, x2, x3, y0, y1, 745bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); 746bf215546Sopenharmony_ci 
else if (mem_copy == rgba8_copy) 747bf215546Sopenharmony_ci return xtiled_to_linear(x0, x1, x2, x3, y0, y1, 748bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, 749bf215546Sopenharmony_ci rgba8_copy, rgba8_copy_aligned_src); 750bf215546Sopenharmony_ci#if defined(INLINE_SSE41) 751bf215546Sopenharmony_ci else if (mem_copy == _memcpy_streaming_load) 752bf215546Sopenharmony_ci return xtiled_to_linear(x0, x1, x2, x3, y0, y1, 753bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, 754bf215546Sopenharmony_ci memcpy, _memcpy_streaming_load); 755bf215546Sopenharmony_ci#endif 756bf215546Sopenharmony_ci else 757bf215546Sopenharmony_ci unreachable("not reached"); 758bf215546Sopenharmony_ci } 759bf215546Sopenharmony_ci xtiled_to_linear(x0, x1, x2, x3, y0, y1, 760bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy); 761bf215546Sopenharmony_ci} 762bf215546Sopenharmony_ci 763bf215546Sopenharmony_ci/** 764bf215546Sopenharmony_ci * Copy texture data from Y tile layout to linear, faster. 765bf215546Sopenharmony_ci * 766bf215546Sopenharmony_ci * Same as \ref ytile_to_linear but faster, because it passes constant 767bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code 768bf215546Sopenharmony_ci * optimized for those cases. 
769bf215546Sopenharmony_ci * 770bf215546Sopenharmony_ci * \copydoc tile_copy_fn 771bf215546Sopenharmony_ci */ 772bf215546Sopenharmony_cistatic FLATTEN void 773bf215546Sopenharmony_ciytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 774bf215546Sopenharmony_ci uint32_t y0, uint32_t y1, 775bf215546Sopenharmony_ci char *dst, const char *src, 776bf215546Sopenharmony_ci int32_t dst_pitch, 777bf215546Sopenharmony_ci uint32_t swizzle_bit, 778bf215546Sopenharmony_ci isl_memcpy_type copy_type) 779bf215546Sopenharmony_ci{ 780bf215546Sopenharmony_ci isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); 781bf215546Sopenharmony_ci 782bf215546Sopenharmony_ci if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { 783bf215546Sopenharmony_ci if (mem_copy == memcpy) 784bf215546Sopenharmony_ci return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, 785bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); 786bf215546Sopenharmony_ci else if (mem_copy == rgba8_copy) 787bf215546Sopenharmony_ci return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, 788bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, 789bf215546Sopenharmony_ci rgba8_copy, rgba8_copy_aligned_src); 790bf215546Sopenharmony_ci#if defined(INLINE_SSE41) 791bf215546Sopenharmony_ci else if (copy_type == ISL_MEMCPY_STREAMING_LOAD) 792bf215546Sopenharmony_ci return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, 793bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, 794bf215546Sopenharmony_ci memcpy, _memcpy_streaming_load); 795bf215546Sopenharmony_ci#endif 796bf215546Sopenharmony_ci else 797bf215546Sopenharmony_ci unreachable("not reached"); 798bf215546Sopenharmony_ci } else { 799bf215546Sopenharmony_ci if (mem_copy == memcpy) 800bf215546Sopenharmony_ci return ytiled_to_linear(x0, x1, x2, x3, y0, y1, 801bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); 
802bf215546Sopenharmony_ci else if (mem_copy == rgba8_copy) 803bf215546Sopenharmony_ci return ytiled_to_linear(x0, x1, x2, x3, y0, y1, 804bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, 805bf215546Sopenharmony_ci rgba8_copy, rgba8_copy_aligned_src); 806bf215546Sopenharmony_ci#if defined(INLINE_SSE41) 807bf215546Sopenharmony_ci else if (copy_type == ISL_MEMCPY_STREAMING_LOAD) 808bf215546Sopenharmony_ci return ytiled_to_linear(x0, x1, x2, x3, y0, y1, 809bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, 810bf215546Sopenharmony_ci memcpy, _memcpy_streaming_load); 811bf215546Sopenharmony_ci#endif 812bf215546Sopenharmony_ci else 813bf215546Sopenharmony_ci unreachable("not reached"); 814bf215546Sopenharmony_ci } 815bf215546Sopenharmony_ci ytiled_to_linear(x0, x1, x2, x3, y0, y1, 816bf215546Sopenharmony_ci dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy); 817bf215546Sopenharmony_ci} 818bf215546Sopenharmony_ci 819bf215546Sopenharmony_ci/** 820bf215546Sopenharmony_ci * Copy from linear to tiled texture. 821bf215546Sopenharmony_ci * 822bf215546Sopenharmony_ci * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into 823bf215546Sopenharmony_ci * pieces that do not cross tile boundaries and copy each piece with a tile 824bf215546Sopenharmony_ci * copy function (\ref tile_copy_fn). 825bf215546Sopenharmony_ci * The X range is in bytes, i.e. pixels * bytes-per-pixel. 826bf215546Sopenharmony_ci * The Y range is in pixels (i.e. unitless). 827bf215546Sopenharmony_ci * 'dst' is the address of (0, 0) in the destination tiled texture. 828bf215546Sopenharmony_ci * 'src' is the address of (xt1, yt1) in the source linear texture. 
829bf215546Sopenharmony_ci */ 830bf215546Sopenharmony_cistatic void 831bf215546Sopenharmony_cilinear_to_tiled(uint32_t xt1, uint32_t xt2, 832bf215546Sopenharmony_ci uint32_t yt1, uint32_t yt2, 833bf215546Sopenharmony_ci char *dst, const char *src, 834bf215546Sopenharmony_ci uint32_t dst_pitch, int32_t src_pitch, 835bf215546Sopenharmony_ci bool has_swizzling, 836bf215546Sopenharmony_ci enum isl_tiling tiling, 837bf215546Sopenharmony_ci isl_memcpy_type copy_type) 838bf215546Sopenharmony_ci{ 839bf215546Sopenharmony_ci tile_copy_fn tile_copy; 840bf215546Sopenharmony_ci uint32_t xt0, xt3; 841bf215546Sopenharmony_ci uint32_t yt0, yt3; 842bf215546Sopenharmony_ci uint32_t xt, yt; 843bf215546Sopenharmony_ci uint32_t tw, th, span; 844bf215546Sopenharmony_ci uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0; 845bf215546Sopenharmony_ci 846bf215546Sopenharmony_ci if (tiling == ISL_TILING_X) { 847bf215546Sopenharmony_ci tw = xtile_width; 848bf215546Sopenharmony_ci th = xtile_height; 849bf215546Sopenharmony_ci span = xtile_span; 850bf215546Sopenharmony_ci tile_copy = linear_to_xtiled_faster; 851bf215546Sopenharmony_ci } else if (tiling == ISL_TILING_Y0) { 852bf215546Sopenharmony_ci tw = ytile_width; 853bf215546Sopenharmony_ci th = ytile_height; 854bf215546Sopenharmony_ci span = ytile_span; 855bf215546Sopenharmony_ci tile_copy = linear_to_ytiled_faster; 856bf215546Sopenharmony_ci } else { 857bf215546Sopenharmony_ci unreachable("unsupported tiling"); 858bf215546Sopenharmony_ci } 859bf215546Sopenharmony_ci 860bf215546Sopenharmony_ci /* Round out to tile boundaries. */ 861bf215546Sopenharmony_ci xt0 = ALIGN_DOWN(xt1, tw); 862bf215546Sopenharmony_ci xt3 = ALIGN_UP (xt2, tw); 863bf215546Sopenharmony_ci yt0 = ALIGN_DOWN(yt1, th); 864bf215546Sopenharmony_ci yt3 = ALIGN_UP (yt2, th); 865bf215546Sopenharmony_ci 866bf215546Sopenharmony_ci /* Loop over all tiles to which we have something to copy. 
867bf215546Sopenharmony_ci * 'xt' and 'yt' are the origin of the destination tile, whether copying 868bf215546Sopenharmony_ci * copying a full or partial tile. 869bf215546Sopenharmony_ci * tile_copy() copies one tile or partial tile. 870bf215546Sopenharmony_ci * Looping x inside y is the faster memory access pattern. 871bf215546Sopenharmony_ci */ 872bf215546Sopenharmony_ci for (yt = yt0; yt < yt3; yt += th) { 873bf215546Sopenharmony_ci for (xt = xt0; xt < xt3; xt += tw) { 874bf215546Sopenharmony_ci /* The area to update is [x0,x3) x [y0,y1). 875bf215546Sopenharmony_ci * May not want the whole tile, hence the min and max. 876bf215546Sopenharmony_ci */ 877bf215546Sopenharmony_ci uint32_t x0 = MAX2(xt1, xt); 878bf215546Sopenharmony_ci uint32_t y0 = MAX2(yt1, yt); 879bf215546Sopenharmony_ci uint32_t x3 = MIN2(xt2, xt + tw); 880bf215546Sopenharmony_ci uint32_t y1 = MIN2(yt2, yt + th); 881bf215546Sopenharmony_ci 882bf215546Sopenharmony_ci /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that 883bf215546Sopenharmony_ci * the middle interval is the longest span-aligned part. 884bf215546Sopenharmony_ci * The sub-ranges could be empty. 885bf215546Sopenharmony_ci */ 886bf215546Sopenharmony_ci uint32_t x1, x2; 887bf215546Sopenharmony_ci x1 = ALIGN_UP(x0, span); 888bf215546Sopenharmony_ci if (x1 > x3) 889bf215546Sopenharmony_ci x1 = x2 = x3; 890bf215546Sopenharmony_ci else 891bf215546Sopenharmony_ci x2 = ALIGN_DOWN(x3, span); 892bf215546Sopenharmony_ci 893bf215546Sopenharmony_ci assert(x0 <= x1 && x1 <= x2 && x2 <= x3); 894bf215546Sopenharmony_ci assert(x1 - x0 < span && x3 - x2 < span); 895bf215546Sopenharmony_ci assert(x3 - x0 <= tw); 896bf215546Sopenharmony_ci assert((x2 - x1) % span == 0); 897bf215546Sopenharmony_ci 898bf215546Sopenharmony_ci /* Translate by (xt,yt) for single-tile copier. 
*/ 899bf215546Sopenharmony_ci tile_copy(x0-xt, x1-xt, x2-xt, x3-xt, 900bf215546Sopenharmony_ci y0-yt, y1-yt, 901bf215546Sopenharmony_ci dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch, 902bf215546Sopenharmony_ci src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch, 903bf215546Sopenharmony_ci src_pitch, 904bf215546Sopenharmony_ci swizzle_bit, 905bf215546Sopenharmony_ci copy_type); 906bf215546Sopenharmony_ci } 907bf215546Sopenharmony_ci } 908bf215546Sopenharmony_ci} 909bf215546Sopenharmony_ci 910bf215546Sopenharmony_ci/** 911bf215546Sopenharmony_ci * Copy from tiled to linear texture. 912bf215546Sopenharmony_ci * 913bf215546Sopenharmony_ci * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into 914bf215546Sopenharmony_ci * pieces that do not cross tile boundaries and copy each piece with a tile 915bf215546Sopenharmony_ci * copy function (\ref tile_copy_fn). 916bf215546Sopenharmony_ci * The X range is in bytes, i.e. pixels * bytes-per-pixel. 917bf215546Sopenharmony_ci * The Y range is in pixels (i.e. unitless). 918bf215546Sopenharmony_ci * 'dst' is the address of (xt1, yt1) in the destination linear texture. 919bf215546Sopenharmony_ci * 'src' is the address of (0, 0) in the source tiled texture. 
920bf215546Sopenharmony_ci */ 921bf215546Sopenharmony_cistatic void 922bf215546Sopenharmony_citiled_to_linear(uint32_t xt1, uint32_t xt2, 923bf215546Sopenharmony_ci uint32_t yt1, uint32_t yt2, 924bf215546Sopenharmony_ci char *dst, const char *src, 925bf215546Sopenharmony_ci int32_t dst_pitch, uint32_t src_pitch, 926bf215546Sopenharmony_ci bool has_swizzling, 927bf215546Sopenharmony_ci enum isl_tiling tiling, 928bf215546Sopenharmony_ci isl_memcpy_type copy_type) 929bf215546Sopenharmony_ci{ 930bf215546Sopenharmony_ci tile_copy_fn tile_copy; 931bf215546Sopenharmony_ci uint32_t xt0, xt3; 932bf215546Sopenharmony_ci uint32_t yt0, yt3; 933bf215546Sopenharmony_ci uint32_t xt, yt; 934bf215546Sopenharmony_ci uint32_t tw, th, span; 935bf215546Sopenharmony_ci uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0; 936bf215546Sopenharmony_ci 937bf215546Sopenharmony_ci if (tiling == ISL_TILING_X) { 938bf215546Sopenharmony_ci tw = xtile_width; 939bf215546Sopenharmony_ci th = xtile_height; 940bf215546Sopenharmony_ci span = xtile_span; 941bf215546Sopenharmony_ci tile_copy = xtiled_to_linear_faster; 942bf215546Sopenharmony_ci } else if (tiling == ISL_TILING_Y0) { 943bf215546Sopenharmony_ci tw = ytile_width; 944bf215546Sopenharmony_ci th = ytile_height; 945bf215546Sopenharmony_ci span = ytile_span; 946bf215546Sopenharmony_ci tile_copy = ytiled_to_linear_faster; 947bf215546Sopenharmony_ci } else { 948bf215546Sopenharmony_ci unreachable("unsupported tiling"); 949bf215546Sopenharmony_ci } 950bf215546Sopenharmony_ci 951bf215546Sopenharmony_ci#if defined(INLINE_SSE41) 952bf215546Sopenharmony_ci if (copy_type == ISL_MEMCPY_STREAMING_LOAD) { 953bf215546Sopenharmony_ci /* The hidden cacheline sized register used by movntdqa can apparently 954bf215546Sopenharmony_ci * give you stale data, so do an mfence to invalidate it. 
955bf215546Sopenharmony_ci */ 956bf215546Sopenharmony_ci _mm_mfence(); 957bf215546Sopenharmony_ci } 958bf215546Sopenharmony_ci#endif 959bf215546Sopenharmony_ci 960bf215546Sopenharmony_ci /* Round out to tile boundaries. */ 961bf215546Sopenharmony_ci xt0 = ALIGN_DOWN(xt1, tw); 962bf215546Sopenharmony_ci xt3 = ALIGN_UP (xt2, tw); 963bf215546Sopenharmony_ci yt0 = ALIGN_DOWN(yt1, th); 964bf215546Sopenharmony_ci yt3 = ALIGN_UP (yt2, th); 965bf215546Sopenharmony_ci 966bf215546Sopenharmony_ci /* Loop over all tiles to which we have something to copy. 967bf215546Sopenharmony_ci * 'xt' and 'yt' are the origin of the destination tile, whether copying 968bf215546Sopenharmony_ci * copying a full or partial tile. 969bf215546Sopenharmony_ci * tile_copy() copies one tile or partial tile. 970bf215546Sopenharmony_ci * Looping x inside y is the faster memory access pattern. 971bf215546Sopenharmony_ci */ 972bf215546Sopenharmony_ci for (yt = yt0; yt < yt3; yt += th) { 973bf215546Sopenharmony_ci for (xt = xt0; xt < xt3; xt += tw) { 974bf215546Sopenharmony_ci /* The area to update is [x0,x3) x [y0,y1). 975bf215546Sopenharmony_ci * May not want the whole tile, hence the min and max. 976bf215546Sopenharmony_ci */ 977bf215546Sopenharmony_ci uint32_t x0 = MAX2(xt1, xt); 978bf215546Sopenharmony_ci uint32_t y0 = MAX2(yt1, yt); 979bf215546Sopenharmony_ci uint32_t x3 = MIN2(xt2, xt + tw); 980bf215546Sopenharmony_ci uint32_t y1 = MIN2(yt2, yt + th); 981bf215546Sopenharmony_ci 982bf215546Sopenharmony_ci /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that 983bf215546Sopenharmony_ci * the middle interval is the longest span-aligned part. 984bf215546Sopenharmony_ci * The sub-ranges could be empty. 
985bf215546Sopenharmony_ci */ 986bf215546Sopenharmony_ci uint32_t x1, x2; 987bf215546Sopenharmony_ci x1 = ALIGN_UP(x0, span); 988bf215546Sopenharmony_ci if (x1 > x3) 989bf215546Sopenharmony_ci x1 = x2 = x3; 990bf215546Sopenharmony_ci else 991bf215546Sopenharmony_ci x2 = ALIGN_DOWN(x3, span); 992bf215546Sopenharmony_ci 993bf215546Sopenharmony_ci assert(x0 <= x1 && x1 <= x2 && x2 <= x3); 994bf215546Sopenharmony_ci assert(x1 - x0 < span && x3 - x2 < span); 995bf215546Sopenharmony_ci assert(x3 - x0 <= tw); 996bf215546Sopenharmony_ci assert((x2 - x1) % span == 0); 997bf215546Sopenharmony_ci 998bf215546Sopenharmony_ci /* Translate by (xt,yt) for single-tile copier. */ 999bf215546Sopenharmony_ci tile_copy(x0-xt, x1-xt, x2-xt, x3-xt, 1000bf215546Sopenharmony_ci y0-yt, y1-yt, 1001bf215546Sopenharmony_ci dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch, 1002bf215546Sopenharmony_ci src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch, 1003bf215546Sopenharmony_ci dst_pitch, 1004bf215546Sopenharmony_ci swizzle_bit, 1005bf215546Sopenharmony_ci copy_type); 1006bf215546Sopenharmony_ci } 1007bf215546Sopenharmony_ci } 1008bf215546Sopenharmony_ci} 1009