/* intel/isl/isl_tiled_memcpy.c */

/*
 * Mesa 3-D graphics library
 *
 * Copyright 2012 Intel Corporation
 * Copyright 2013 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chad Versace <chad.versace@linux.intel.com>
 *    Frank Henigman <fjhenigman@google.com>
 */

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "util/macros.h"
#include "util/u_math.h"
#include "util/rounding.h"
#include "isl_priv.h"

#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#endif

#define FILE_DEBUG_FLAG DEBUG_TEXTURE

/* Local spellings for the rounding helpers used by the tile copy routines.
 * Both simply forward to ROUND_DOWN_TO() and ALIGN(), which are defined
 * outside this file.
 */
#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions.  Width and span are in bytes, height is in rows (i.e.
 * unitless).  A "span" is the largest number of bytes we can copy between
 * linear and tiled memory without needing to calculate a new tiled address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;

/**
 * Rotate the 32-bit value 'n' right by 'd' bit positions.
 *
 * The rotate count is reduced modulo 32, so a count of 0 (or any multiple
 * of 32) returns 'n' unchanged instead of performing a shift by the full
 * width of the type, which C leaves undefined.
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   d &= 31;
   /* (-d & 31) equals (32 - d) for d in 1..31 and 0 for d == 0 */
   return (n >> d) | (n << (-d & 31));
}

// bswap32 already exists as a macro on some platforms (FreeBSD)
#ifndef bswap32
/**
 * Reverse the order of the four bytes of 'n'.
 *
 * Uses the compiler builtin when the build system reports one
 * (HAVE___BUILTIN_BSWAP32), otherwise falls back to shifts and masks.
 */
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * Processes 'bytes' bytes as 32-bit pixels; in each pixel bits 0-7 and bits
 * 16-23 trade places while the other two bytes are left alone.
 *
 * Neither pointer needs any particular alignment: pixels are moved through
 * a local variable with memcpy() rather than by dereferencing a casted
 * uint32_t pointer, which would be undefined for misaligned addresses.
 *
 * \param dst    destination buffer
 * \param src    source buffer
 * \param bytes  number of bytes to copy; must be a multiple of 4
 * \return 'dst'
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   const uint8_t *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      uint32_t pixel;

      memcpy(&pixel, s, sizeof(pixel));
      pixel = ror(bswap32(pixel), 8);
      memcpy(d, &pixel, sizeof(pixel));

      d += sizeof(pixel);
      s += sizeof(pixel);
      bytes -= sizeof(pixel);
   }
   return dst;
}

#ifdef __SSSE3__
/* Byte-shuffle control for _mm_shuffle_epi8: within every 4-byte pixel the
 * bytes at positions 0 and 2 are exchanged, positions 1 and 3 stay put.
 */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/**
 * Swap R and B in four pixels (16 bytes).  'dst' must be 16-byte aligned,
 * 'src' may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   /* The permutation table is a plain byte array with no alignment
    * guarantee, so it is fetched with an unaligned load.
    */
   const __m128i shuffle =
      _mm_loadu_si128((const __m128i *)rgba8_permutation);
   const __m128i pixels = _mm_loadu_si128((const __m128i *)src);

   _mm_store_si128((__m128i *)dst, _mm_shuffle_epi8(pixels, shuffle));
}

/**
 * Swap R and B in four pixels (16 bytes).  'src' must be 16-byte aligned,
 * 'dst' may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   const __m128i shuffle =
      _mm_loadu_si128((const __m128i *)rgba8_permutation);
   const __m128i pixels = _mm_load_si128((const __m128i *)src);

   _mm_storeu_si128((__m128i *)dst, _mm_shuffle_epi8(pixels, shuffle));
}

#elif defined(__SSE2__)
/* Swap bytes 0 and 2 of each of the four 32-bit pixels held in 'pixels'. */
static inline __m128i
rgba8_swap_rb_sse2(__m128i pixels)
{
   /* Selects bytes 1 and 3 of every pixel (the ones that do not move) */
   const __m128i agmask = _mm_set1_epi32((int)0xFF00FF00u);

   const __m128i rb = _mm_andnot_si128(agmask, pixels);
   const __m128i ag = _mm_and_si128(agmask, pixels);
   /* Exchange the two 16-bit halves of every pixel, which swaps the
    * isolated bytes 0 and 2.
    */
   const __m128i br =
      _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                          _MM_SHUFFLE(2, 3, 0, 1));

   return _mm_or_si128(ag, br);
}

/**
 * Swap R and B in four pixels (16 bytes).  'dst' must be 16-byte aligned,
 * 'src' may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   const __m128i pixels = _mm_loadu_si128((const __m128i *)src);

   _mm_store_si128((__m128i *)dst, rgba8_swap_rb_sse2(pixels));
}

/**
 * Swap R and B in four pixels (16 bytes).  'src' must be 16-byte aligned,
 * 'dst' may have any alignment.
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   const __m128i pixels = _mm_load_si128((const __m128i *)src);

   _mm_storeu_si128((__m128i *)dst, rgba8_swap_rb_sse2(pixels));
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * When SSSE3 or SSE2 is available, whole 16-byte groups go through
 * rgba8_copy_16_aligned_dst() (with an unrolled path for exactly 64 bytes);
 * whatever is left over, or everything on other builds, is handled by
 * rgba8_copy().
 *
 * \param dst    destination; must be 16-byte aligned unless 'bytes' is 0
 * \param src    source, any alignment
 * \param bytes  number of bytes to copy; rgba8_copy() asserts that the part
 *               it receives is a multiple of 4
 * \return the 'dst' pointer that was passed in
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   /* Walk with byte pointers: arithmetic on void pointers is a compiler
    * extension, and 'dst' itself must stay intact for the return value.
    */
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(d +  0, s +  0);
      rgba8_copy_16_aligned_dst(d + 16, s + 16);
      rgba8_copy_16_aligned_dst(d + 32, s + 32);
      rgba8_copy_16_aligned_dst(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(d, s, bytes);

   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 *
 * When SSSE3 or SSE2 is available, whole 16-byte groups go through
 * rgba8_copy_16_aligned_src() (with an unrolled path for exactly 64 bytes);
 * whatever is left over, or everything on other builds, is handled by
 * rgba8_copy().
 *
 * \param dst    destination, any alignment
 * \param src    source; must be 16-byte aligned unless 'bytes' is 0
 * \param bytes  number of bytes to copy; rgba8_copy() asserts that the part
 *               it receives is a multiple of 4
 * \return the 'dst' pointer that was passed in
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   /* Walk with byte pointers: arithmetic on void pointers is a compiler
    * extension, and 'dst' itself must stay intact for the return value.
    */
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(d +  0, s +  0);
      rgba8_copy_16_aligned_src(d + 16, s + 16);
      rgba8_copy_16_aligned_src(d + 32, s + 32);
      rgba8_copy_16_aligned_src(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(d, s, bytes);

   return dst;
}

bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
bf215546Sopenharmony_ci * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
bf215546Sopenharmony_ci * The first and last ranges must be shorter than a "span" (the longest linear
bf215546Sopenharmony_ci * stretch within a tile) and the middle must equal a whole number of spans.
bf215546Sopenharmony_ci * Ranges may be empty.  The region copied must land entirely within one tile.
bf215546Sopenharmony_ci * 'dst' is the start of the tile and 'src' is the corresponding
bf215546Sopenharmony_ci * address to copy from, though copying begins at (x0, y0).
bf215546Sopenharmony_ci * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
bf215546Sopenharmony_ci * Swizzling flips bit 6 in the copy destination offset, when certain other
bf215546Sopenharmony_ci * bits are set in it.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_citypedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                             uint32_t y0, uint32_t y1,
bf215546Sopenharmony_ci                             char *dst, const char *src,
bf215546Sopenharmony_ci                             int32_t linear_pitch,
bf215546Sopenharmony_ci                             uint32_t swizzle_bit,
bf215546Sopenharmony_ci                             isl_memcpy_type copy_type);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy texture data from linear to X tile layout.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \copydoc tile_copy_fn
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * The mem_copy parameters allow the user to specify an alternative mem_copy
bf215546Sopenharmony_ci * function that, for instance, may do RGBA -> BGRA swizzling.  The first
bf215546Sopenharmony_ci * function must handle any memory alignment while the second function must
bf215546Sopenharmony_ci * only handle 16-byte alignment in whichever side (source or destination) is
bf215546Sopenharmony_ci * tiled.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic inline void
bf215546Sopenharmony_cilinear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                 uint32_t y0, uint32_t y1,
bf215546Sopenharmony_ci                 char *dst, const char *src,
bf215546Sopenharmony_ci                 int32_t src_pitch,
bf215546Sopenharmony_ci                 uint32_t swizzle_bit,
bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy,
bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy_align16)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   /* The copy destination offset for each range copied is the sum of
bf215546Sopenharmony_ci    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   uint32_t xo, yo;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   src += (ptrdiff_t)y0 * src_pitch;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
bf215546Sopenharmony_ci      /* Bits 9 and 10 of the copy destination offset control swizzling.
bf215546Sopenharmony_ci       * Only 'yo' contributes to those bits in the total offset,
bf215546Sopenharmony_ci       * so calculate 'swizzle' just once per row.
bf215546Sopenharmony_ci       * Move bits 9 and 10 three and four places respectively down
bf215546Sopenharmony_ci       * to bit 6 and xor them.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      for (xo = x1; xo < x2; xo += xtile_span) {
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      src += src_pitch;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy texture data from linear to Y tile layout.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \copydoc tile_copy_fn
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic inline void
bf215546Sopenharmony_cilinear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                 uint32_t y0, uint32_t y3,
bf215546Sopenharmony_ci                 char *dst, const char *src,
bf215546Sopenharmony_ci                 int32_t src_pitch,
bf215546Sopenharmony_ci                 uint32_t swizzle_bit,
bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy,
bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy_align16)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
bf215546Sopenharmony_ci    * as the tile).  Thus the destination offset for (x,y) is the sum of:
bf215546Sopenharmony_ci    *   (x % column_width)                    // position within column
bf215546Sopenharmony_ci    *   (x / column_width) * bytes_per_column // column number * bytes per column
bf215546Sopenharmony_ci    *   y * column_width
bf215546Sopenharmony_ci    *
bf215546Sopenharmony_ci    * The copy destination offset for each range copied is the sum of
bf215546Sopenharmony_ci    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   const uint32_t column_width = ytile_span;
bf215546Sopenharmony_ci   const uint32_t bytes_per_column = column_width * ytile_height;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
bf215546Sopenharmony_ci   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
bf215546Sopenharmony_ci   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* Bit 9 of the destination offset control swizzling.
bf215546Sopenharmony_ci    * Only the X offset contributes to bit 9 of the total offset,
bf215546Sopenharmony_ci    * so swizzle can be calculated in advance for these X positions.
bf215546Sopenharmony_ci    * Move bit 9 three places down to bit 6.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
bf215546Sopenharmony_ci   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   uint32_t x, yo;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   src += (ptrdiff_t)y0 * src_pitch;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (y0 != y1) {
bf215546Sopenharmony_ci      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
bf215546Sopenharmony_ci         uint32_t xo = xo1;
bf215546Sopenharmony_ci         uint32_t swizzle = swizzle1;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         /* Step by spans/columns.  As it happens, the swizzle bit flips
bf215546Sopenharmony_ci          * at each step so we don't need to calculate it explicitly.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         for (x = x1; x < x2; x += ytile_span) {
bf215546Sopenharmony_ci            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
bf215546Sopenharmony_ci            xo += bytes_per_column;
bf215546Sopenharmony_ci            swizzle ^= swizzle_bit;
bf215546Sopenharmony_ci         }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         src += src_pitch;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
bf215546Sopenharmony_ci      uint32_t xo = xo1;
bf215546Sopenharmony_ci      uint32_t swizzle = swizzle1;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (x0 != x1) {
bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      /* Step by spans/columns.  As it happens, the swizzle bit flips
bf215546Sopenharmony_ci       * at each step so we don't need to calculate it explicitly.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      for (x = x1; x < x2; x += ytile_span) {
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
bf215546Sopenharmony_ci         xo += bytes_per_column;
bf215546Sopenharmony_ci         swizzle ^= swizzle_bit;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (x2 != x3) {
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      src += 4 * src_pitch;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (y2 != y3) {
bf215546Sopenharmony_ci      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
bf215546Sopenharmony_ci         uint32_t xo = xo1;
bf215546Sopenharmony_ci         uint32_t swizzle = swizzle1;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         /* Step by spans/columns.  As it happens, the swizzle bit flips
bf215546Sopenharmony_ci          * at each step so we don't need to calculate it explicitly.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         for (x = x1; x < x2; x += ytile_span) {
bf215546Sopenharmony_ci            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
bf215546Sopenharmony_ci            xo += bytes_per_column;
bf215546Sopenharmony_ci            swizzle ^= swizzle_bit;
bf215546Sopenharmony_ci         }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         src += src_pitch;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy texture data from X tile layout to linear.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \copydoc tile_copy_fn
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic inline void
bf215546Sopenharmony_cixtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                 uint32_t y0, uint32_t y1,
bf215546Sopenharmony_ci                 char *dst, const char *src,
bf215546Sopenharmony_ci                 int32_t dst_pitch,
bf215546Sopenharmony_ci                 uint32_t swizzle_bit,
bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy,
bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy_align16)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   /* The copy destination offset for each range copied is the sum of
bf215546Sopenharmony_ci    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   uint32_t xo, yo;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   dst += (ptrdiff_t)y0 * dst_pitch;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
bf215546Sopenharmony_ci      /* Bits 9 and 10 of the copy destination offset control swizzling.
bf215546Sopenharmony_ci       * Only 'yo' contributes to those bits in the total offset,
bf215546Sopenharmony_ci       * so calculate 'swizzle' just once per row.
bf215546Sopenharmony_ci       * Move bits 9 and 10 three and four places respectively down
bf215546Sopenharmony_ci       * to bit 6 and xor them.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      for (xo = x1; xo < x2; xo += xtile_span) {
bf215546Sopenharmony_ci         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      dst += dst_pitch;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci /**
bf215546Sopenharmony_ci * Copy texture data from Y tile layout to linear.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \copydoc tile_copy_fn
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic inline void
bf215546Sopenharmony_ciytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                 uint32_t y0, uint32_t y3,
bf215546Sopenharmony_ci                 char *dst, const char *src,
bf215546Sopenharmony_ci                 int32_t dst_pitch,
bf215546Sopenharmony_ci                 uint32_t swizzle_bit,
bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy,
bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy_align16)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
bf215546Sopenharmony_ci    * as the tile).  Thus the destination offset for (x,y) is the sum of:
bf215546Sopenharmony_ci    *   (x % column_width)                    // position within column
bf215546Sopenharmony_ci    *   (x / column_width) * bytes_per_column // column number * bytes per column
bf215546Sopenharmony_ci    *   y * column_width
bf215546Sopenharmony_ci    *
bf215546Sopenharmony_ci    * The copy destination offset for each range copied is the sum of
bf215546Sopenharmony_ci    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   const uint32_t column_width = ytile_span;
bf215546Sopenharmony_ci   const uint32_t bytes_per_column = column_width * ytile_height;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
bf215546Sopenharmony_ci   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
bf215546Sopenharmony_ci   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* Bit 9 of the destination offset control swizzling.
bf215546Sopenharmony_ci    * Only the X offset contributes to bit 9 of the total offset,
bf215546Sopenharmony_ci    * so swizzle can be calculated in advance for these X positions.
bf215546Sopenharmony_ci    * Move bit 9 three places down to bit 6.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
bf215546Sopenharmony_ci   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   uint32_t x, yo;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   dst += (ptrdiff_t)y0 * dst_pitch;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (y0 != y1) {
bf215546Sopenharmony_ci      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
bf215546Sopenharmony_ci         uint32_t xo = xo1;
bf215546Sopenharmony_ci         uint32_t swizzle = swizzle1;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         /* Step by spans/columns.  As it happens, the swizzle bit flips
bf215546Sopenharmony_ci          * at each step so we don't need to calculate it explicitly.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         for (x = x1; x < x2; x += ytile_span) {
bf215546Sopenharmony_ci            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
bf215546Sopenharmony_ci            xo += bytes_per_column;
bf215546Sopenharmony_ci            swizzle ^= swizzle_bit;
bf215546Sopenharmony_ci         }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         dst += dst_pitch;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
bf215546Sopenharmony_ci      uint32_t xo = xo1;
bf215546Sopenharmony_ci      uint32_t swizzle = swizzle1;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (x0 != x1) {
bf215546Sopenharmony_ci         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
bf215546Sopenharmony_ci         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
bf215546Sopenharmony_ci         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
bf215546Sopenharmony_ci         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      /* Step by spans/columns.  As it happens, the swizzle bit flips
bf215546Sopenharmony_ci       * at each step so we don't need to calculate it explicitly.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      for (x = x1; x < x2; x += ytile_span) {
bf215546Sopenharmony_ci         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
bf215546Sopenharmony_ci         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
bf215546Sopenharmony_ci         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
bf215546Sopenharmony_ci         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
bf215546Sopenharmony_ci         xo += bytes_per_column;
bf215546Sopenharmony_ci         swizzle ^= swizzle_bit;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      if (x2 != x3) {
bf215546Sopenharmony_ci         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
bf215546Sopenharmony_ci         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
bf215546Sopenharmony_ci         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
bf215546Sopenharmony_ci         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci      dst += 4 * dst_pitch;
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (y2 != y3) {
bf215546Sopenharmony_ci      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
bf215546Sopenharmony_ci         uint32_t xo = xo1;
bf215546Sopenharmony_ci         uint32_t swizzle = swizzle1;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         /* Step by spans/columns.  As it happens, the swizzle bit flips
bf215546Sopenharmony_ci          * at each step so we don't need to calculate it explicitly.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         for (x = x1; x < x2; x += ytile_span) {
bf215546Sopenharmony_ci            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
bf215546Sopenharmony_ci            xo += bytes_per_column;
bf215546Sopenharmony_ci            swizzle ^= swizzle_bit;
bf215546Sopenharmony_ci         }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         dst += dst_pitch;
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
#if defined(INLINE_SSE41)
/**
 * memcpy()-shaped copy that reads the source with SSE4.1 streaming loads
 * for the two sizes that are special-cased here.
 *
 *  - count == 16: one 128-bit streaming load, then one unaligned store.
 *  - count == 64: four streaming loads, all issued before the four
 *    unaligned stores.
 *  - any other count must be below 64 (asserted) and is handed to memcpy().
 *
 * The destination is always written with unaligned stores, so only the
 * source is subject to the alignment requirement of the streaming load
 * intrinsic (16 bytes, per Intel's documentation of MOVNTDQA).
 *
 * \return \p dest, like memcpy().
 */
static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
{
   __m128i *in = (__m128i *)src;
   __m128i *out = (__m128i *)dest;

   switch (count) {
   case 16: {
      const __m128i v = _mm_stream_load_si128(in);

      _mm_storeu_si128(out, v);
      return dest;
   }
   case 64: {
      /* Issue every load before the first store. */
      const __m128i v0 = _mm_stream_load_si128(in + 0);
      const __m128i v1 = _mm_stream_load_si128(in + 1);
      const __m128i v2 = _mm_stream_load_si128(in + 2);
      const __m128i v3 = _mm_stream_load_si128(in + 3);

      _mm_storeu_si128(out + 0, v0);
      _mm_storeu_si128(out + 1, v1);
      _mm_storeu_si128(out + 2, v2);
      _mm_storeu_si128(out + 3, v3);
      return dest;
   }
   default:
      /* Partial span: fewer than 64 bytes (and fewer than 16 for ytiled). */
      assert(count < 64);
      return memcpy(dest, src, count);
   }
}
#endif
bf215546Sopenharmony_ci
bf215546Sopenharmony_cistatic isl_mem_copy_fn
bf215546Sopenharmony_cichoose_copy_function(isl_memcpy_type copy_type)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   switch(copy_type) {
bf215546Sopenharmony_ci   case ISL_MEMCPY:
bf215546Sopenharmony_ci      return memcpy;
bf215546Sopenharmony_ci   case ISL_MEMCPY_BGRA8:
bf215546Sopenharmony_ci      return rgba8_copy;
bf215546Sopenharmony_ci   case ISL_MEMCPY_STREAMING_LOAD:
bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
bf215546Sopenharmony_ci      return _memcpy_streaming_load;
bf215546Sopenharmony_ci#else
bf215546Sopenharmony_ci      unreachable("ISL_MEMCOPY_STREAMING_LOAD requires sse4.1");
bf215546Sopenharmony_ci#endif
bf215546Sopenharmony_ci   case ISL_MEMCPY_INVALID:
bf215546Sopenharmony_ci      unreachable("invalid copy_type");
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci   unreachable("unhandled copy_type");
bf215546Sopenharmony_ci   return NULL;
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy texture data from linear to X tile layout, faster.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Same as \ref linear_to_xtiled but faster, because it passes constant
bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code
bf215546Sopenharmony_ci * optimized for those cases.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \copydoc tile_copy_fn
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic FLATTEN void
bf215546Sopenharmony_cilinear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                        uint32_t y0, uint32_t y1,
bf215546Sopenharmony_ci                        char *dst, const char *src,
bf215546Sopenharmony_ci                        int32_t src_pitch,
bf215546Sopenharmony_ci                        uint32_t swizzle_bit,
bf215546Sopenharmony_ci                        isl_memcpy_type copy_type)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
bf215546Sopenharmony_ci      if (mem_copy == memcpy)
bf215546Sopenharmony_ci         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
bf215546Sopenharmony_ci         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_dst);
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         unreachable("not reached");
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      if (mem_copy == memcpy)
bf215546Sopenharmony_ci         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 memcpy, memcpy);
bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
bf215546Sopenharmony_ci         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_dst);
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         unreachable("not reached");
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy texture data from linear to Y tile layout, faster.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Same as \ref linear_to_ytiled but faster, because it passes constant
bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code
bf215546Sopenharmony_ci * optimized for those cases.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \copydoc tile_copy_fn
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic FLATTEN void
bf215546Sopenharmony_cilinear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                        uint32_t y0, uint32_t y1,
bf215546Sopenharmony_ci                        char *dst, const char *src,
bf215546Sopenharmony_ci                        int32_t src_pitch,
bf215546Sopenharmony_ci                        uint32_t swizzle_bit,
bf215546Sopenharmony_ci                        isl_memcpy_type copy_type)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
bf215546Sopenharmony_ci      if (mem_copy == memcpy)
bf215546Sopenharmony_ci         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
bf215546Sopenharmony_ci         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_dst);
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         unreachable("not reached");
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      if (mem_copy == memcpy)
bf215546Sopenharmony_ci         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
bf215546Sopenharmony_ci         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_dst);
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         unreachable("not reached");
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy texture data from X tile layout to linear, faster.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Same as \ref xtile_to_linear but faster, because it passes constant
bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code
bf215546Sopenharmony_ci * optimized for those cases.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \copydoc tile_copy_fn
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic FLATTEN void
bf215546Sopenharmony_cixtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                        uint32_t y0, uint32_t y1,
bf215546Sopenharmony_ci                        char *dst, const char *src,
bf215546Sopenharmony_ci                        int32_t dst_pitch,
bf215546Sopenharmony_ci                        uint32_t swizzle_bit,
bf215546Sopenharmony_ci                        isl_memcpy_type copy_type)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
bf215546Sopenharmony_ci      if (mem_copy == memcpy)
bf215546Sopenharmony_ci         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
bf215546Sopenharmony_ci         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_src);
bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
bf215546Sopenharmony_ci      else if (mem_copy == _memcpy_streaming_load)
bf215546Sopenharmony_ci         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 memcpy, _memcpy_streaming_load);
bf215546Sopenharmony_ci#endif
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         unreachable("not reached");
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      if (mem_copy == memcpy)
bf215546Sopenharmony_ci         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
bf215546Sopenharmony_ci         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_src);
bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
bf215546Sopenharmony_ci      else if (mem_copy == _memcpy_streaming_load)
bf215546Sopenharmony_ci         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 memcpy, _memcpy_streaming_load);
bf215546Sopenharmony_ci#endif
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         unreachable("not reached");
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy texture data from Y tile layout to linear, faster.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Same as \ref ytile_to_linear but faster, because it passes constant
bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code
bf215546Sopenharmony_ci * optimized for those cases.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * \copydoc tile_copy_fn
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic FLATTEN void
bf215546Sopenharmony_ciytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
bf215546Sopenharmony_ci                        uint32_t y0, uint32_t y1,
bf215546Sopenharmony_ci                        char *dst, const char *src,
bf215546Sopenharmony_ci                        int32_t dst_pitch,
bf215546Sopenharmony_ci                        uint32_t swizzle_bit,
bf215546Sopenharmony_ci                        isl_memcpy_type copy_type)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
bf215546Sopenharmony_ci      if (mem_copy == memcpy)
bf215546Sopenharmony_ci         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
bf215546Sopenharmony_ci         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_src);
bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
bf215546Sopenharmony_ci      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
bf215546Sopenharmony_ci         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 memcpy, _memcpy_streaming_load);
bf215546Sopenharmony_ci#endif
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         unreachable("not reached");
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      if (mem_copy == memcpy)
bf215546Sopenharmony_ci         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
bf215546Sopenharmony_ci         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_src);
bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
bf215546Sopenharmony_ci      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
bf215546Sopenharmony_ci         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
bf215546Sopenharmony_ci                                 memcpy, _memcpy_streaming_load);
bf215546Sopenharmony_ci#endif
bf215546Sopenharmony_ci      else
bf215546Sopenharmony_ci         unreachable("not reached");
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
bf215546Sopenharmony_ci                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy from linear to tiled texture.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
bf215546Sopenharmony_ci * pieces that do not cross tile boundaries and copy each piece with a tile
bf215546Sopenharmony_ci * copy function (\ref tile_copy_fn).
bf215546Sopenharmony_ci * The X range is in bytes, i.e. pixels * bytes-per-pixel.
bf215546Sopenharmony_ci * The Y range is in pixels (i.e. unitless).
bf215546Sopenharmony_ci * 'dst' is the address of (0, 0) in the destination tiled texture.
bf215546Sopenharmony_ci * 'src' is the address of (xt1, yt1) in the source linear texture.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic void
bf215546Sopenharmony_cilinear_to_tiled(uint32_t xt1, uint32_t xt2,
bf215546Sopenharmony_ci                      uint32_t yt1, uint32_t yt2,
bf215546Sopenharmony_ci                      char *dst, const char *src,
bf215546Sopenharmony_ci                      uint32_t dst_pitch, int32_t src_pitch,
bf215546Sopenharmony_ci                      bool has_swizzling,
bf215546Sopenharmony_ci                      enum isl_tiling tiling,
bf215546Sopenharmony_ci                      isl_memcpy_type copy_type)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   tile_copy_fn tile_copy;
bf215546Sopenharmony_ci   uint32_t xt0, xt3;
bf215546Sopenharmony_ci   uint32_t yt0, yt3;
bf215546Sopenharmony_ci   uint32_t xt, yt;
bf215546Sopenharmony_ci   uint32_t tw, th, span;
bf215546Sopenharmony_ci   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (tiling == ISL_TILING_X) {
bf215546Sopenharmony_ci      tw = xtile_width;
bf215546Sopenharmony_ci      th = xtile_height;
bf215546Sopenharmony_ci      span = xtile_span;
bf215546Sopenharmony_ci      tile_copy = linear_to_xtiled_faster;
bf215546Sopenharmony_ci   } else if (tiling == ISL_TILING_Y0) {
bf215546Sopenharmony_ci      tw = ytile_width;
bf215546Sopenharmony_ci      th = ytile_height;
bf215546Sopenharmony_ci      span = ytile_span;
bf215546Sopenharmony_ci      tile_copy = linear_to_ytiled_faster;
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      unreachable("unsupported tiling");
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* Round out to tile boundaries. */
bf215546Sopenharmony_ci   xt0 = ALIGN_DOWN(xt1, tw);
bf215546Sopenharmony_ci   xt3 = ALIGN_UP  (xt2, tw);
bf215546Sopenharmony_ci   yt0 = ALIGN_DOWN(yt1, th);
bf215546Sopenharmony_ci   yt3 = ALIGN_UP  (yt2, th);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* Loop over all tiles to which we have something to copy.
bf215546Sopenharmony_ci    * 'xt' and 'yt' are the origin of the destination tile, whether copying
bf215546Sopenharmony_ci    * copying a full or partial tile.
bf215546Sopenharmony_ci    * tile_copy() copies one tile or partial tile.
bf215546Sopenharmony_ci    * Looping x inside y is the faster memory access pattern.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   for (yt = yt0; yt < yt3; yt += th) {
bf215546Sopenharmony_ci      for (xt = xt0; xt < xt3; xt += tw) {
bf215546Sopenharmony_ci         /* The area to update is [x0,x3) x [y0,y1).
bf215546Sopenharmony_ci          * May not want the whole tile, hence the min and max.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         uint32_t x0 = MAX2(xt1, xt);
bf215546Sopenharmony_ci         uint32_t y0 = MAX2(yt1, yt);
bf215546Sopenharmony_ci         uint32_t x3 = MIN2(xt2, xt + tw);
bf215546Sopenharmony_ci         uint32_t y1 = MIN2(yt2, yt + th);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
bf215546Sopenharmony_ci          * the middle interval is the longest span-aligned part.
bf215546Sopenharmony_ci          * The sub-ranges could be empty.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         uint32_t x1, x2;
bf215546Sopenharmony_ci         x1 = ALIGN_UP(x0, span);
bf215546Sopenharmony_ci         if (x1 > x3)
bf215546Sopenharmony_ci            x1 = x2 = x3;
bf215546Sopenharmony_ci         else
bf215546Sopenharmony_ci            x2 = ALIGN_DOWN(x3, span);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
bf215546Sopenharmony_ci         assert(x1 - x0 < span && x3 - x2 < span);
bf215546Sopenharmony_ci         assert(x3 - x0 <= tw);
bf215546Sopenharmony_ci         assert((x2 - x1) % span == 0);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         /* Translate by (xt,yt) for single-tile copier. */
bf215546Sopenharmony_ci         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
bf215546Sopenharmony_ci                   y0-yt, y1-yt,
bf215546Sopenharmony_ci                   dst + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * dst_pitch,
bf215546Sopenharmony_ci                   src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
bf215546Sopenharmony_ci                   src_pitch,
bf215546Sopenharmony_ci                   swizzle_bit,
bf215546Sopenharmony_ci                   copy_type);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci/**
bf215546Sopenharmony_ci * Copy from tiled to linear texture.
bf215546Sopenharmony_ci *
bf215546Sopenharmony_ci * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
bf215546Sopenharmony_ci * pieces that do not cross tile boundaries and copy each piece with a tile
bf215546Sopenharmony_ci * copy function (\ref tile_copy_fn).
bf215546Sopenharmony_ci * The X range is in bytes, i.e. pixels * bytes-per-pixel.
bf215546Sopenharmony_ci * The Y range is in pixels (i.e. unitless).
bf215546Sopenharmony_ci * 'dst' is the address of (xt1, yt1) in the destination linear texture.
bf215546Sopenharmony_ci * 'src' is the address of (0, 0) in the source tiled texture.
bf215546Sopenharmony_ci */
bf215546Sopenharmony_cistatic void
bf215546Sopenharmony_citiled_to_linear(uint32_t xt1, uint32_t xt2,
bf215546Sopenharmony_ci                      uint32_t yt1, uint32_t yt2,
bf215546Sopenharmony_ci                      char *dst, const char *src,
bf215546Sopenharmony_ci                      int32_t dst_pitch, uint32_t src_pitch,
bf215546Sopenharmony_ci                      bool has_swizzling,
bf215546Sopenharmony_ci                      enum isl_tiling tiling,
bf215546Sopenharmony_ci                      isl_memcpy_type copy_type)
bf215546Sopenharmony_ci{
bf215546Sopenharmony_ci   tile_copy_fn tile_copy;
bf215546Sopenharmony_ci   uint32_t xt0, xt3;
bf215546Sopenharmony_ci   uint32_t yt0, yt3;
bf215546Sopenharmony_ci   uint32_t xt, yt;
bf215546Sopenharmony_ci   uint32_t tw, th, span;
bf215546Sopenharmony_ci   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   if (tiling == ISL_TILING_X) {
bf215546Sopenharmony_ci      tw = xtile_width;
bf215546Sopenharmony_ci      th = xtile_height;
bf215546Sopenharmony_ci      span = xtile_span;
bf215546Sopenharmony_ci      tile_copy = xtiled_to_linear_faster;
bf215546Sopenharmony_ci   } else if (tiling == ISL_TILING_Y0) {
bf215546Sopenharmony_ci      tw = ytile_width;
bf215546Sopenharmony_ci      th = ytile_height;
bf215546Sopenharmony_ci      span = ytile_span;
bf215546Sopenharmony_ci      tile_copy = ytiled_to_linear_faster;
bf215546Sopenharmony_ci   } else {
bf215546Sopenharmony_ci      unreachable("unsupported tiling");
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
bf215546Sopenharmony_ci   if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
bf215546Sopenharmony_ci      /* The hidden cacheline sized register used by movntdqa can apparently
bf215546Sopenharmony_ci       * give you stale data, so do an mfence to invalidate it.
bf215546Sopenharmony_ci       */
bf215546Sopenharmony_ci      _mm_mfence();
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci#endif
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* Round out to tile boundaries. */
bf215546Sopenharmony_ci   xt0 = ALIGN_DOWN(xt1, tw);
bf215546Sopenharmony_ci   xt3 = ALIGN_UP  (xt2, tw);
bf215546Sopenharmony_ci   yt0 = ALIGN_DOWN(yt1, th);
bf215546Sopenharmony_ci   yt3 = ALIGN_UP  (yt2, th);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci   /* Loop over all tiles to which we have something to copy.
bf215546Sopenharmony_ci    * 'xt' and 'yt' are the origin of the destination tile, whether copying
bf215546Sopenharmony_ci    * copying a full or partial tile.
bf215546Sopenharmony_ci    * tile_copy() copies one tile or partial tile.
bf215546Sopenharmony_ci    * Looping x inside y is the faster memory access pattern.
bf215546Sopenharmony_ci    */
bf215546Sopenharmony_ci   for (yt = yt0; yt < yt3; yt += th) {
bf215546Sopenharmony_ci      for (xt = xt0; xt < xt3; xt += tw) {
bf215546Sopenharmony_ci         /* The area to update is [x0,x3) x [y0,y1).
bf215546Sopenharmony_ci          * May not want the whole tile, hence the min and max.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         uint32_t x0 = MAX2(xt1, xt);
bf215546Sopenharmony_ci         uint32_t y0 = MAX2(yt1, yt);
bf215546Sopenharmony_ci         uint32_t x3 = MIN2(xt2, xt + tw);
bf215546Sopenharmony_ci         uint32_t y1 = MIN2(yt2, yt + th);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
bf215546Sopenharmony_ci          * the middle interval is the longest span-aligned part.
bf215546Sopenharmony_ci          * The sub-ranges could be empty.
bf215546Sopenharmony_ci          */
bf215546Sopenharmony_ci         uint32_t x1, x2;
bf215546Sopenharmony_ci         x1 = ALIGN_UP(x0, span);
bf215546Sopenharmony_ci         if (x1 > x3)
bf215546Sopenharmony_ci            x1 = x2 = x3;
bf215546Sopenharmony_ci         else
bf215546Sopenharmony_ci            x2 = ALIGN_DOWN(x3, span);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
bf215546Sopenharmony_ci         assert(x1 - x0 < span && x3 - x2 < span);
bf215546Sopenharmony_ci         assert(x3 - x0 <= tw);
bf215546Sopenharmony_ci         assert((x2 - x1) % span == 0);
bf215546Sopenharmony_ci
bf215546Sopenharmony_ci         /* Translate by (xt,yt) for single-tile copier. */
bf215546Sopenharmony_ci         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
bf215546Sopenharmony_ci                   y0-yt, y1-yt,
bf215546Sopenharmony_ci                   dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
bf215546Sopenharmony_ci                   src + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * src_pitch,
bf215546Sopenharmony_ci                   dst_pitch,
bf215546Sopenharmony_ci                   swizzle_bit,
bf215546Sopenharmony_ci                   copy_type);
bf215546Sopenharmony_ci      }
bf215546Sopenharmony_ci   }
bf215546Sopenharmony_ci}