1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Mesa 3-D graphics library
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Copyright 2012 Intel Corporation
5bf215546Sopenharmony_ci * Copyright 2013 Google
6bf215546Sopenharmony_ci *
7bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
8bf215546Sopenharmony_ci * copy of this software and associated documentation files (the
9bf215546Sopenharmony_ci * "Software"), to deal in the Software without restriction, including
10bf215546Sopenharmony_ci * without limitation the rights to use, copy, modify, merge, publish,
11bf215546Sopenharmony_ci * distribute, sublicense, and/or sell copies of the Software, and to
12bf215546Sopenharmony_ci * permit persons to whom the Software is furnished to do so, subject to
13bf215546Sopenharmony_ci * the following conditions:
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the
16bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions
17bf215546Sopenharmony_ci * of the Software.
18bf215546Sopenharmony_ci *
19bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20bf215546Sopenharmony_ci * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21bf215546Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22bf215546Sopenharmony_ci * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23bf215546Sopenharmony_ci * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24bf215546Sopenharmony_ci * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25bf215546Sopenharmony_ci * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26bf215546Sopenharmony_ci *
27bf215546Sopenharmony_ci * Authors:
28bf215546Sopenharmony_ci *    Chad Versace <chad.versace@linux.intel.com>
29bf215546Sopenharmony_ci *    Frank Henigman <fjhenigman@google.com>
30bf215546Sopenharmony_ci */
31bf215546Sopenharmony_ci
32bf215546Sopenharmony_ci#include <string.h>
33bf215546Sopenharmony_ci
34bf215546Sopenharmony_ci#include "util/macros.h"
35bf215546Sopenharmony_ci#include "util/u_math.h"
36bf215546Sopenharmony_ci#include "util/rounding.h"
37bf215546Sopenharmony_ci#include "isl_priv.h"
38bf215546Sopenharmony_ci
39bf215546Sopenharmony_ci#if defined(__SSSE3__)
40bf215546Sopenharmony_ci#include <tmmintrin.h>
41bf215546Sopenharmony_ci#elif defined(__SSE2__)
42bf215546Sopenharmony_ci#include <emmintrin.h>
43bf215546Sopenharmony_ci#endif
44bf215546Sopenharmony_ci
/* Debug category for this file.  It is not referenced in the visible part of
 * the file; presumably it is consumed by the project's debug macros (verify).
 */
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

/* Round 'a' down / up to a multiple of 'b' using the shared util helpers. */
#define ALIGN_DOWN(a, b) ROUND_DOWN_TO((a), (b))
#define ALIGN_UP(a, b)   ALIGN((a), (b))
49bf215546Sopenharmony_ci
/* Tile geometry.
 *
 * Widths and spans are byte counts; heights are row counts.  A "span" is the
 * largest run of bytes that can be copied between linear and tiled memory
 * before a new tiled address has to be computed.
 */
static const uint32_t xtile_width  = 512;  /* bytes per X-tile row */
static const uint32_t xtile_height = 8;    /* rows per X tile */
static const uint32_t xtile_span   = 64;   /* contiguous bytes in an X tile */
static const uint32_t ytile_width  = 128;  /* bytes per Y-tile row */
static const uint32_t ytile_height = 32;   /* rows per Y tile */
static const uint32_t ytile_span   = 16;   /* contiguous bytes in a Y tile */
60bf215546Sopenharmony_ci
/**
 * Rotate the 32-bit value 'n' right by 'd' bit positions.
 *
 * The rotate count is reduced modulo 32.  Without that, a count of 0 would
 * evaluate 'n << 32' and counts of 32 or more would shift by at least the
 * width of the type, both of which are undefined behavior in C.
 *
 * \param n  value to rotate
 * \param d  number of bit positions to rotate right (any value is accepted)
 * \return   'n' rotated right by 'd % 32' bits
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   d &= 31;
   /* (32 - d) & 31 maps d == 0 to a left shift of 0 instead of 32. */
   return (n >> d) | (n << ((32 - d) & 31));
}
66bf215546Sopenharmony_ci
/* Some platforms (e.g. FreeBSD) already provide bswap32 as a macro, so only
 * define our own version when none exists.
 */
#ifndef bswap32
/**
 * Reverse the byte order of a 32-bit value.
 */
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   /* Isolate each byte first, then move it to its mirrored position. */
   const uint32_t byte0 = (n & 0x000000ffu) << 24;
   const uint32_t byte1 = (n & 0x0000ff00u) << 8;
   const uint32_t byte2 = (n & 0x00ff0000u) >> 8;
   const uint32_t byte3 = (n & 0xff000000u) >> 24;

   return byte0 | byte1 | byte2 | byte3;
#endif
}
#endif
82bf215546Sopenharmony_ci
/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * Each group of four bytes is read as a 32-bit value, transformed with
 * ror(bswap32(value), 8) and written to the destination.  On a little-endian
 * host that exchanges bytes 0 and 2 of every pixel and leaves bytes 1 and 3
 * untouched.
 *
 * Pixels are loaded and stored with memcpy() rather than by dereferencing a
 * uint32_t pointer, so neither 'dst' nor 'src' needs any particular
 * alignment and no incompatible-type access is performed.
 *
 * \param dst    destination buffer, at least 'bytes' bytes long
 * \param src    source buffer, at least 'bytes' bytes long
 * \param bytes  number of bytes to copy; must be a multiple of 4
 * \return       the original 'dst' pointer
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   unsigned char *d = dst;
   const unsigned char *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      uint32_t pixel;

      memcpy(&pixel, s, sizeof(pixel));
      pixel = ror(bswap32(pixel), 8);
      memcpy(d, &pixel, sizeof(pixel));

      d += sizeof(pixel);
      s += sizeof(pixel);
      bytes -= sizeof(pixel);
   }
   return dst;
}
102bf215546Sopenharmony_ci
103bf215546Sopenharmony_ci#ifdef __SSSE3__
/* Byte-shuffle control used with _mm_shuffle_epi8: within each 4-byte pixel,
 * output byte 0 comes from input byte 2 and vice versa, while bytes 1 and 3
 * stay in place.  One row per pixel.
 */
static const uint8_t rgba8_permutation[16] = {
    2,  1,  0,  3,
    6,  5,  4,  7,
   10,  9,  8, 11,
   14, 13, 12, 15,
};
106bf215546Sopenharmony_ci
107bf215546Sopenharmony_cistatic inline void
108bf215546Sopenharmony_cirgba8_copy_16_aligned_dst(void *dst, const void *src)
109bf215546Sopenharmony_ci{
110bf215546Sopenharmony_ci   _mm_store_si128(dst,
111bf215546Sopenharmony_ci                   _mm_shuffle_epi8(_mm_loadu_si128(src),
112bf215546Sopenharmony_ci                                    *(__m128i *)rgba8_permutation));
113bf215546Sopenharmony_ci}
114bf215546Sopenharmony_ci
115bf215546Sopenharmony_cistatic inline void
116bf215546Sopenharmony_cirgba8_copy_16_aligned_src(void *dst, const void *src)
117bf215546Sopenharmony_ci{
118bf215546Sopenharmony_ci   _mm_storeu_si128(dst,
119bf215546Sopenharmony_ci                    _mm_shuffle_epi8(_mm_load_si128(src),
120bf215546Sopenharmony_ci                                     *(__m128i *)rgba8_permutation));
121bf215546Sopenharmony_ci}
122bf215546Sopenharmony_ci
123bf215546Sopenharmony_ci#elif defined(__SSE2__)
/**
 * SSE2 variant: copy 16 bytes (four 32-bit pixels) from 'src' to 'dst',
 * exchanging bytes 0 and 2 of every pixel.
 *
 * \param dst  destination; must be 16-byte aligned (an aligned store is used)
 * \param src  source; may have any alignment (an unaligned load is used)
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   /* Selects bytes 1 and 3 of each pixel, which stay where they are. */
   const __m128i keep_mask = _mm_set1_epi32(0xFF00FF00);
   const __m128i pixels = _mm_loadu_si128((const __m128i *)src);

   /* Bytes 0 and 2 each occupy the low byte of a 16-bit lane, so swapping
    * neighbouring 16-bit lanes exchanges them.
    */
   __m128i moved = _mm_andnot_si128(keep_mask, pixels);
   moved = _mm_shufflelo_epi16(moved, _MM_SHUFFLE(2, 3, 0, 1));
   moved = _mm_shufflehi_epi16(moved, _MM_SHUFFLE(2, 3, 0, 1));

   const __m128i kept = _mm_and_si128(keep_mask, pixels);

   _mm_store_si128((__m128i *)dst, _mm_or_si128(kept, moved));
}
140bf215546Sopenharmony_ci
/**
 * SSE2 variant: copy 16 bytes (four 32-bit pixels) from 'src' to 'dst',
 * exchanging bytes 0 and 2 of every pixel.
 *
 * \param dst  destination; may have any alignment (an unaligned store is used)
 * \param src  source; must be 16-byte aligned (an aligned load is used)
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   /* Selects bytes 1 and 3 of each pixel, which stay where they are. */
   const __m128i keep_mask = _mm_set1_epi32(0xFF00FF00);
   const __m128i pixels = _mm_load_si128((const __m128i *)src);

   /* Bytes 0 and 2 each occupy the low byte of a 16-bit lane, so swapping
    * neighbouring 16-bit lanes exchanges them.
    */
   __m128i moved = _mm_andnot_si128(keep_mask, pixels);
   moved = _mm_shufflelo_epi16(moved, _MM_SHUFFLE(2, 3, 0, 1));
   moved = _mm_shufflehi_epi16(moved, _MM_SHUFFLE(2, 3, 0, 1));

   const __m128i kept = _mm_and_si128(keep_mask, pixels);

   _mm_storeu_si128((__m128i *)dst, _mm_or_si128(kept, moved));
}
157bf215546Sopenharmony_ci#endif
158bf215546Sopenharmony_ci
/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * When SSE2 or SSSE3 is available the data is moved 16 bytes at a time with
 * rgba8_copy_16_aligned_dst(), with a fully unrolled path for exactly 64
 * bytes; whatever remains is handed to rgba8_copy().
 *
 * The cursors are 'char' pointers because arithmetic on 'void *' is a GNU
 * extension rather than ISO C.  The function always returns the original
 * 'dst', like memcpy(), instead of a pointer that has been advanced past the
 * vectorized part on some paths only.
 *
 * \param dst    destination; must be 16-byte aligned unless 'bytes' is 0
 * \param src    source; may have any alignment
 * \param bytes  number of bytes to copy; rgba8_copy() asserts that the
 *               remainder it receives is a multiple of 4
 * \return       the original 'dst' pointer
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(d +  0, s +  0);
      rgba8_copy_16_aligned_dst(d + 16, s + 16);
      rgba8_copy_16_aligned_dst(d + 32, s + 32);
      rgba8_copy_16_aligned_dst(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Scalar fallback for the tail (or for everything without SSE). */
   rgba8_copy(d, s, bytes);

   return dst;
}
188bf215546Sopenharmony_ci
/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 *
 * When SSE2 or SSSE3 is available the data is moved 16 bytes at a time with
 * rgba8_copy_16_aligned_src(), with a fully unrolled path for exactly 64
 * bytes; whatever remains is handed to rgba8_copy().
 *
 * The cursors are 'char' pointers because arithmetic on 'void *' is a GNU
 * extension rather than ISO C.  The function always returns the original
 * 'dst', like memcpy(), instead of a pointer that has been advanced past the
 * vectorized part on some paths only.
 *
 * \param dst    destination; may have any alignment
 * \param src    source; must be 16-byte aligned unless 'bytes' is 0
 * \param bytes  number of bytes to copy; rgba8_copy() asserts that the
 *               remainder it receives is a multiple of 4
 * \return       the original 'dst' pointer
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(d +  0, s +  0);
      rgba8_copy_16_aligned_src(d + 16, s + 16);
      rgba8_copy_16_aligned_src(d + 32, s + 32);
      rgba8_copy_16_aligned_src(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Scalar fallback for the tail (or for everything without SSE). */
   rgba8_copy(d, s, bytes);

   return dst;
}
218bf215546Sopenharmony_ci
219bf215546Sopenharmony_ci/**
220bf215546Sopenharmony_ci * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
221bf215546Sopenharmony_ci * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
222bf215546Sopenharmony_ci * The first and last ranges must be shorter than a "span" (the longest linear
223bf215546Sopenharmony_ci * stretch within a tile) and the middle must equal a whole number of spans.
224bf215546Sopenharmony_ci * Ranges may be empty.  The region copied must land entirely within one tile.
225bf215546Sopenharmony_ci * 'dst' is the start of the tile and 'src' is the corresponding
226bf215546Sopenharmony_ci * address to copy from, though copying begins at (x0, y0).
227bf215546Sopenharmony_ci * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
228bf215546Sopenharmony_ci * Swizzling flips bit 6 in the copy destination offset, when certain other
229bf215546Sopenharmony_ci * bits are set in it.
230bf215546Sopenharmony_ci */
231bf215546Sopenharmony_citypedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
232bf215546Sopenharmony_ci                             uint32_t y0, uint32_t y1,
233bf215546Sopenharmony_ci                             char *dst, const char *src,
234bf215546Sopenharmony_ci                             int32_t linear_pitch,
235bf215546Sopenharmony_ci                             uint32_t swizzle_bit,
236bf215546Sopenharmony_ci                             isl_memcpy_type copy_type);
237bf215546Sopenharmony_ci
238bf215546Sopenharmony_ci/**
239bf215546Sopenharmony_ci * Copy texture data from linear to X tile layout.
240bf215546Sopenharmony_ci *
241bf215546Sopenharmony_ci * \copydoc tile_copy_fn
242bf215546Sopenharmony_ci *
243bf215546Sopenharmony_ci * The mem_copy parameters allow the user to specify an alternative mem_copy
244bf215546Sopenharmony_ci * function that, for instance, may do RGBA -> BGRA swizzling.  The first
245bf215546Sopenharmony_ci * function must handle any memory alignment while the second function must
246bf215546Sopenharmony_ci * only handle 16-byte alignment in whichever side (source or destination) is
247bf215546Sopenharmony_ci * tiled.
248bf215546Sopenharmony_ci */
249bf215546Sopenharmony_cistatic inline void
250bf215546Sopenharmony_cilinear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
251bf215546Sopenharmony_ci                 uint32_t y0, uint32_t y1,
252bf215546Sopenharmony_ci                 char *dst, const char *src,
253bf215546Sopenharmony_ci                 int32_t src_pitch,
254bf215546Sopenharmony_ci                 uint32_t swizzle_bit,
255bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy,
256bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy_align16)
257bf215546Sopenharmony_ci{
258bf215546Sopenharmony_ci   /* The copy destination offset for each range copied is the sum of
259bf215546Sopenharmony_ci    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
260bf215546Sopenharmony_ci    */
261bf215546Sopenharmony_ci   uint32_t xo, yo;
262bf215546Sopenharmony_ci
263bf215546Sopenharmony_ci   src += (ptrdiff_t)y0 * src_pitch;
264bf215546Sopenharmony_ci
265bf215546Sopenharmony_ci   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
266bf215546Sopenharmony_ci      /* Bits 9 and 10 of the copy destination offset control swizzling.
267bf215546Sopenharmony_ci       * Only 'yo' contributes to those bits in the total offset,
268bf215546Sopenharmony_ci       * so calculate 'swizzle' just once per row.
269bf215546Sopenharmony_ci       * Move bits 9 and 10 three and four places respectively down
270bf215546Sopenharmony_ci       * to bit 6 and xor them.
271bf215546Sopenharmony_ci       */
272bf215546Sopenharmony_ci      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
273bf215546Sopenharmony_ci
274bf215546Sopenharmony_ci      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
275bf215546Sopenharmony_ci
276bf215546Sopenharmony_ci      for (xo = x1; xo < x2; xo += xtile_span) {
277bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
278bf215546Sopenharmony_ci      }
279bf215546Sopenharmony_ci
280bf215546Sopenharmony_ci      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
281bf215546Sopenharmony_ci
282bf215546Sopenharmony_ci      src += src_pitch;
283bf215546Sopenharmony_ci   }
284bf215546Sopenharmony_ci}
285bf215546Sopenharmony_ci
286bf215546Sopenharmony_ci/**
287bf215546Sopenharmony_ci * Copy texture data from linear to Y tile layout.
288bf215546Sopenharmony_ci *
289bf215546Sopenharmony_ci * \copydoc tile_copy_fn
290bf215546Sopenharmony_ci */
291bf215546Sopenharmony_cistatic inline void
292bf215546Sopenharmony_cilinear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
293bf215546Sopenharmony_ci                 uint32_t y0, uint32_t y3,
294bf215546Sopenharmony_ci                 char *dst, const char *src,
295bf215546Sopenharmony_ci                 int32_t src_pitch,
296bf215546Sopenharmony_ci                 uint32_t swizzle_bit,
297bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy,
298bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy_align16)
299bf215546Sopenharmony_ci{
300bf215546Sopenharmony_ci   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
301bf215546Sopenharmony_ci    * as the tile).  Thus the destination offset for (x,y) is the sum of:
302bf215546Sopenharmony_ci    *   (x % column_width)                    // position within column
303bf215546Sopenharmony_ci    *   (x / column_width) * bytes_per_column // column number * bytes per column
304bf215546Sopenharmony_ci    *   y * column_width
305bf215546Sopenharmony_ci    *
306bf215546Sopenharmony_ci    * The copy destination offset for each range copied is the sum of
307bf215546Sopenharmony_ci    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
308bf215546Sopenharmony_ci    */
309bf215546Sopenharmony_ci   const uint32_t column_width = ytile_span;
310bf215546Sopenharmony_ci   const uint32_t bytes_per_column = column_width * ytile_height;
311bf215546Sopenharmony_ci
312bf215546Sopenharmony_ci   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
313bf215546Sopenharmony_ci   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
314bf215546Sopenharmony_ci
315bf215546Sopenharmony_ci   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
316bf215546Sopenharmony_ci   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
317bf215546Sopenharmony_ci
318bf215546Sopenharmony_ci   /* Bit 9 of the destination offset control swizzling.
319bf215546Sopenharmony_ci    * Only the X offset contributes to bit 9 of the total offset,
320bf215546Sopenharmony_ci    * so swizzle can be calculated in advance for these X positions.
321bf215546Sopenharmony_ci    * Move bit 9 three places down to bit 6.
322bf215546Sopenharmony_ci    */
323bf215546Sopenharmony_ci   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
324bf215546Sopenharmony_ci   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
325bf215546Sopenharmony_ci
326bf215546Sopenharmony_ci   uint32_t x, yo;
327bf215546Sopenharmony_ci
328bf215546Sopenharmony_ci   src += (ptrdiff_t)y0 * src_pitch;
329bf215546Sopenharmony_ci
330bf215546Sopenharmony_ci   if (y0 != y1) {
331bf215546Sopenharmony_ci      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
332bf215546Sopenharmony_ci         uint32_t xo = xo1;
333bf215546Sopenharmony_ci         uint32_t swizzle = swizzle1;
334bf215546Sopenharmony_ci
335bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
336bf215546Sopenharmony_ci
337bf215546Sopenharmony_ci         /* Step by spans/columns.  As it happens, the swizzle bit flips
338bf215546Sopenharmony_ci          * at each step so we don't need to calculate it explicitly.
339bf215546Sopenharmony_ci          */
340bf215546Sopenharmony_ci         for (x = x1; x < x2; x += ytile_span) {
341bf215546Sopenharmony_ci            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
342bf215546Sopenharmony_ci            xo += bytes_per_column;
343bf215546Sopenharmony_ci            swizzle ^= swizzle_bit;
344bf215546Sopenharmony_ci         }
345bf215546Sopenharmony_ci
346bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
347bf215546Sopenharmony_ci
348bf215546Sopenharmony_ci         src += src_pitch;
349bf215546Sopenharmony_ci      }
350bf215546Sopenharmony_ci   }
351bf215546Sopenharmony_ci
352bf215546Sopenharmony_ci   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
353bf215546Sopenharmony_ci      uint32_t xo = xo1;
354bf215546Sopenharmony_ci      uint32_t swizzle = swizzle1;
355bf215546Sopenharmony_ci
356bf215546Sopenharmony_ci      if (x0 != x1) {
357bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
358bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
359bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
360bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
361bf215546Sopenharmony_ci      }
362bf215546Sopenharmony_ci
363bf215546Sopenharmony_ci      /* Step by spans/columns.  As it happens, the swizzle bit flips
364bf215546Sopenharmony_ci       * at each step so we don't need to calculate it explicitly.
365bf215546Sopenharmony_ci       */
366bf215546Sopenharmony_ci      for (x = x1; x < x2; x += ytile_span) {
367bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
368bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
369bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
370bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
371bf215546Sopenharmony_ci         xo += bytes_per_column;
372bf215546Sopenharmony_ci         swizzle ^= swizzle_bit;
373bf215546Sopenharmony_ci      }
374bf215546Sopenharmony_ci
375bf215546Sopenharmony_ci      if (x2 != x3) {
376bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
377bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
378bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
379bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
380bf215546Sopenharmony_ci      }
381bf215546Sopenharmony_ci
382bf215546Sopenharmony_ci      src += 4 * src_pitch;
383bf215546Sopenharmony_ci   }
384bf215546Sopenharmony_ci
385bf215546Sopenharmony_ci   if (y2 != y3) {
386bf215546Sopenharmony_ci      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
387bf215546Sopenharmony_ci         uint32_t xo = xo1;
388bf215546Sopenharmony_ci         uint32_t swizzle = swizzle1;
389bf215546Sopenharmony_ci
390bf215546Sopenharmony_ci         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
391bf215546Sopenharmony_ci
392bf215546Sopenharmony_ci         /* Step by spans/columns.  As it happens, the swizzle bit flips
393bf215546Sopenharmony_ci          * at each step so we don't need to calculate it explicitly.
394bf215546Sopenharmony_ci          */
395bf215546Sopenharmony_ci         for (x = x1; x < x2; x += ytile_span) {
396bf215546Sopenharmony_ci            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
397bf215546Sopenharmony_ci            xo += bytes_per_column;
398bf215546Sopenharmony_ci            swizzle ^= swizzle_bit;
399bf215546Sopenharmony_ci         }
400bf215546Sopenharmony_ci
401bf215546Sopenharmony_ci         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
402bf215546Sopenharmony_ci
403bf215546Sopenharmony_ci         src += src_pitch;
404bf215546Sopenharmony_ci      }
405bf215546Sopenharmony_ci   }
406bf215546Sopenharmony_ci}
407bf215546Sopenharmony_ci
408bf215546Sopenharmony_ci/**
409bf215546Sopenharmony_ci * Copy texture data from X tile layout to linear.
410bf215546Sopenharmony_ci *
411bf215546Sopenharmony_ci * \copydoc tile_copy_fn
412bf215546Sopenharmony_ci */
413bf215546Sopenharmony_cistatic inline void
414bf215546Sopenharmony_cixtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
415bf215546Sopenharmony_ci                 uint32_t y0, uint32_t y1,
416bf215546Sopenharmony_ci                 char *dst, const char *src,
417bf215546Sopenharmony_ci                 int32_t dst_pitch,
418bf215546Sopenharmony_ci                 uint32_t swizzle_bit,
419bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy,
420bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy_align16)
421bf215546Sopenharmony_ci{
422bf215546Sopenharmony_ci   /* The copy destination offset for each range copied is the sum of
423bf215546Sopenharmony_ci    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
424bf215546Sopenharmony_ci    */
425bf215546Sopenharmony_ci   uint32_t xo, yo;
426bf215546Sopenharmony_ci
427bf215546Sopenharmony_ci   dst += (ptrdiff_t)y0 * dst_pitch;
428bf215546Sopenharmony_ci
429bf215546Sopenharmony_ci   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
430bf215546Sopenharmony_ci      /* Bits 9 and 10 of the copy destination offset control swizzling.
431bf215546Sopenharmony_ci       * Only 'yo' contributes to those bits in the total offset,
432bf215546Sopenharmony_ci       * so calculate 'swizzle' just once per row.
433bf215546Sopenharmony_ci       * Move bits 9 and 10 three and four places respectively down
434bf215546Sopenharmony_ci       * to bit 6 and xor them.
435bf215546Sopenharmony_ci       */
436bf215546Sopenharmony_ci      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
437bf215546Sopenharmony_ci
438bf215546Sopenharmony_ci      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
439bf215546Sopenharmony_ci
440bf215546Sopenharmony_ci      for (xo = x1; xo < x2; xo += xtile_span) {
441bf215546Sopenharmony_ci         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
442bf215546Sopenharmony_ci      }
443bf215546Sopenharmony_ci
444bf215546Sopenharmony_ci      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
445bf215546Sopenharmony_ci
446bf215546Sopenharmony_ci      dst += dst_pitch;
447bf215546Sopenharmony_ci   }
448bf215546Sopenharmony_ci}
449bf215546Sopenharmony_ci
450bf215546Sopenharmony_ci /**
451bf215546Sopenharmony_ci * Copy texture data from Y tile layout to linear.
452bf215546Sopenharmony_ci *
453bf215546Sopenharmony_ci * \copydoc tile_copy_fn
454bf215546Sopenharmony_ci */
455bf215546Sopenharmony_cistatic inline void
456bf215546Sopenharmony_ciytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
457bf215546Sopenharmony_ci                 uint32_t y0, uint32_t y3,
458bf215546Sopenharmony_ci                 char *dst, const char *src,
459bf215546Sopenharmony_ci                 int32_t dst_pitch,
460bf215546Sopenharmony_ci                 uint32_t swizzle_bit,
461bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy,
462bf215546Sopenharmony_ci                 isl_mem_copy_fn mem_copy_align16)
463bf215546Sopenharmony_ci{
464bf215546Sopenharmony_ci   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
465bf215546Sopenharmony_ci    * as the tile).  Thus the destination offset for (x,y) is the sum of:
466bf215546Sopenharmony_ci    *   (x % column_width)                    // position within column
467bf215546Sopenharmony_ci    *   (x / column_width) * bytes_per_column // column number * bytes per column
468bf215546Sopenharmony_ci    *   y * column_width
469bf215546Sopenharmony_ci    *
470bf215546Sopenharmony_ci    * The copy destination offset for each range copied is the sum of
471bf215546Sopenharmony_ci    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
472bf215546Sopenharmony_ci    */
473bf215546Sopenharmony_ci   const uint32_t column_width = ytile_span;
474bf215546Sopenharmony_ci   const uint32_t bytes_per_column = column_width * ytile_height;
475bf215546Sopenharmony_ci
476bf215546Sopenharmony_ci   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
477bf215546Sopenharmony_ci   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
478bf215546Sopenharmony_ci
479bf215546Sopenharmony_ci   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
480bf215546Sopenharmony_ci   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
481bf215546Sopenharmony_ci
482bf215546Sopenharmony_ci   /* Bit 9 of the destination offset control swizzling.
483bf215546Sopenharmony_ci    * Only the X offset contributes to bit 9 of the total offset,
484bf215546Sopenharmony_ci    * so swizzle can be calculated in advance for these X positions.
485bf215546Sopenharmony_ci    * Move bit 9 three places down to bit 6.
486bf215546Sopenharmony_ci    */
487bf215546Sopenharmony_ci   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
488bf215546Sopenharmony_ci   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
489bf215546Sopenharmony_ci
490bf215546Sopenharmony_ci   uint32_t x, yo;
491bf215546Sopenharmony_ci
492bf215546Sopenharmony_ci   dst += (ptrdiff_t)y0 * dst_pitch;
493bf215546Sopenharmony_ci
494bf215546Sopenharmony_ci   if (y0 != y1) {
495bf215546Sopenharmony_ci      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
496bf215546Sopenharmony_ci         uint32_t xo = xo1;
497bf215546Sopenharmony_ci         uint32_t swizzle = swizzle1;
498bf215546Sopenharmony_ci
499bf215546Sopenharmony_ci         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
500bf215546Sopenharmony_ci
501bf215546Sopenharmony_ci         /* Step by spans/columns.  As it happens, the swizzle bit flips
502bf215546Sopenharmony_ci          * at each step so we don't need to calculate it explicitly.
503bf215546Sopenharmony_ci          */
504bf215546Sopenharmony_ci         for (x = x1; x < x2; x += ytile_span) {
505bf215546Sopenharmony_ci            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
506bf215546Sopenharmony_ci            xo += bytes_per_column;
507bf215546Sopenharmony_ci            swizzle ^= swizzle_bit;
508bf215546Sopenharmony_ci         }
509bf215546Sopenharmony_ci
510bf215546Sopenharmony_ci         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
511bf215546Sopenharmony_ci
512bf215546Sopenharmony_ci         dst += dst_pitch;
513bf215546Sopenharmony_ci      }
514bf215546Sopenharmony_ci   }
515bf215546Sopenharmony_ci
516bf215546Sopenharmony_ci   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
517bf215546Sopenharmony_ci      uint32_t xo = xo1;
518bf215546Sopenharmony_ci      uint32_t swizzle = swizzle1;
519bf215546Sopenharmony_ci
520bf215546Sopenharmony_ci      if (x0 != x1) {
521bf215546Sopenharmony_ci         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
522bf215546Sopenharmony_ci         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
523bf215546Sopenharmony_ci         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
524bf215546Sopenharmony_ci         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
525bf215546Sopenharmony_ci      }
526bf215546Sopenharmony_ci
527bf215546Sopenharmony_ci      /* Step by spans/columns.  As it happens, the swizzle bit flips
528bf215546Sopenharmony_ci       * at each step so we don't need to calculate it explicitly.
529bf215546Sopenharmony_ci       */
530bf215546Sopenharmony_ci      for (x = x1; x < x2; x += ytile_span) {
531bf215546Sopenharmony_ci         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
532bf215546Sopenharmony_ci         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
533bf215546Sopenharmony_ci         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
534bf215546Sopenharmony_ci         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
535bf215546Sopenharmony_ci         xo += bytes_per_column;
536bf215546Sopenharmony_ci         swizzle ^= swizzle_bit;
537bf215546Sopenharmony_ci      }
538bf215546Sopenharmony_ci
539bf215546Sopenharmony_ci      if (x2 != x3) {
540bf215546Sopenharmony_ci         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
541bf215546Sopenharmony_ci         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
542bf215546Sopenharmony_ci         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
543bf215546Sopenharmony_ci         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
544bf215546Sopenharmony_ci      }
545bf215546Sopenharmony_ci
546bf215546Sopenharmony_ci      dst += 4 * dst_pitch;
547bf215546Sopenharmony_ci   }
548bf215546Sopenharmony_ci
549bf215546Sopenharmony_ci   if (y2 != y3) {
550bf215546Sopenharmony_ci      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
551bf215546Sopenharmony_ci         uint32_t xo = xo1;
552bf215546Sopenharmony_ci         uint32_t swizzle = swizzle1;
553bf215546Sopenharmony_ci
554bf215546Sopenharmony_ci         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
555bf215546Sopenharmony_ci
556bf215546Sopenharmony_ci         /* Step by spans/columns.  As it happens, the swizzle bit flips
557bf215546Sopenharmony_ci          * at each step so we don't need to calculate it explicitly.
558bf215546Sopenharmony_ci          */
559bf215546Sopenharmony_ci         for (x = x1; x < x2; x += ytile_span) {
560bf215546Sopenharmony_ci            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
561bf215546Sopenharmony_ci            xo += bytes_per_column;
562bf215546Sopenharmony_ci            swizzle ^= swizzle_bit;
563bf215546Sopenharmony_ci         }
564bf215546Sopenharmony_ci
565bf215546Sopenharmony_ci         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
566bf215546Sopenharmony_ci
567bf215546Sopenharmony_ci         dst += dst_pitch;
568bf215546Sopenharmony_ci      }
569bf215546Sopenharmony_ci   }
570bf215546Sopenharmony_ci}
571bf215546Sopenharmony_ci
#if defined(INLINE_SSE41)
/**
 * memcpy()-shaped copy that reads the source with SSE4.1 streaming loads.
 *
 * Two sizes take the streaming path:
 *   - exactly 16 bytes: one 128-bit streaming load, one unaligned store;
 *   - exactly 64 bytes: four 128-bit streaming loads, then four unaligned
 *     stores.
 * Every other size is asserted to be below 64 bytes and is forwarded to
 * plain memcpy().
 *
 * 'src' has to meet the alignment requirement of _mm_stream_load_si128 for
 * the 16 and 64 byte cases; 'dest' is written with _mm_storeu_si128 and so
 * carries no such requirement.
 *
 * \return dest (or whatever memcpy() returns on the fallback path).
 */
static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
{
   switch (count) {
   case 16: {
      __m128i chunk = _mm_stream_load_si128((__m128i *)src);

      _mm_storeu_si128((__m128i *)dest, chunk);
      return dest;
   }
   case 64: {
      /* Issue all four loads before the first store. */
      __m128i chunk0 = _mm_stream_load_si128(((__m128i *)src) + 0);
      __m128i chunk1 = _mm_stream_load_si128(((__m128i *)src) + 1);
      __m128i chunk2 = _mm_stream_load_si128(((__m128i *)src) + 2);
      __m128i chunk3 = _mm_stream_load_si128(((__m128i *)src) + 3);

      _mm_storeu_si128(((__m128i *)dest) + 0, chunk0);
      _mm_storeu_si128(((__m128i *)dest) + 1, chunk1);
      _mm_storeu_si128(((__m128i *)dest) + 2, chunk2);
      _mm_storeu_si128(((__m128i *)dest) + 3, chunk3);
      return dest;
   }
   default:
      /* Partial span: too small for the streaming path. */
      assert(count < 64); /* and (count < 16) for ytiled */
      return memcpy(dest, src, count);
   }
}
#endif
596bf215546Sopenharmony_ci
597bf215546Sopenharmony_cistatic isl_mem_copy_fn
598bf215546Sopenharmony_cichoose_copy_function(isl_memcpy_type copy_type)
599bf215546Sopenharmony_ci{
600bf215546Sopenharmony_ci   switch(copy_type) {
601bf215546Sopenharmony_ci   case ISL_MEMCPY:
602bf215546Sopenharmony_ci      return memcpy;
603bf215546Sopenharmony_ci   case ISL_MEMCPY_BGRA8:
604bf215546Sopenharmony_ci      return rgba8_copy;
605bf215546Sopenharmony_ci   case ISL_MEMCPY_STREAMING_LOAD:
606bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
607bf215546Sopenharmony_ci      return _memcpy_streaming_load;
608bf215546Sopenharmony_ci#else
609bf215546Sopenharmony_ci      unreachable("ISL_MEMCOPY_STREAMING_LOAD requires sse4.1");
610bf215546Sopenharmony_ci#endif
611bf215546Sopenharmony_ci   case ISL_MEMCPY_INVALID:
612bf215546Sopenharmony_ci      unreachable("invalid copy_type");
613bf215546Sopenharmony_ci   }
614bf215546Sopenharmony_ci   unreachable("unhandled copy_type");
615bf215546Sopenharmony_ci   return NULL;
616bf215546Sopenharmony_ci}
617bf215546Sopenharmony_ci
618bf215546Sopenharmony_ci/**
619bf215546Sopenharmony_ci * Copy texture data from linear to X tile layout, faster.
620bf215546Sopenharmony_ci *
621bf215546Sopenharmony_ci * Same as \ref linear_to_xtiled but faster, because it passes constant
622bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code
623bf215546Sopenharmony_ci * optimized for those cases.
624bf215546Sopenharmony_ci *
625bf215546Sopenharmony_ci * \copydoc tile_copy_fn
626bf215546Sopenharmony_ci */
627bf215546Sopenharmony_cistatic FLATTEN void
628bf215546Sopenharmony_cilinear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
629bf215546Sopenharmony_ci                        uint32_t y0, uint32_t y1,
630bf215546Sopenharmony_ci                        char *dst, const char *src,
631bf215546Sopenharmony_ci                        int32_t src_pitch,
632bf215546Sopenharmony_ci                        uint32_t swizzle_bit,
633bf215546Sopenharmony_ci                        isl_memcpy_type copy_type)
634bf215546Sopenharmony_ci{
635bf215546Sopenharmony_ci   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
636bf215546Sopenharmony_ci
637bf215546Sopenharmony_ci   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
638bf215546Sopenharmony_ci      if (mem_copy == memcpy)
639bf215546Sopenharmony_ci         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
640bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
641bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
642bf215546Sopenharmony_ci         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
643bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
644bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_dst);
645bf215546Sopenharmony_ci      else
646bf215546Sopenharmony_ci         unreachable("not reached");
647bf215546Sopenharmony_ci   } else {
648bf215546Sopenharmony_ci      if (mem_copy == memcpy)
649bf215546Sopenharmony_ci         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
650bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
651bf215546Sopenharmony_ci                                 memcpy, memcpy);
652bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
653bf215546Sopenharmony_ci         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
654bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
655bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_dst);
656bf215546Sopenharmony_ci      else
657bf215546Sopenharmony_ci         unreachable("not reached");
658bf215546Sopenharmony_ci   }
659bf215546Sopenharmony_ci   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
660bf215546Sopenharmony_ci                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
661bf215546Sopenharmony_ci}
662bf215546Sopenharmony_ci
663bf215546Sopenharmony_ci/**
664bf215546Sopenharmony_ci * Copy texture data from linear to Y tile layout, faster.
665bf215546Sopenharmony_ci *
666bf215546Sopenharmony_ci * Same as \ref linear_to_ytiled but faster, because it passes constant
667bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code
668bf215546Sopenharmony_ci * optimized for those cases.
669bf215546Sopenharmony_ci *
670bf215546Sopenharmony_ci * \copydoc tile_copy_fn
671bf215546Sopenharmony_ci */
672bf215546Sopenharmony_cistatic FLATTEN void
673bf215546Sopenharmony_cilinear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
674bf215546Sopenharmony_ci                        uint32_t y0, uint32_t y1,
675bf215546Sopenharmony_ci                        char *dst, const char *src,
676bf215546Sopenharmony_ci                        int32_t src_pitch,
677bf215546Sopenharmony_ci                        uint32_t swizzle_bit,
678bf215546Sopenharmony_ci                        isl_memcpy_type copy_type)
679bf215546Sopenharmony_ci{
680bf215546Sopenharmony_ci   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
681bf215546Sopenharmony_ci
682bf215546Sopenharmony_ci   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
683bf215546Sopenharmony_ci      if (mem_copy == memcpy)
684bf215546Sopenharmony_ci         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
685bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
686bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
687bf215546Sopenharmony_ci         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
688bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
689bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_dst);
690bf215546Sopenharmony_ci      else
691bf215546Sopenharmony_ci         unreachable("not reached");
692bf215546Sopenharmony_ci   } else {
693bf215546Sopenharmony_ci      if (mem_copy == memcpy)
694bf215546Sopenharmony_ci         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
695bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
696bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
697bf215546Sopenharmony_ci         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
698bf215546Sopenharmony_ci                                 dst, src, src_pitch, swizzle_bit,
699bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_dst);
700bf215546Sopenharmony_ci      else
701bf215546Sopenharmony_ci         unreachable("not reached");
702bf215546Sopenharmony_ci   }
703bf215546Sopenharmony_ci   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
704bf215546Sopenharmony_ci                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
705bf215546Sopenharmony_ci}
706bf215546Sopenharmony_ci
707bf215546Sopenharmony_ci/**
708bf215546Sopenharmony_ci * Copy texture data from X tile layout to linear, faster.
709bf215546Sopenharmony_ci *
710bf215546Sopenharmony_ci * Same as \ref xtile_to_linear but faster, because it passes constant
711bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code
712bf215546Sopenharmony_ci * optimized for those cases.
713bf215546Sopenharmony_ci *
714bf215546Sopenharmony_ci * \copydoc tile_copy_fn
715bf215546Sopenharmony_ci */
716bf215546Sopenharmony_cistatic FLATTEN void
717bf215546Sopenharmony_cixtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
718bf215546Sopenharmony_ci                        uint32_t y0, uint32_t y1,
719bf215546Sopenharmony_ci                        char *dst, const char *src,
720bf215546Sopenharmony_ci                        int32_t dst_pitch,
721bf215546Sopenharmony_ci                        uint32_t swizzle_bit,
722bf215546Sopenharmony_ci                        isl_memcpy_type copy_type)
723bf215546Sopenharmony_ci{
724bf215546Sopenharmony_ci   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
725bf215546Sopenharmony_ci
726bf215546Sopenharmony_ci   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
727bf215546Sopenharmony_ci      if (mem_copy == memcpy)
728bf215546Sopenharmony_ci         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
729bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
730bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
731bf215546Sopenharmony_ci         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
732bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
733bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_src);
734bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
735bf215546Sopenharmony_ci      else if (mem_copy == _memcpy_streaming_load)
736bf215546Sopenharmony_ci         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
737bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
738bf215546Sopenharmony_ci                                 memcpy, _memcpy_streaming_load);
739bf215546Sopenharmony_ci#endif
740bf215546Sopenharmony_ci      else
741bf215546Sopenharmony_ci         unreachable("not reached");
742bf215546Sopenharmony_ci   } else {
743bf215546Sopenharmony_ci      if (mem_copy == memcpy)
744bf215546Sopenharmony_ci         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
745bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
746bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
747bf215546Sopenharmony_ci         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
748bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
749bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_src);
750bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
751bf215546Sopenharmony_ci      else if (mem_copy == _memcpy_streaming_load)
752bf215546Sopenharmony_ci         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
753bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
754bf215546Sopenharmony_ci                                 memcpy, _memcpy_streaming_load);
755bf215546Sopenharmony_ci#endif
756bf215546Sopenharmony_ci      else
757bf215546Sopenharmony_ci         unreachable("not reached");
758bf215546Sopenharmony_ci   }
759bf215546Sopenharmony_ci   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
760bf215546Sopenharmony_ci                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
761bf215546Sopenharmony_ci}
762bf215546Sopenharmony_ci
763bf215546Sopenharmony_ci/**
764bf215546Sopenharmony_ci * Copy texture data from Y tile layout to linear, faster.
765bf215546Sopenharmony_ci *
766bf215546Sopenharmony_ci * Same as \ref ytile_to_linear but faster, because it passes constant
767bf215546Sopenharmony_ci * parameters for common cases, allowing the compiler to inline code
768bf215546Sopenharmony_ci * optimized for those cases.
769bf215546Sopenharmony_ci *
770bf215546Sopenharmony_ci * \copydoc tile_copy_fn
771bf215546Sopenharmony_ci */
772bf215546Sopenharmony_cistatic FLATTEN void
773bf215546Sopenharmony_ciytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
774bf215546Sopenharmony_ci                        uint32_t y0, uint32_t y1,
775bf215546Sopenharmony_ci                        char *dst, const char *src,
776bf215546Sopenharmony_ci                        int32_t dst_pitch,
777bf215546Sopenharmony_ci                        uint32_t swizzle_bit,
778bf215546Sopenharmony_ci                        isl_memcpy_type copy_type)
779bf215546Sopenharmony_ci{
780bf215546Sopenharmony_ci   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
781bf215546Sopenharmony_ci
782bf215546Sopenharmony_ci   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
783bf215546Sopenharmony_ci      if (mem_copy == memcpy)
784bf215546Sopenharmony_ci         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
785bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
786bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
787bf215546Sopenharmony_ci         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
788bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
789bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_src);
790bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
791bf215546Sopenharmony_ci      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
792bf215546Sopenharmony_ci         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
793bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
794bf215546Sopenharmony_ci                                 memcpy, _memcpy_streaming_load);
795bf215546Sopenharmony_ci#endif
796bf215546Sopenharmony_ci      else
797bf215546Sopenharmony_ci         unreachable("not reached");
798bf215546Sopenharmony_ci   } else {
799bf215546Sopenharmony_ci      if (mem_copy == memcpy)
800bf215546Sopenharmony_ci         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
801bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
802bf215546Sopenharmony_ci      else if (mem_copy == rgba8_copy)
803bf215546Sopenharmony_ci         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
804bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
805bf215546Sopenharmony_ci                                 rgba8_copy, rgba8_copy_aligned_src);
806bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
807bf215546Sopenharmony_ci      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
808bf215546Sopenharmony_ci         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
809bf215546Sopenharmony_ci                                 dst, src, dst_pitch, swizzle_bit,
810bf215546Sopenharmony_ci                                 memcpy, _memcpy_streaming_load);
811bf215546Sopenharmony_ci#endif
812bf215546Sopenharmony_ci      else
813bf215546Sopenharmony_ci         unreachable("not reached");
814bf215546Sopenharmony_ci   }
815bf215546Sopenharmony_ci   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
816bf215546Sopenharmony_ci                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
817bf215546Sopenharmony_ci}
818bf215546Sopenharmony_ci
819bf215546Sopenharmony_ci/**
820bf215546Sopenharmony_ci * Copy from linear to tiled texture.
821bf215546Sopenharmony_ci *
822bf215546Sopenharmony_ci * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
823bf215546Sopenharmony_ci * pieces that do not cross tile boundaries and copy each piece with a tile
824bf215546Sopenharmony_ci * copy function (\ref tile_copy_fn).
825bf215546Sopenharmony_ci * The X range is in bytes, i.e. pixels * bytes-per-pixel.
826bf215546Sopenharmony_ci * The Y range is in pixels (i.e. unitless).
827bf215546Sopenharmony_ci * 'dst' is the address of (0, 0) in the destination tiled texture.
828bf215546Sopenharmony_ci * 'src' is the address of (xt1, yt1) in the source linear texture.
829bf215546Sopenharmony_ci */
830bf215546Sopenharmony_cistatic void
831bf215546Sopenharmony_cilinear_to_tiled(uint32_t xt1, uint32_t xt2,
832bf215546Sopenharmony_ci                      uint32_t yt1, uint32_t yt2,
833bf215546Sopenharmony_ci                      char *dst, const char *src,
834bf215546Sopenharmony_ci                      uint32_t dst_pitch, int32_t src_pitch,
835bf215546Sopenharmony_ci                      bool has_swizzling,
836bf215546Sopenharmony_ci                      enum isl_tiling tiling,
837bf215546Sopenharmony_ci                      isl_memcpy_type copy_type)
838bf215546Sopenharmony_ci{
839bf215546Sopenharmony_ci   tile_copy_fn tile_copy;
840bf215546Sopenharmony_ci   uint32_t xt0, xt3;
841bf215546Sopenharmony_ci   uint32_t yt0, yt3;
842bf215546Sopenharmony_ci   uint32_t xt, yt;
843bf215546Sopenharmony_ci   uint32_t tw, th, span;
844bf215546Sopenharmony_ci   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
845bf215546Sopenharmony_ci
846bf215546Sopenharmony_ci   if (tiling == ISL_TILING_X) {
847bf215546Sopenharmony_ci      tw = xtile_width;
848bf215546Sopenharmony_ci      th = xtile_height;
849bf215546Sopenharmony_ci      span = xtile_span;
850bf215546Sopenharmony_ci      tile_copy = linear_to_xtiled_faster;
851bf215546Sopenharmony_ci   } else if (tiling == ISL_TILING_Y0) {
852bf215546Sopenharmony_ci      tw = ytile_width;
853bf215546Sopenharmony_ci      th = ytile_height;
854bf215546Sopenharmony_ci      span = ytile_span;
855bf215546Sopenharmony_ci      tile_copy = linear_to_ytiled_faster;
856bf215546Sopenharmony_ci   } else {
857bf215546Sopenharmony_ci      unreachable("unsupported tiling");
858bf215546Sopenharmony_ci   }
859bf215546Sopenharmony_ci
860bf215546Sopenharmony_ci   /* Round out to tile boundaries. */
861bf215546Sopenharmony_ci   xt0 = ALIGN_DOWN(xt1, tw);
862bf215546Sopenharmony_ci   xt3 = ALIGN_UP  (xt2, tw);
863bf215546Sopenharmony_ci   yt0 = ALIGN_DOWN(yt1, th);
864bf215546Sopenharmony_ci   yt3 = ALIGN_UP  (yt2, th);
865bf215546Sopenharmony_ci
866bf215546Sopenharmony_ci   /* Loop over all tiles to which we have something to copy.
867bf215546Sopenharmony_ci    * 'xt' and 'yt' are the origin of the destination tile, whether copying
868bf215546Sopenharmony_ci    * copying a full or partial tile.
869bf215546Sopenharmony_ci    * tile_copy() copies one tile or partial tile.
870bf215546Sopenharmony_ci    * Looping x inside y is the faster memory access pattern.
871bf215546Sopenharmony_ci    */
872bf215546Sopenharmony_ci   for (yt = yt0; yt < yt3; yt += th) {
873bf215546Sopenharmony_ci      for (xt = xt0; xt < xt3; xt += tw) {
874bf215546Sopenharmony_ci         /* The area to update is [x0,x3) x [y0,y1).
875bf215546Sopenharmony_ci          * May not want the whole tile, hence the min and max.
876bf215546Sopenharmony_ci          */
877bf215546Sopenharmony_ci         uint32_t x0 = MAX2(xt1, xt);
878bf215546Sopenharmony_ci         uint32_t y0 = MAX2(yt1, yt);
879bf215546Sopenharmony_ci         uint32_t x3 = MIN2(xt2, xt + tw);
880bf215546Sopenharmony_ci         uint32_t y1 = MIN2(yt2, yt + th);
881bf215546Sopenharmony_ci
882bf215546Sopenharmony_ci         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
883bf215546Sopenharmony_ci          * the middle interval is the longest span-aligned part.
884bf215546Sopenharmony_ci          * The sub-ranges could be empty.
885bf215546Sopenharmony_ci          */
886bf215546Sopenharmony_ci         uint32_t x1, x2;
887bf215546Sopenharmony_ci         x1 = ALIGN_UP(x0, span);
888bf215546Sopenharmony_ci         if (x1 > x3)
889bf215546Sopenharmony_ci            x1 = x2 = x3;
890bf215546Sopenharmony_ci         else
891bf215546Sopenharmony_ci            x2 = ALIGN_DOWN(x3, span);
892bf215546Sopenharmony_ci
893bf215546Sopenharmony_ci         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
894bf215546Sopenharmony_ci         assert(x1 - x0 < span && x3 - x2 < span);
895bf215546Sopenharmony_ci         assert(x3 - x0 <= tw);
896bf215546Sopenharmony_ci         assert((x2 - x1) % span == 0);
897bf215546Sopenharmony_ci
898bf215546Sopenharmony_ci         /* Translate by (xt,yt) for single-tile copier. */
899bf215546Sopenharmony_ci         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
900bf215546Sopenharmony_ci                   y0-yt, y1-yt,
901bf215546Sopenharmony_ci                   dst + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * dst_pitch,
902bf215546Sopenharmony_ci                   src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
903bf215546Sopenharmony_ci                   src_pitch,
904bf215546Sopenharmony_ci                   swizzle_bit,
905bf215546Sopenharmony_ci                   copy_type);
906bf215546Sopenharmony_ci      }
907bf215546Sopenharmony_ci   }
908bf215546Sopenharmony_ci}
909bf215546Sopenharmony_ci
910bf215546Sopenharmony_ci/**
911bf215546Sopenharmony_ci * Copy from tiled to linear texture.
912bf215546Sopenharmony_ci *
913bf215546Sopenharmony_ci * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
914bf215546Sopenharmony_ci * pieces that do not cross tile boundaries and copy each piece with a tile
915bf215546Sopenharmony_ci * copy function (\ref tile_copy_fn).
916bf215546Sopenharmony_ci * The X range is in bytes, i.e. pixels * bytes-per-pixel.
917bf215546Sopenharmony_ci * The Y range is in pixels (i.e. unitless).
918bf215546Sopenharmony_ci * 'dst' is the address of (xt1, yt1) in the destination linear texture.
919bf215546Sopenharmony_ci * 'src' is the address of (0, 0) in the source tiled texture.
920bf215546Sopenharmony_ci */
921bf215546Sopenharmony_cistatic void
922bf215546Sopenharmony_citiled_to_linear(uint32_t xt1, uint32_t xt2,
923bf215546Sopenharmony_ci                      uint32_t yt1, uint32_t yt2,
924bf215546Sopenharmony_ci                      char *dst, const char *src,
925bf215546Sopenharmony_ci                      int32_t dst_pitch, uint32_t src_pitch,
926bf215546Sopenharmony_ci                      bool has_swizzling,
927bf215546Sopenharmony_ci                      enum isl_tiling tiling,
928bf215546Sopenharmony_ci                      isl_memcpy_type copy_type)
929bf215546Sopenharmony_ci{
930bf215546Sopenharmony_ci   tile_copy_fn tile_copy;
931bf215546Sopenharmony_ci   uint32_t xt0, xt3;
932bf215546Sopenharmony_ci   uint32_t yt0, yt3;
933bf215546Sopenharmony_ci   uint32_t xt, yt;
934bf215546Sopenharmony_ci   uint32_t tw, th, span;
935bf215546Sopenharmony_ci   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
936bf215546Sopenharmony_ci
937bf215546Sopenharmony_ci   if (tiling == ISL_TILING_X) {
938bf215546Sopenharmony_ci      tw = xtile_width;
939bf215546Sopenharmony_ci      th = xtile_height;
940bf215546Sopenharmony_ci      span = xtile_span;
941bf215546Sopenharmony_ci      tile_copy = xtiled_to_linear_faster;
942bf215546Sopenharmony_ci   } else if (tiling == ISL_TILING_Y0) {
943bf215546Sopenharmony_ci      tw = ytile_width;
944bf215546Sopenharmony_ci      th = ytile_height;
945bf215546Sopenharmony_ci      span = ytile_span;
946bf215546Sopenharmony_ci      tile_copy = ytiled_to_linear_faster;
947bf215546Sopenharmony_ci   } else {
948bf215546Sopenharmony_ci      unreachable("unsupported tiling");
949bf215546Sopenharmony_ci   }
950bf215546Sopenharmony_ci
951bf215546Sopenharmony_ci#if defined(INLINE_SSE41)
952bf215546Sopenharmony_ci   if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
953bf215546Sopenharmony_ci      /* The hidden cacheline sized register used by movntdqa can apparently
954bf215546Sopenharmony_ci       * give you stale data, so do an mfence to invalidate it.
955bf215546Sopenharmony_ci       */
956bf215546Sopenharmony_ci      _mm_mfence();
957bf215546Sopenharmony_ci   }
958bf215546Sopenharmony_ci#endif
959bf215546Sopenharmony_ci
960bf215546Sopenharmony_ci   /* Round out to tile boundaries. */
961bf215546Sopenharmony_ci   xt0 = ALIGN_DOWN(xt1, tw);
962bf215546Sopenharmony_ci   xt3 = ALIGN_UP  (xt2, tw);
963bf215546Sopenharmony_ci   yt0 = ALIGN_DOWN(yt1, th);
964bf215546Sopenharmony_ci   yt3 = ALIGN_UP  (yt2, th);
965bf215546Sopenharmony_ci
966bf215546Sopenharmony_ci   /* Loop over all tiles to which we have something to copy.
967bf215546Sopenharmony_ci    * 'xt' and 'yt' are the origin of the destination tile, whether copying
968bf215546Sopenharmony_ci    * copying a full or partial tile.
969bf215546Sopenharmony_ci    * tile_copy() copies one tile or partial tile.
970bf215546Sopenharmony_ci    * Looping x inside y is the faster memory access pattern.
971bf215546Sopenharmony_ci    */
972bf215546Sopenharmony_ci   for (yt = yt0; yt < yt3; yt += th) {
973bf215546Sopenharmony_ci      for (xt = xt0; xt < xt3; xt += tw) {
974bf215546Sopenharmony_ci         /* The area to update is [x0,x3) x [y0,y1).
975bf215546Sopenharmony_ci          * May not want the whole tile, hence the min and max.
976bf215546Sopenharmony_ci          */
977bf215546Sopenharmony_ci         uint32_t x0 = MAX2(xt1, xt);
978bf215546Sopenharmony_ci         uint32_t y0 = MAX2(yt1, yt);
979bf215546Sopenharmony_ci         uint32_t x3 = MIN2(xt2, xt + tw);
980bf215546Sopenharmony_ci         uint32_t y1 = MIN2(yt2, yt + th);
981bf215546Sopenharmony_ci
982bf215546Sopenharmony_ci         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
983bf215546Sopenharmony_ci          * the middle interval is the longest span-aligned part.
984bf215546Sopenharmony_ci          * The sub-ranges could be empty.
985bf215546Sopenharmony_ci          */
986bf215546Sopenharmony_ci         uint32_t x1, x2;
987bf215546Sopenharmony_ci         x1 = ALIGN_UP(x0, span);
988bf215546Sopenharmony_ci         if (x1 > x3)
989bf215546Sopenharmony_ci            x1 = x2 = x3;
990bf215546Sopenharmony_ci         else
991bf215546Sopenharmony_ci            x2 = ALIGN_DOWN(x3, span);
992bf215546Sopenharmony_ci
993bf215546Sopenharmony_ci         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
994bf215546Sopenharmony_ci         assert(x1 - x0 < span && x3 - x2 < span);
995bf215546Sopenharmony_ci         assert(x3 - x0 <= tw);
996bf215546Sopenharmony_ci         assert((x2 - x1) % span == 0);
997bf215546Sopenharmony_ci
998bf215546Sopenharmony_ci         /* Translate by (xt,yt) for single-tile copier. */
999bf215546Sopenharmony_ci         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
1000bf215546Sopenharmony_ci                   y0-yt, y1-yt,
1001bf215546Sopenharmony_ci                   dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
1002bf215546Sopenharmony_ci                   src + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * src_pitch,
1003bf215546Sopenharmony_ci                   dst_pitch,
1004bf215546Sopenharmony_ci                   swizzle_bit,
1005bf215546Sopenharmony_ci                   copy_type);
1006bf215546Sopenharmony_ci      }
1007bf215546Sopenharmony_ci   }
1008bf215546Sopenharmony_ci}
1009