1/*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 *    Jonathan Marek <jonathan@marek.ca>
7 */
8
9#include "tu_clear_blit.h"
10
11#include "ir3/ir3_nir.h"
12
13#include "util/format_r11g11b10f.h"
14#include "util/format_rgb9e5.h"
15#include "util/format_srgb.h"
16#include "util/half_float.h"
17#include "compiler/nir/nir_builder.h"
18
19#include "tu_cmd_buffer.h"
20#include "tu_cs.h"
21#include "tu_formats.h"
22#include "tu_image.h"
23#include "tu_tracepoints.h"
24
25static uint32_t
26tu_pack_float32_for_unorm(float val, int bits)
27{
28   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
29}
30
31/* r2d_ = BLIT_OP_SCALE operations */
32
33static enum a6xx_2d_ifmt
34format_to_ifmt(enum pipe_format format)
35{
36   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
37       format == PIPE_FORMAT_Z24X8_UNORM)
38      return R2D_UNORM8;
39
40   /* get_component_bits doesn't work with depth/stencil formats: */
41   if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
42      return R2D_FLOAT32;
43   if (format == PIPE_FORMAT_S8_UINT)
44      return R2D_INT8;
45   if (format == PIPE_FORMAT_A8_UNORM)
46      return R2D_UNORM8;
47
48   /* use the size of the red channel to find the corresponding "ifmt" */
49   bool is_int = util_format_is_pure_integer(format);
50   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
51   case 4: case 5: case 8:
52      return is_int ? R2D_INT8 : R2D_UNORM8;
53   case 10: case 11:
54      return is_int ? R2D_INT16 : R2D_FLOAT16;
55   case 16:
56      if (util_format_is_float(format))
57         return R2D_FLOAT16;
58      return is_int ? R2D_INT16 : R2D_FLOAT32;
59   case 32:
60      return is_int ? R2D_INT32 : R2D_FLOAT32;
61    default:
62      unreachable("bad format");
63      return 0;
64   }
65}
66
67static void
68r2d_coords(struct tu_cs *cs,
69           const VkOffset2D *dst,
70           const VkOffset2D *src,
71           const VkExtent2D *extent)
72{
73   tu_cs_emit_regs(cs,
74      A6XX_GRAS_2D_DST_TL(.x = dst->x,                     .y = dst->y),
75      A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
76
77   if (!src)
78      return;
79
80   tu_cs_emit_regs(cs,
81                   A6XX_GRAS_2D_SRC_TL_X(src->x),
82                   A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
83                   A6XX_GRAS_2D_SRC_TL_Y(src->y),
84                   A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
85}
86
87static void
88r2d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
89{
90   uint32_t clear_value[4] = {};
91
92   switch (format) {
93   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
94   case PIPE_FORMAT_Z24X8_UNORM:
95      /* cleared as r8g8b8a8_unorm using special format */
96      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
97      clear_value[1] = clear_value[0] >> 8;
98      clear_value[2] = clear_value[0] >> 16;
99      clear_value[3] = val->depthStencil.stencil;
100      break;
101   case PIPE_FORMAT_Z16_UNORM:
102   case PIPE_FORMAT_Z32_FLOAT:
103      /* R2D_FLOAT32 */
104      clear_value[0] = fui(val->depthStencil.depth);
105      break;
106   case PIPE_FORMAT_S8_UINT:
107      clear_value[0] = val->depthStencil.stencil;
108      break;
109   case PIPE_FORMAT_R9G9B9E5_FLOAT:
110      /* cleared as UINT32 */
111      clear_value[0] = float3_to_rgb9e5(val->color.float32);
112      break;
113   default:
114      assert(!util_format_is_depth_or_stencil(format));
115      const struct util_format_description *desc = util_format_description(format);
116      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
117
118      assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
119             format == PIPE_FORMAT_R11G11B10_FLOAT);
120
121      for (unsigned i = 0; i < desc->nr_channels; i++) {
122         const struct util_format_channel_description *ch = &desc->channel[i];
123         if (ifmt == R2D_UNORM8) {
124            float linear = val->color.float32[i];
125            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
126               linear = util_format_linear_to_srgb_float(val->color.float32[i]);
127
128            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
129               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
130            else
131               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
132         } else if (ifmt == R2D_FLOAT16) {
133            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
134         } else {
135            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
136                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
137            clear_value[i] = val->color.uint32[i];
138         }
139      }
140      break;
141   }
142
143   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
144   tu_cs_emit_array(cs, clear_value, 4);
145}
146
147static void
148fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
149                 enum a6xx_format *fmt)
150{
151   /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
152    * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
153    * We also have to move stencil, which is normally in the .w channel, into
154    * the right channel. Reintepreting the S8 texture as A8_UNORM solves both
155    * problems, and avoids using a swap, which seems to sometimes not work
156    * with a D24S8 source, or a texture swizzle which is only supported with
157    * the 3d path. Sometimes this blit happens on already-constructed
158    * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
159    */
160   if (*src_format == PIPE_FORMAT_S8_UINT &&
161       (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
162        dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
163      *fmt = FMT6_A8_UNORM;
164      *src_format = PIPE_FORMAT_A8_UNORM;
165   }
166}
167
168static void
169fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
170                 enum a6xx_format *fmt)
171{
172   if (*dst_format == PIPE_FORMAT_S8_UINT &&
173       (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
174        src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
175      *dst_format = PIPE_FORMAT_A8_UNORM;
176      *fmt = FMT6_A8_UNORM;
177   }
178}
179
180static void
181r2d_src(struct tu_cmd_buffer *cmd,
182        struct tu_cs *cs,
183        const struct fdl6_view *iview,
184        uint32_t layer,
185        VkFilter filter,
186        enum pipe_format dst_format)
187{
188   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
189   if (filter != VK_FILTER_NEAREST)
190      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
191
192   enum a6xx_format fmt = (src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
193   enum pipe_format src_format = iview->format;
194   fixup_src_format(&src_format, dst_format, &fmt);
195
196   src_info =
197      (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
198      A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);
199
200   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
201   tu_cs_emit(cs, src_info);
202   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
203   tu_cs_image_ref_2d(cs, iview, layer, true);
204
205   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
206   tu_cs_image_flag_ref(cs, iview, layer);
207}
208
209static void
210r2d_src_depth(struct tu_cmd_buffer *cmd,
211                struct tu_cs *cs,
212                const struct tu_image_view *iview,
213                uint32_t layer,
214                VkFilter filter)
215{
216   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
217   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
218   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
219   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
220   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
221   tu_cs_emit(cs, iview->depth_PITCH << 9);
222
223   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
224   tu_cs_image_flag_ref(cs, &iview->view, layer);
225}
226
227static void
228r2d_src_stencil(struct tu_cmd_buffer *cmd,
229                struct tu_cs *cs,
230                const struct tu_image_view *iview,
231                uint32_t layer,
232                VkFilter filter)
233{
234   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
235   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
236   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
237   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
238   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
239   tu_cs_emit(cs, iview->stencil_PITCH << 9);
240}
241
242static void
243r2d_src_buffer(struct tu_cmd_buffer *cmd,
244               struct tu_cs *cs,
245               enum pipe_format format,
246               uint64_t va, uint32_t pitch,
247               uint32_t width, uint32_t height,
248               enum pipe_format dst_format)
249{
250   struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR);
251   enum a6xx_format color_format = fmt.fmt;
252   fixup_src_format(&format, dst_format, &color_format);
253
254   tu_cs_emit_regs(cs,
255                   A6XX_SP_PS_2D_SRC_INFO(
256                      .color_format = color_format,
257                      .color_swap = fmt.swap,
258                      .srgb = util_format_is_srgb(format),
259                      .unk20 = 1,
260                      .unk22 = 1),
261                   A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
262                   A6XX_SP_PS_2D_SRC(.qword = va),
263                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
264}
265
266static void
267r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
268        enum pipe_format src_format)
269{
270   uint32_t dst_info = iview->RB_2D_DST_INFO;
271   enum a6xx_format fmt = dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK;
272   enum pipe_format dst_format = iview->format;
273   fixup_dst_format(src_format, &dst_format, &fmt);
274
275   dst_info =
276         (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
277   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
278   tu_cs_emit(cs, dst_info);
279   tu_cs_image_ref_2d(cs, iview, layer, false);
280
281   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
282   tu_cs_image_flag_ref(cs, iview, layer);
283}
284
285static void
286r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
287{
288   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
289   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
290   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
291   tu_cs_emit(cs, iview->depth_PITCH);
292
293   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
294   tu_cs_image_flag_ref(cs, &iview->view, layer);
295}
296
297static void
298r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
299{
300   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
301   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
302   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
303   tu_cs_emit(cs, iview->stencil_PITCH);
304}
305
306static void
307r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
308               enum pipe_format src_format)
309{
310   struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR);
311   enum a6xx_format color_fmt = fmt.fmt;
312   fixup_dst_format(src_format, &format, &color_fmt);
313   fmt.fmt = color_fmt;
314
315   tu_cs_emit_regs(cs,
316                   A6XX_RB_2D_DST_INFO(
317                      .color_format = fmt.fmt,
318                      .color_swap = fmt.swap,
319                      .srgb = util_format_is_srgb(format)),
320                   A6XX_RB_2D_DST(.qword = va),
321                   A6XX_RB_2D_DST_PITCH(pitch));
322}
323
324static void
325r2d_setup_common(struct tu_cmd_buffer *cmd,
326                 struct tu_cs *cs,
327                 enum pipe_format src_format,
328                 enum pipe_format dst_format,
329                 VkImageAspectFlags aspect_mask,
330                 unsigned blit_param,
331                 bool clear,
332                 bool ubwc,
333                 bool scissor)
334{
335   enum a6xx_format fmt = tu6_base_format(dst_format);
336   fixup_dst_format(src_format, &dst_format, &fmt);
337   enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);
338
339   uint32_t unknown_8c01 = 0;
340
341   if ((dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
342       dst_format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) {
343      fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
344   }
345
346   /* note: the only format with partial clearing is D24S8 */
347   if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
348      /* preserve stencil channel */
349      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
350         unknown_8c01 = 0x08000041;
351      /* preserve depth channels */
352      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
353         unknown_8c01 = 0x00084001;
354   }
355
356   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
357   tu_cs_emit(cs, unknown_8c01);
358
359   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
360         .scissor = scissor,
361         .rotate = blit_param,
362         .solid_color = clear,
363         .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
364         .color_format = fmt,
365         .mask = 0xf,
366         .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
367      ).value;
368
369   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
370   tu_cs_emit(cs, blit_cntl);
371
372   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
373   tu_cs_emit(cs, blit_cntl);
374
375   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
376      fmt = FMT6_16_16_16_16_FLOAT;
377
378   tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
379         .sint = util_format_is_pure_sint(dst_format),
380         .uint = util_format_is_pure_uint(dst_format),
381         .color_format = fmt,
382         .srgb = util_format_is_srgb(dst_format),
383         .mask = 0xf));
384}
385
386static void
387r2d_setup(struct tu_cmd_buffer *cmd,
388          struct tu_cs *cs,
389          enum pipe_format src_format,
390          enum pipe_format dst_format,
391          VkImageAspectFlags aspect_mask,
392          unsigned blit_param,
393          bool clear,
394          bool ubwc,
395          VkSampleCountFlagBits samples)
396{
397   assert(samples == VK_SAMPLE_COUNT_1_BIT);
398
399   if (!cmd->state.pass) {
400      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
401   }
402
403   r2d_setup_common(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
404}
405
406static void
407r2d_teardown(struct tu_cmd_buffer *cmd,
408             struct tu_cs *cs)
409{
410   /* nothing to do here */
411}
412
413static void
414r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
415{
416   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
417   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
418}
419
420/* r3d_ = shader path operations */
421
422static nir_ssa_def *
423load_const(nir_builder *b, unsigned base, unsigned components)
424{
425   return nir_load_uniform(b, components, 32, nir_imm_int(b, 0),
426                           .base = base);
427}
428
429static nir_shader *
430build_blit_vs_shader(void)
431{
432   nir_builder _b =
433      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
434   nir_builder *b = &_b;
435
436   nir_variable *out_pos =
437      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
438                          "gl_Position");
439   out_pos->data.location = VARYING_SLOT_POS;
440
441   nir_ssa_def *vert0_pos = load_const(b, 0, 2);
442   nir_ssa_def *vert1_pos = load_const(b, 4, 2);
443   nir_ssa_def *vertex = nir_load_vertex_id(b);
444
445   nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
446   pos = nir_vec4(b, nir_channel(b, pos, 0),
447                     nir_channel(b, pos, 1),
448                     nir_imm_float(b, 0.0),
449                     nir_imm_float(b, 1.0));
450
451   nir_store_var(b, out_pos, pos, 0xf);
452
453   nir_variable *out_coords =
454      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
455                          "coords");
456   out_coords->data.location = VARYING_SLOT_VAR0;
457
458   nir_ssa_def *vert0_coords = load_const(b, 2, 2);
459   nir_ssa_def *vert1_coords = load_const(b, 6, 2);
460
461   /* Only used with "z scale" blit path which uses a 3d texture */
462   nir_ssa_def *z_coord = load_const(b, 8, 1);
463
464   nir_ssa_def *coords = nir_bcsel(b, nir_i2b1(b, vertex), vert1_coords, vert0_coords);
465   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
466                     z_coord);
467
468   nir_store_var(b, out_coords, coords, 0x7);
469
470   return b->shader;
471}
472
473static nir_shader *
474build_clear_vs_shader(void)
475{
476   nir_builder _b =
477      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
478   nir_builder *b = &_b;
479
480   nir_variable *out_pos =
481      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
482                          "gl_Position");
483   out_pos->data.location = VARYING_SLOT_POS;
484
485   nir_ssa_def *vert0_pos = load_const(b, 0, 2);
486   nir_ssa_def *vert1_pos = load_const(b, 4, 2);
487   /* c0.z is used to clear depth */
488   nir_ssa_def *depth = load_const(b, 2, 1);
489   nir_ssa_def *vertex = nir_load_vertex_id(b);
490
491   nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
492   pos = nir_vec4(b, nir_channel(b, pos, 0),
493                     nir_channel(b, pos, 1),
494                     depth, nir_imm_float(b, 1.0));
495
496   nir_store_var(b, out_pos, pos, 0xf);
497
498   nir_variable *out_layer =
499      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
500                          "gl_Layer");
501   out_layer->data.location = VARYING_SLOT_LAYER;
502   nir_ssa_def *layer = load_const(b, 3, 1);
503   nir_store_var(b, out_layer, layer, 1);
504
505   return b->shader;
506}
507
508static nir_shader *
509build_blit_fs_shader(bool zscale)
510{
511   nir_builder _b =
512      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
513                                     zscale ? "zscale blit fs" : "blit fs");
514   nir_builder *b = &_b;
515
516   nir_variable *out_color =
517      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
518                          "color0");
519   out_color->data.location = FRAG_RESULT_DATA0;
520
521   unsigned coord_components = zscale ? 3 : 2;
522   nir_variable *in_coords =
523      nir_variable_create(b->shader, nir_var_shader_in,
524                          glsl_vec_type(coord_components),
525                          "coords");
526   in_coords->data.location = VARYING_SLOT_VAR0;
527
528   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
529   /* Note: since we're just copying data, we rely on the HW ignoring the
530    * dest_type.
531    */
532   tex->dest_type = nir_type_int32;
533   tex->is_array = false;
534   tex->is_shadow = false;
535   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;
536
537   tex->texture_index = 0;
538   tex->sampler_index = 0;
539
540   b->shader->info.num_textures = 1;
541   BITSET_SET(b->shader->info.textures_used, 0);
542
543   tex->src[0].src_type = nir_tex_src_coord;
544   tex->src[0].src = nir_src_for_ssa(nir_load_var(b, in_coords));
545   tex->coord_components = coord_components;
546
547   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
548   nir_builder_instr_insert(b, &tex->instr);
549
550   nir_store_var(b, out_color, &tex->dest.ssa, 0xf);
551
552   return b->shader;
553}
554
555/* We can only read multisample textures via txf_ms, so we need a separate
556 * variant for them.
557 */
558static nir_shader *
559build_ms_copy_fs_shader(void)
560{
561   nir_builder _b =
562      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
563                                     "multisample copy fs");
564   nir_builder *b = &_b;
565
566   nir_variable *out_color =
567      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
568                          "color0");
569   out_color->data.location = FRAG_RESULT_DATA0;
570
571   nir_variable *in_coords =
572      nir_variable_create(b->shader, nir_var_shader_in,
573                          glsl_vec_type(2),
574                          "coords");
575   in_coords->data.location = VARYING_SLOT_VAR0;
576
577   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
578
579   tex->op = nir_texop_txf_ms;
580
581   /* Note: since we're just copying data, we rely on the HW ignoring the
582    * dest_type.
583    */
584   tex->dest_type = nir_type_int32;
585   tex->is_array = false;
586   tex->is_shadow = false;
587   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
588
589   tex->texture_index = 0;
590   tex->sampler_index = 0;
591
592   b->shader->info.num_textures = 1;
593   BITSET_SET(b->shader->info.textures_used, 0);
594   BITSET_SET(b->shader->info.textures_used_by_txf, 0);
595
596   nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));
597
598   tex->src[0].src_type = nir_tex_src_coord;
599   tex->src[0].src = nir_src_for_ssa(coord);
600   tex->coord_components = 2;
601
602   tex->src[1].src_type = nir_tex_src_ms_index;
603   tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b));
604
605   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
606   nir_builder_instr_insert(b, &tex->instr);
607
608   nir_store_var(b, out_color, &tex->dest.ssa, 0xf);
609
610   return b->shader;
611}
612
613static nir_shader *
614build_clear_fs_shader(unsigned mrts)
615{
616   nir_builder _b =
617      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
618                                     "mrt%u clear fs", mrts);
619   nir_builder *b = &_b;
620
621   for (unsigned i = 0; i < mrts; i++) {
622      nir_variable *out_color =
623         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
624                             "color");
625      out_color->data.location = FRAG_RESULT_DATA0 + i;
626
627      nir_ssa_def *color = load_const(b, 4 * i, 4);
628      nir_store_var(b, out_color, color, 0xf);
629   }
630
631   return b->shader;
632}
633
634static void
635compile_shader(struct tu_device *dev, struct nir_shader *nir,
636               unsigned consts, unsigned *offset, enum global_shader idx)
637{
638   nir->options = ir3_get_compiler_options(dev->compiler);
639
640   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
641   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
642
643   ir3_finalize_nir(dev->compiler, nir);
644
645   struct ir3_shader *sh =
646      ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) {
647                              .api_wavesize = IR3_SINGLE_OR_DOUBLE,
648                              .real_wavesize = IR3_SINGLE_OR_DOUBLE,
649                              .reserved_user_consts = align(consts, 4),
650                          }, NULL);
651
652   struct ir3_shader_key key = {};
653   bool created;
654   struct ir3_shader_variant *so =
655      ir3_shader_get_variant(sh, &key, false, false, &created);
656
657   struct tu6_global *global = dev->global_bo->map;
658
659   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
660   dev->global_shaders[idx] = sh;
661   dev->global_shader_variants[idx] = so;
662   memcpy(&global->shaders[*offset], so->bin,
663          sizeof(uint32_t) * so->info.sizedwords);
664   dev->global_shader_va[idx] = dev->global_bo->iova +
665      gb_offset(shaders[*offset]);
666   *offset += align(so->info.sizedwords, 32);
667}
668
669void
670tu_init_clear_blit_shaders(struct tu_device *dev)
671{
672   unsigned offset = 0;
673   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
674   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
675   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
676   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
677   compile_shader(dev, build_ms_copy_fs_shader(), 0, &offset, GLOBAL_SH_FS_COPY_MS);
678
679   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
680      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
681                     GLOBAL_SH_FS_CLEAR0 + num_rts);
682   }
683}
684
685void
686tu_destroy_clear_blit_shaders(struct tu_device *dev)
687{
688   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
689      if (dev->global_shaders[i])
690         ir3_shader_destroy(dev->global_shaders[i]);
691   }
692}
693
694static void
695r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
696           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
697{
698   enum global_shader vs_id =
699      blit ? GLOBAL_SH_VS_BLIT : GLOBAL_SH_VS_CLEAR;
700
701   struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
702   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];
703
704   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;
705
706   if (z_scale)
707      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
708   else if (samples != VK_SAMPLE_COUNT_1_BIT)
709      fs_id = GLOBAL_SH_FS_COPY_MS;
710
711   unsigned num_rts = util_bitcount(rts_mask);
712   if (!blit)
713      fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts;
714
715   struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
716   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];
717
718   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
719         .vs_state = true,
720         .hs_state = true,
721         .ds_state = true,
722         .gs_state = true,
723         .fs_state = true,
724         .cs_state = true,
725         .gfx_ibo = true,
726         .cs_ibo = true,
727         .gfx_shared_const = true,
728         .gfx_bindless = 0x1f,
729         .cs_bindless = 0x1f));
730
731   tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs);
732   tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL);
733   tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL);
734   tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL);
735   tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs);
736
737   struct tu_pvtmem_config pvtmem = {};
738   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
739   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
740
741   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
742   tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
743
744   if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) {
745   /* Copy what the blob does here. This will emit an extra 0x3f
746    * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
747    * this is working around yet.
748    */
749   tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
750   tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
751   tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
752   tu_cs_emit(cs, 0);
753   } else {
754      tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL());
755   }
756   tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());
757
758   tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs, 0);
759
760   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
761   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
762   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
763
764   tu6_emit_fs_inputs(cs, fs);
765
766   tu_cs_emit_regs(cs,
767                   A6XX_GRAS_CL_CNTL(
768                      .persp_division_disable = 1,
769                      .vp_xform_disable = 1,
770                      .vp_clip_code_ignore = 1,
771                      .clip_disable = 1));
772   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
773
774   tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());
775   tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
776
777   tu_cs_emit_regs(cs,
778                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
779                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
780   tu_cs_emit_regs(cs,
781                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
782                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
783
784   tu_cs_emit_regs(cs,
785                   A6XX_VFD_INDEX_OFFSET(),
786                   A6XX_VFD_INSTANCE_START_OFFSET());
787
788   if (rts_mask) {
789      unsigned rts_count = util_last_bit(rts_mask);
790      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
791      unsigned rt = 0;
792      for (unsigned i = 0; i < rts_count; i++) {
793         unsigned regid = 0;
794         if (rts_mask & (1u << i))
795            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
796         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid));
797      }
798   }
799
800   cmd->state.line_mode = RECTANGULAR;
801   tu6_emit_msaa(cs, samples, cmd->state.line_mode);
802}
803
804static void
805r3d_coords_raw(struct tu_cs *cs, const float *coords)
806{
807   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
808   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
809                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
810                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
811                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
812                  CP_LOAD_STATE6_0_NUM_UNIT(2));
813   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
814   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
815   tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
816}
817
818/* z coordinate for "z scale" blit path which uses a 3d texture */
819static void
820r3d_coord_z(struct tu_cs *cs, float z)
821{
822   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 4);
823   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) |
824                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
825                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
826                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
827                  CP_LOAD_STATE6_0_NUM_UNIT(1));
828   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
829   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
830   tu_cs_emit(cs, fui(z));
831   tu_cs_emit(cs, 0);
832   tu_cs_emit(cs, 0);
833   tu_cs_emit(cs, 0);
834}
835
836static void
837r3d_coords(struct tu_cs *cs,
838           const VkOffset2D *dst,
839           const VkOffset2D *src,
840           const VkExtent2D *extent)
841{
842   int32_t src_x1 = src ? src->x : 0;
843   int32_t src_y1 = src ? src->y : 0;
844   r3d_coords_raw(cs, (float[]) {
845      dst->x,                 dst->y,
846      src_x1,                 src_y1,
847      dst->x + extent->width, dst->y + extent->height,
848      src_x1 + extent->width, src_y1 + extent->height,
849   });
850}
851
852static void
853r3d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
854{
855   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
856   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
857                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
858                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
859                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
860                  CP_LOAD_STATE6_0_NUM_UNIT(1));
861   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
862   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
863   switch (format) {
864   case PIPE_FORMAT_Z24X8_UNORM:
865   case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
866      /* cleared as r8g8b8a8_unorm using special format */
867      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
868      tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
869      tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
870      tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
871      tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
872   } break;
873   case PIPE_FORMAT_Z16_UNORM:
874   case PIPE_FORMAT_Z32_FLOAT:
875      tu_cs_emit(cs, fui(val->depthStencil.depth));
876      tu_cs_emit(cs, 0);
877      tu_cs_emit(cs, 0);
878      tu_cs_emit(cs, 0);
879      break;
880   case PIPE_FORMAT_S8_UINT:
881      tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
882      tu_cs_emit(cs, 0);
883      tu_cs_emit(cs, 0);
884      tu_cs_emit(cs, 0);
885      break;
886   default:
887      /* as color formats use clear value as-is */
888      assert(!util_format_is_depth_or_stencil(format));
889      tu_cs_emit_array(cs, val->color.uint32, 4);
890      break;
891   }
892}
893
894static void
895r3d_src_common(struct tu_cmd_buffer *cmd,
896               struct tu_cs *cs,
897               const uint32_t *tex_const,
898               uint32_t offset_base,
899               uint32_t offset_ubwc,
900               VkFilter filter)
901{
902   struct tu_cs_memory texture = { };
903   VkResult result = tu_cs_alloc(&cmd->sub_cs,
904                                 2, /* allocate space for a sampler too */
905                                 A6XX_TEX_CONST_DWORDS, &texture);
906   if (result != VK_SUCCESS) {
907      cmd->record_result = result;
908      return;
909   }
910
911   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
912
913   /* patch addresses for layer offset */
914   *(uint64_t*) (texture.map + 4) += offset_base;
915   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
916   texture.map[7] = ubwc_addr;
917   texture.map[8] = ubwc_addr >> 32;
918
919   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
920      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
921      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
922      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
923      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
924      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
925      0x60000; /* XXX used by blob, doesn't seem necessary */
926   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
927      A6XX_TEX_SAMP_1_UNNORM_COORDS |
928      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
929   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
930   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
931
932   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
933   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
934               CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
935               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
936               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
937               CP_LOAD_STATE6_0_NUM_UNIT(1));
938   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
939
940   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));
941
942   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
943   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
944      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
945      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
946      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
947      CP_LOAD_STATE6_0_NUM_UNIT(1));
948   tu_cs_emit_qw(cs, texture.iova);
949
950   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
951   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
952}
953
954static void
955r3d_src(struct tu_cmd_buffer *cmd,
956        struct tu_cs *cs,
957        const struct fdl6_view *iview,
958        uint32_t layer,
959        VkFilter filter,
960        enum pipe_format dst_format)
961{
962   uint32_t desc[A6XX_TEX_CONST_DWORDS];
963   memcpy(desc, iview->descriptor, sizeof(desc));
964
965   enum a6xx_format fmt = (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >>
966         A6XX_TEX_CONST_0_FMT__SHIFT;
967   enum pipe_format src_format = iview->format;
968   fixup_src_format(&src_format, dst_format, &fmt);
969   desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
970      A6XX_TEX_CONST_0_FMT(fmt);
971
972   r3d_src_common(cmd, cs, desc,
973                  iview->layer_size * layer,
974                  iview->ubwc_layer_size * layer,
975                  filter);
976}
977
978static void
979r3d_src_buffer(struct tu_cmd_buffer *cmd,
980               struct tu_cs *cs,
981               enum pipe_format format,
982               uint64_t va, uint32_t pitch,
983               uint32_t width, uint32_t height,
984               enum pipe_format dst_format)
985{
986   uint32_t desc[A6XX_TEX_CONST_DWORDS];
987
988   struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR);
989   enum a6xx_format color_format = fmt.fmt;
990   fixup_src_format(&format, dst_format, &color_format);
991
992   desc[0] =
993      COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
994      A6XX_TEX_CONST_0_FMT(color_format) |
995      A6XX_TEX_CONST_0_SWAP(fmt.swap) |
996      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
997      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
998      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
999      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1000   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
1001   desc[2] =
1002      A6XX_TEX_CONST_2_PITCH(pitch) |
1003      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1004   desc[3] = 0;
1005   desc[4] = va;
1006   desc[5] = va >> 32;
1007   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1008      desc[i] = 0;
1009
1010   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1011}
1012
1013static void
1014r3d_src_gmem(struct tu_cmd_buffer *cmd,
1015             struct tu_cs *cs,
1016             const struct tu_image_view *iview,
1017             enum pipe_format format,
1018             enum pipe_format dst_format,
1019             uint32_t gmem_offset,
1020             uint32_t cpp)
1021{
1022   uint32_t desc[A6XX_TEX_CONST_DWORDS];
1023   memcpy(desc, iview->view.descriptor, sizeof(desc));
1024
1025   enum a6xx_format fmt = tu6_format_texture(format, TILE6_LINEAR).fmt;
1026   fixup_src_format(&format, dst_format, &fmt);
1027
1028   /* patch the format so that depth/stencil get the right format and swizzle */
1029   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1030                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1031                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1032   desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
1033               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1034               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1035               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1036               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1037
1038   /* patched for gmem */
1039   desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1040   desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1041   desc[2] =
1042      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1043      A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
1044   desc[3] = 0;
1045   desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1046   desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
1047   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1048      desc[i] = 0;
1049
1050   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1051}
1052
1053static void
1054r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1055        enum pipe_format src_format)
1056{
1057   uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;
1058
1059   enum a6xx_format fmt = mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK;
1060   enum pipe_format dst_format = iview->format;
1061   fixup_dst_format(src_format, &dst_format, &fmt);
1062   mrt_buf_info =
1063      (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
1064      A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
1065   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1066   tu_cs_emit(cs, mrt_buf_info);
1067   tu_cs_image_ref(cs, iview, layer);
1068   tu_cs_emit(cs, 0);
1069
1070   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1071   tu_cs_image_flag_ref(cs, iview, layer);
1072
1073   /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
1074    * FMT6_NV12_Y.
1075    */
1076   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));
1077
1078   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
1079}
1080
1081static void
1082r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1083{
1084   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1085   tu_cs_emit(cs, tu_image_view_depth(iview, RB_MRT_BUF_INFO));
1086   tu_cs_image_depth_ref(cs, iview, layer);
1087   tu_cs_emit(cs, 0);
1088
1089   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1090   tu_cs_image_flag_ref(cs, &iview->view, layer);
1091
1092   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->view.ubwc_enabled));
1093}
1094
1095static void
1096r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1097{
1098   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1099   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
1100   tu_cs_image_stencil_ref(cs, iview, layer);
1101   tu_cs_emit(cs, 0);
1102
1103   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
1104}
1105
1106static void
1107r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1108               enum pipe_format src_format)
1109{
1110   struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR);
1111
1112   enum a6xx_format color_fmt = fmt.fmt;
1113   fixup_dst_format(src_format, &format, &color_fmt);
1114
1115   tu_cs_emit_regs(cs,
1116                   A6XX_RB_MRT_BUF_INFO(0, .color_format = color_fmt, .color_swap = fmt.swap),
1117                   A6XX_RB_MRT_PITCH(0, pitch),
1118                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1119                   A6XX_RB_MRT_BASE(0, .qword = va),
1120                   A6XX_RB_MRT_BASE_GMEM(0, 0));
1121
1122   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
1123}
1124
1125static uint8_t
1126aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1127{
1128   uint8_t mask = 0xf;
1129   assert(aspect_mask);
1130   /* note: the only format with partial writing is D24S8,
1131    * clear/blit uses the _AS_R8G8B8A8 format to access it
1132    */
1133   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1134      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1135         mask = 0x7;
1136      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1137         mask = 0x8;
1138   }
1139   return mask;
1140}
1141
1142static void
1143r3d_setup(struct tu_cmd_buffer *cmd,
1144          struct tu_cs *cs,
1145          enum pipe_format src_format,
1146          enum pipe_format dst_format,
1147          VkImageAspectFlags aspect_mask,
1148          unsigned blit_param,
1149          bool clear,
1150          bool ubwc,
1151          VkSampleCountFlagBits samples)
1152{
1153   enum a6xx_format fmt = tu6_base_format(dst_format);
1154   fixup_dst_format(src_format, &dst_format, &fmt);
1155
1156   if ((dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
1157        dst_format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) {
1158      fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
1159   }
1160
1161   if (!cmd->state.pass) {
1162      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
1163      tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1164   }
1165
1166   tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
1167   tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
1168
1169   r3d_common(cmd, cs, !clear, 1, blit_param, samples);
1170
1171   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1172   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1173                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1174                  0xfc000000);
1175   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
1176
1177   tu_cs_emit_regs(cs,
1178                   A6XX_RB_FS_OUTPUT_CNTL0(),
1179                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1180
1181   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1182   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1183
1184   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1185   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1186   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1187   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1188   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1189   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1190   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1191
1192   tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
1193   tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
1194
1195   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1196                        .color_format = fmt,
1197                        .color_sint = util_format_is_pure_sint(dst_format),
1198                        .color_uint = util_format_is_pure_uint(dst_format)));
1199
1200   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1201      .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1202   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1203   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1204
1205   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1206   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1207
1208   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1209                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1210
1211   /* Disable sample counting in order to not affect occlusion query. */
1212   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1213
1214   if (cmd->state.prim_generated_query_running_before_rp) {
1215      tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
1216   }
1217
1218   if (cmd->state.predication_active) {
1219      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1220      tu_cs_emit(cs, 0);
1221   }
1222}
1223
1224static void
1225r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1226{
1227   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1228   tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1229                  CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1230                  CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1231   tu_cs_emit(cs, 1); /* instance count */
1232   tu_cs_emit(cs, 2); /* vertex count */
1233}
1234
1235static void
1236r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1237{
1238   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1239   tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1240                  CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1241                  CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1242   tu_cs_emit(cs, 1); /* instance count */
1243   tu_cs_emit(cs, 2); /* vertex count */
1244}
1245
1246static void
1247r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1248{
1249   if (cmd->state.predication_active) {
1250      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1251      tu_cs_emit(cs, 1);
1252   }
1253
1254   /* Re-enable sample counting. */
1255   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1256
1257   if (cmd->state.prim_generated_query_running_before_rp) {
1258      tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
1259   }
1260}
1261
1262/* blit ops - common interface for 2d/shader paths */
1263
1264struct blit_ops {
1265   void (*coords)(struct tu_cs *cs,
1266                  const VkOffset2D *dst,
1267                  const VkOffset2D *src,
1268                  const VkExtent2D *extent);
1269   void (*clear_value)(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val);
1270   void (*src)(
1271        struct tu_cmd_buffer *cmd,
1272        struct tu_cs *cs,
1273        const struct fdl6_view *iview,
1274        uint32_t layer,
1275        VkFilter filter,
1276        enum pipe_format dst_format);
1277   void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1278                      enum pipe_format format,
1279                      uint64_t va, uint32_t pitch,
1280                      uint32_t width, uint32_t height,
1281                      enum pipe_format dst_format);
1282   void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1283               enum pipe_format src_format);
1284   void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1285   void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1286   void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1287                      enum pipe_format src_format);
1288   void (*setup)(struct tu_cmd_buffer *cmd,
1289                 struct tu_cs *cs,
1290                 enum pipe_format src_format,
1291                 enum pipe_format dst_format,
1292                 VkImageAspectFlags aspect_mask,
1293                 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1294                 bool clear,
1295                 bool ubwc,
1296                 VkSampleCountFlagBits samples);
1297   void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1298   void (*teardown)(struct tu_cmd_buffer *cmd,
1299                    struct tu_cs *cs);
1300};
1301
1302static const struct blit_ops r2d_ops = {
1303   .coords = r2d_coords,
1304   .clear_value = r2d_clear_value,
1305   .src = r2d_src,
1306   .src_buffer = r2d_src_buffer,
1307   .dst = r2d_dst,
1308   .dst_depth = r2d_dst_depth,
1309   .dst_stencil = r2d_dst_stencil,
1310   .dst_buffer = r2d_dst_buffer,
1311   .setup = r2d_setup,
1312   .run = r2d_run,
1313   .teardown = r2d_teardown,
1314};
1315
1316static const struct blit_ops r3d_ops = {
1317   .coords = r3d_coords,
1318   .clear_value = r3d_clear_value,
1319   .src = r3d_src,
1320   .src_buffer = r3d_src_buffer,
1321   .dst = r3d_dst,
1322   .dst_depth = r3d_dst_depth,
1323   .dst_stencil = r3d_dst_stencil,
1324   .dst_buffer = r3d_dst_buffer,
1325   .setup = r3d_setup,
1326   .run = r3d_run,
1327   .teardown = r3d_teardown,
1328};
1329
1330/* passthrough set coords from 3D extents */
1331static void
1332coords(const struct blit_ops *ops,
1333       struct tu_cs *cs,
1334       const VkOffset3D *dst,
1335       const VkOffset3D *src,
1336       const VkExtent3D *extent)
1337{
1338   ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1339}
1340
1341/* Decides the VK format to treat our data as for a memcpy-style blit. We have
1342 * to be a bit careful because we have to pick a format with matching UBWC
1343 * compression behavior, so no just returning R8_UINT/R16_UINT/R32_UINT for
1344 * everything.
1345 */
1346static enum pipe_format
1347copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1348{
1349   if (vk_format_is_compressed(vk_format)) {
1350      switch (vk_format_get_blocksize(vk_format)) {
1351      case 1: return PIPE_FORMAT_R8_UINT;
1352      case 2: return PIPE_FORMAT_R16_UINT;
1353      case 4: return PIPE_FORMAT_R32_UINT;
1354      case 8: return PIPE_FORMAT_R32G32_UINT;
1355      case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
1356      default:
1357         unreachable("unhandled format size");
1358      }
1359   }
1360
1361   enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
1362
1363   /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
1364    * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1365    * (also -1.0), when we're supposed to be memcpying the bits. See
1366    * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1367    */
1368   format = util_format_snorm_to_unorm(format);
1369
1370   switch (format) {
1371   case PIPE_FORMAT_R9G9B9E5_FLOAT:
1372      return PIPE_FORMAT_R32_UINT;
1373
1374   case PIPE_FORMAT_G8_B8R8_420_UNORM:
1375      if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
1376         return PIPE_FORMAT_R8G8_UNORM;
1377      else
1378         return PIPE_FORMAT_Y8_UNORM;
1379   case PIPE_FORMAT_G8_B8_R8_420_UNORM:
1380      return PIPE_FORMAT_R8_UNORM;
1381
1382   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1383      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1384         return PIPE_FORMAT_S8_UINT;
1385      assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
1386      return PIPE_FORMAT_Z32_FLOAT;
1387
1388   default:
1389      return format;
1390   }
1391}
1392
1393void
1394tu6_clear_lrz(struct tu_cmd_buffer *cmd,
1395              struct tu_cs *cs,
1396              struct tu_image *image,
1397              const VkClearValue *value)
1398{
1399   const struct blit_ops *ops = &r2d_ops;
1400
1401   /* It is assumed that LRZ cache is invalidated at this point for
1402    * the writes here to become visible to LRZ.
1403    *
1404    * LRZ writes are going through UCHE cache, flush UCHE before changing
1405    * LRZ via CCU. Don't need to invalidate CCU since we are presumably
1406    * writing whole cache lines we assume to be 64 bytes.
1407    */
1408   tu6_emit_event_write(cmd, &cmd->cs, CACHE_FLUSH_TS);
1409
1410   ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
1411              VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
1412              VK_SAMPLE_COUNT_1_BIT);
1413   ops->clear_value(cs, PIPE_FORMAT_Z16_UNORM, value);
1414   ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
1415                   image->iova + image->lrz_offset,
1416                   image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
1417   ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
1418   ops->run(cmd, cs);
1419   ops->teardown(cmd, cs);
1420
1421   /* Clearing writes via CCU color in the PS stage, and LRZ is read via
1422    * UCHE in the earlier GRAS stage.
1423    */
1424   cmd->state.cache.flush_bits |=
1425      TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
1426      TU_CMD_FLAG_WAIT_FOR_IDLE;
1427}
1428
1429void
1430tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
1431                 struct tu_cs *cs,
1432                 struct tu_image *image)
1433{
1434   const struct blit_ops *ops = &r2d_ops;
1435   VkClearValue clear = { .color = { .uint32[0] = 0xffffffff } };
1436
1437   /* LRZ fast-clear buffer is always allocated with 512 bytes size. */
1438   ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
1439              VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
1440              VK_SAMPLE_COUNT_1_BIT);
1441   ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &clear);
1442   ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
1443                   image->iova + image->lrz_fc_offset, 512,
1444                   PIPE_FORMAT_R32_UINT);
1445   ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {128, 1});
1446   ops->run(cmd, cs);
1447   ops->teardown(cmd, cs);
1448}
1449
1450static void
1451tu_image_view_copy_blit(struct fdl6_view *iview,
1452                        struct tu_image *image,
1453                        enum pipe_format format,
1454                        const VkImageSubresourceLayers *subres,
1455                        uint32_t layer,
1456                        bool z_scale)
1457{
1458   VkImageAspectFlags aspect_mask = subres->aspectMask;
1459
1460   /* always use the AS_R8G8B8A8 format for these */
1461   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
1462       format == PIPE_FORMAT_Z24X8_UNORM) {
1463      aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1464   }
1465
1466   const struct fdl_layout *layout =
1467      &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
1468
1469   fdl6_view_init(iview, &layout, &(struct fdl_view_args) {
1470      .iova = image->iova,
1471      .base_array_layer = subres->baseArrayLayer + layer,
1472      .layer_count = 1,
1473      .base_miplevel = subres->mipLevel,
1474      .level_count = 1,
1475      .format = tu_format_for_aspect(format, aspect_mask),
1476      .swiz = {
1477         PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
1478      },
1479      .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
1480   }, false);
1481}
1482
1483static void
1484tu_image_view_copy(struct fdl6_view *iview,
1485                   struct tu_image *image,
1486                   enum pipe_format format,
1487                   const VkImageSubresourceLayers *subres,
1488                   uint32_t layer)
1489{
1490   tu_image_view_copy_blit(iview, image, format, subres, layer, false);
1491}
1492
1493static void
1494tu_image_view_blit(struct fdl6_view *iview,
1495                   struct tu_image *image,
1496                   const VkImageSubresourceLayers *subres,
1497                   uint32_t layer)
1498{
1499   enum pipe_format format =
1500      tu6_plane_format(image->vk.format, tu6_plane_index(image->vk.format,
1501                                                         subres->aspectMask));
1502   tu_image_view_copy_blit(iview, image, format, subres, layer, false);
1503}
1504
1505static void
1506tu6_blit_image(struct tu_cmd_buffer *cmd,
1507               struct tu_image *src_image,
1508               struct tu_image *dst_image,
1509               const VkImageBlit2 *info,
1510               VkFilter filter)
1511{
1512   const struct blit_ops *ops = &r2d_ops;
1513   struct tu_cs *cs = &cmd->cs;
1514   bool z_scale = false;
1515   uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
1516
1517   /* 2D blits can't mirror from coordinates alone; use the rotation/flip modes */
1518   static const enum a6xx_rotation rotate[2][2] = {
1519      {ROTATE_0, ROTATE_HFLIP},
1520      {ROTATE_VFLIP, ROTATE_180},
1521   };
1522
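   /* A blit mirrors along an axis when exactly one of src/dst has its
    * offsets reversed on that axis, hence the XOR of the comparisons below.
    */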
1523   bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1524                   (info->dstOffsets[1].x < info->dstOffsets[0].x);
1525   bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1526                   (info->dstOffsets[1].y < info->dstOffsets[0].y);
1527
1528   int32_t src0_z = info->srcOffsets[0].z;
1529   int32_t src1_z = info->srcOffsets[1].z;
1530
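   /* Fall back to a 3D blit with an interpolated source Z when the source
    * and destination depth ranges differ in size or the source range is
    * reversed, neither of which the 2D path can express.
    */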
1531   if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
1532        info->dstOffsets[1].z - info->dstOffsets[0].z) ||
1533       info->srcOffsets[1].z < info->srcOffsets[0].z) {
1534      z_scale = true;
1535   }
1536
1537   if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
1538      layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
1539      src0_z = info->srcOffsets[1].z;
1540      src1_z = info->srcOffsets[0].z;
1541   }
1542
1543   if (info->dstSubresource.layerCount > 1) {
1544      assert(layers <= 1);
1545      layers = info->dstSubresource.layerCount;
1546   }
1547
1548   /* BC1_RGB_* formats need to have their last component overridden with 1
1549    * when sampling, which is normally handled with the texture descriptor
1550    * swizzle. The 2d path can't handle that, so use the 3d path.
1551    *
1552    * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1553    * the 2d path.
1554    */
1555
1556   unsigned blit_param = rotate[mirror_y][mirror_x];
1557   if (dst_image->layout[0].nr_samples > 1 ||
1558       src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1559       src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1560       filter == VK_FILTER_CUBIC_EXT ||
1561       z_scale) {
1562      ops = &r3d_ops;
1563      blit_param = z_scale;
1564   }
1565
1566   /* use the right format in setup() for D32_S8
1567    * TODO: this probably should use a helper
1568    */
1569   enum pipe_format src_format =
1570      tu6_plane_format(src_image->vk.format,
1571                       tu6_plane_index(src_image->vk.format,
1572                                       info->srcSubresource.aspectMask));
1573   enum pipe_format dst_format =
1574      tu6_plane_format(dst_image->vk.format,
1575                       tu6_plane_index(dst_image->vk.format,
1576                                       info->dstSubresource.aspectMask));
1577   trace_start_blit(&cmd->trace, cs);
1578
1579   ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
1580              blit_param, false, dst_image->layout[0].ubwc,
1581              dst_image->layout[0].nr_samples);
1582
1583   if (ops == &r3d_ops) {
1584      r3d_coords_raw(cs, (float[]) {
1585         info->dstOffsets[0].x, info->dstOffsets[0].y,
1586         info->srcOffsets[0].x, info->srcOffsets[0].y,
1587         info->dstOffsets[1].x, info->dstOffsets[1].y,
1588         info->srcOffsets[1].x, info->srcOffsets[1].y
1589      });
1590   } else {
1591      tu_cs_emit_regs(cs,
1592         A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1593                             .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1594         A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1595                             .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1596      tu_cs_emit_regs(cs,
1597         A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1598         A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1599         A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1600         A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1601   }
1602
1603   struct fdl6_view dst, src;
1604   tu_image_view_blit(&dst, dst_image, &info->dstSubresource,
1605                      MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
1606
1607   if (z_scale) {
1608      tu_image_view_copy_blit(&src, src_image, src_format,
1609                              &info->srcSubresource, 0, true);
1610      ops->src(cmd, cs, &src, 0, filter, dst_format);
1611   } else {
1612      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1613   }
1614
1615   for (uint32_t i = 0; i < layers; i++) {
1616      if (z_scale) {
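         /* Sample the source at the Z corresponding to the center of each
          * destination slice.
          */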
1617         float t = ((float) i + 0.5f) / (float) layers;
1618         r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z);
1619      } else {
1620         ops->src(cmd, cs, &src, i, filter, dst_format);
1621      }
1622      ops->dst(cs, &dst, i, src_format);
1623      ops->run(cmd, cs);
1624   }
1625
1626   ops->teardown(cmd, cs);
1627
1628   trace_end_blit(&cmd->trace, cs,
1629                  ops == &r3d_ops,
1630                  src_image->vk.format,
1631                  dst_image->vk.format,
1632                  layers);
1633}
1634
1635VKAPI_ATTR void VKAPI_CALL
1636tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
1637                    const VkBlitImageInfo2* pBlitImageInfo)
1639{
1640   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1641   TU_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
1642   TU_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
1643
1644   for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
1645      /* can't blit both depth and stencil at once with D32_S8
1646       * TODO: more advanced 3D blit path to support it instead?
1647       */
1648      if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1649          dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1650         VkImageBlit2 region = pBlitImageInfo->pRegions[i];
1651         u_foreach_bit(b, region.dstSubresource.aspectMask) {
1652            region.srcSubresource.aspectMask = BIT(b);
1653            region.dstSubresource.aspectMask = BIT(b);
1654            tu6_blit_image(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
1655         }
1656         continue;
1657      }
1658      tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
1659                     pBlitImageInfo->filter);
1660   }
1661
1662   if (dst_image->lrz_height) {
1663      tu_disable_lrz(cmd, &cmd->cs, dst_image);
1664   }
1665}
1666
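/* Convert texel offsets/extents to block units for compressed formats.
 * For example, with BC1's 4x4 blocks, offset (8, 4) becomes block (2, 1)
 * and a 17x9 texel extent rounds up to 5x3 blocks.
 */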
1667static void
1668copy_compressed(VkFormat format,
1669                VkOffset3D *offset,
1670                VkExtent3D *extent,
1671                uint32_t *width,
1672                uint32_t *height)
1673{
1674   if (!vk_format_is_compressed(format))
1675      return;
1676
1677   uint32_t block_width = vk_format_get_blockwidth(format);
1678   uint32_t block_height = vk_format_get_blockheight(format);
1679
1680   offset->x /= block_width;
1681   offset->y /= block_height;
1682
1683   if (extent) {
1684      extent->width = DIV_ROUND_UP(extent->width, block_width);
1685      extent->height = DIV_ROUND_UP(extent->height, block_height);
1686   }
1687   if (width)
1688      *width = DIV_ROUND_UP(*width, block_width);
1689   if (height)
1690      *height = DIV_ROUND_UP(*height, block_height);
1691}
1692
1693static void
1694tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1695                        struct tu_buffer *src_buffer,
1696                        struct tu_image *dst_image,
1697                        const VkBufferImageCopy2 *info)
1698{
1699   struct tu_cs *cs = &cmd->cs;
1700   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1701   enum pipe_format src_format =
1702      copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
1703   enum pipe_format dst_format =
1704      copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
1705   const struct blit_ops *ops = &r2d_ops;
1706
1707   /* special case for buffer to stencil */
1708   if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
1709       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1710      src_format = PIPE_FORMAT_S8_UINT;
1711   }
1712
1713   /* note: could use "R8_UNORM" when no UBWC */
1714   if (src_format == PIPE_FORMAT_Y8_UNORM)
1715      ops = &r3d_ops;
1716
1717   VkOffset3D offset = info->imageOffset;
1718   VkExtent3D extent = info->imageExtent;
1719   uint32_t src_width = info->bufferRowLength ?: extent.width;
1720   uint32_t src_height = info->bufferImageHeight ?: extent.height;
1721
1722   copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
1723
1724   uint32_t pitch = src_width * util_format_get_blocksize(src_format);
1725   uint32_t layer_size = src_height * pitch;
1726
1727   ops->setup(cmd, cs, src_format, dst_format,
1728              info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc,
1729              dst_image->layout[0].nr_samples);
1730
1731   struct fdl6_view dst;
1732   tu_image_view_copy(&dst, dst_image, dst_format, &info->imageSubresource, offset.z);
1733
1734   for (uint32_t i = 0; i < layers; i++) {
1735      ops->dst(cs, &dst, i, src_format);
1736
1737      uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
1738      if ((src_va & 63) || (pitch & 63)) {
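         /* The blitter base address is aligned down to 64 bytes and the
          * remainder becomes an x offset in texels, so unaligned addresses
          * or pitches are handled one row at a time.
          */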
1739         for (uint32_t y = 0; y < extent.height; y++) {
1740            uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
1741            ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1742                            x + extent.width, 1, dst_format);
1743            ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y},  &(VkOffset2D){x},
1744                        &(VkExtent2D) {extent.width, 1});
1745            ops->run(cmd, cs);
1746            src_va += pitch;
1747         }
1748      } else {
1749         ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height, dst_format);
1750         coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1751         ops->run(cmd, cs);
1752      }
1753   }
1754
1755   ops->teardown(cmd, cs);
1756}
1757
1758VKAPI_ATTR void VKAPI_CALL
1759tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
1760                            const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
1761{
1762   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1763   TU_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
1764   TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
1765
1766   for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
1767      tu_copy_buffer_to_image(cmd, src_buffer, dst_image,
1768                              pCopyBufferToImageInfo->pRegions + i);
1769
1770   if (dst_image->lrz_height) {
1771      tu_disable_lrz(cmd, &cmd->cs, dst_image);
1772   }
1773}
1774
1775static void
1776tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1777                        struct tu_image *src_image,
1778                        struct tu_buffer *dst_buffer,
1779                        const VkBufferImageCopy2 *info)
1780{
1781   struct tu_cs *cs = &cmd->cs;
1782   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1783   enum pipe_format dst_format =
1784      copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
1785   enum pipe_format src_format =
1786      copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
1787   const struct blit_ops *ops = &r2d_ops;
1788
1789   if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
1790       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1791      dst_format = PIPE_FORMAT_S8_UINT;
1792   }
1793
1794   /* note: could use "R8_UNORM" when no UBWC */
1795   if (dst_format == PIPE_FORMAT_Y8_UNORM)
1796      ops = &r3d_ops;
1797
1798   VkOffset3D offset = info->imageOffset;
1799   VkExtent3D extent = info->imageExtent;
1800   uint32_t dst_width = info->bufferRowLength ?: extent.width;
1801   uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1802
1803   copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
1804
1805   uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
1806   uint32_t layer_size = pitch * dst_height;
1807
1808   ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1809              VK_SAMPLE_COUNT_1_BIT);
1810
1811   struct fdl6_view src;
1812   tu_image_view_copy(&src, src_image, src_format, &info->imageSubresource, offset.z);
1813
1814   for (uint32_t i = 0; i < layers; i++) {
1815      ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
1816
1817      uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
1818      if ((dst_va & 63) || (pitch & 63)) {
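         /* Same row-by-row workaround for unaligned addresses/pitches as in
          * tu_copy_buffer_to_image().
          */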
1819         for (uint32_t y = 0; y < extent.height; y++) {
1820            uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
1821            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
1822            ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1823                        &(VkExtent2D) {extent.width, 1});
1824            ops->run(cmd, cs);
1825            dst_va += pitch;
1826         }
1827      } else {
1828         ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
1829         coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1830         ops->run(cmd, cs);
1831      }
1832   }
1833
1834   ops->teardown(cmd, cs);
1835}
1836
1837VKAPI_ATTR void VKAPI_CALL
1838tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
1839                            const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)
1840{
1841   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1842   TU_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
1843   TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
1844
1845   for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
1846      tu_copy_image_to_buffer(cmd, src_image, dst_buffer,
1847                              pCopyImageToBufferInfo->pRegions + i);
1848}
1849
1850/* Tiled formats don't support swapping, which means that we can't support
1851 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1852 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1853 * Currently we fake support for tiled swapped formats and use the unswapped
1854 * format instead, but this means that reinterpreting copies to and from
1855 * swapped formats can't be performed correctly unless we can swizzle the
1856 * components by reinterpreting the other image as the "correct" swapped
1857 * format, i.e. only when the other image is linear.
1858 */
1859
1860static bool
1861is_swapped_format(enum pipe_format format)
1862{
1863   struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1864   struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1865   return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1866}
1867
1868/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1869 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1870 * versa). This should mirror the logic in fdl6_layout.
1871 */
1872static bool
1873image_is_r8g8(struct tu_image *image)
1874{
1875   return image->layout[0].cpp == 2 &&
1876      vk_format_get_nr_components(image->vk.format) == 2;
1877}
1878
1879static void
1880tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1881                       struct tu_image *src_image,
1882                       struct tu_image *dst_image,
1883                       const VkImageCopy2 *info)
1884{
1885   const struct blit_ops *ops = &r2d_ops;
1886   struct tu_cs *cs = &cmd->cs;
1887
1888   if (dst_image->layout[0].nr_samples > 1)
1889      ops = &r3d_ops;
1890
1891   enum pipe_format format = PIPE_FORMAT_NONE;
1892   VkOffset3D src_offset = info->srcOffset;
1893   VkOffset3D dst_offset = info->dstOffset;
1894   VkExtent3D extent = info->extent;
1895   uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount);
1896
1897   /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1898    * Images":
1899    *
1900    *    When copying between compressed and uncompressed formats the extent
1901    *    members represent the texel dimensions of the source image and not
1902    *    the destination. When copying from a compressed image to an
1903    *    uncompressed image the image texel dimensions written to the
1904    *    uncompressed image will be source extent divided by the compressed
1905    *    texel block dimensions. When copying from an uncompressed image to a
1906    *    compressed image the image texel dimensions written to the compressed
1907    *    image will be the source extent multiplied by the compressed texel
1908    *    block dimensions.
1909    *
1910    * This means we only have to adjust the extent if the source image is
1911    * compressed.
1912    */
1913   copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
1914   copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
1915
1916   enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
1917   enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
1918
1919   /* note: could use "R8_UNORM" when no UBWC */
1920   if (dst_format == PIPE_FORMAT_Y8_UNORM ||
1921       src_format == PIPE_FORMAT_Y8_UNORM)
1922      ops = &r3d_ops;
1923
1924   bool use_staging_blit = false;
1925
1926   if (src_format == dst_format) {
1927      /* Images that share a format can always be copied directly because it's
1928       * the same as a blit.
1929       */
1930      format = src_format;
1931   } else if (!src_image->layout[0].tile_mode) {
1932      /* If an image is linear, we can always safely reinterpret it with the
1933       * other image's format and then do a regular blit.
1934       */
1935      format = dst_format;
1936   } else if (!dst_image->layout[0].tile_mode) {
1937      format = src_format;
1938   } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1939      /* We can't currently copy r8g8 images to/from other cpp=2 images,
1940       * due to the different tile layout.
1941       */
1942      use_staging_blit = true;
1943   } else if (is_swapped_format(src_format) ||
1944              is_swapped_format(dst_format)) {
1945      /* If either format has a non-identity swap, then we can't copy
1946       * to/from it.
1947       */
1948      use_staging_blit = true;
1949   } else if (!src_image->layout[0].ubwc) {
1950      format = dst_format;
1951   } else if (!dst_image->layout[0].ubwc) {
1952      format = src_format;
1953   } else {
1954      /* Both formats use UBWC and so neither can be reinterpreted.
1955       * TODO: We could do an in-place decompression of the dst instead.
1956       */
1957      perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
1958      use_staging_blit = true;
1959   }
1960
1961   struct fdl6_view dst, src;
1962
1963   if (use_staging_blit) {
1964      tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
1965      tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
1966
1967      struct fdl_layout staging_layout = { 0 };
1968      VkOffset3D staging_offset = { 0 };
1969
1970      staging_layout.tile_mode = TILE6_LINEAR;
1971      staging_layout.ubwc = false;
1972
1973      fdl6_layout(&staging_layout,
1974                  src_format,
1975                  src_image->layout[0].nr_samples,
1976                  extent.width,
1977                  extent.height,
1978                  extent.depth,
1979                  1,
1980                  info->srcSubresource.layerCount,
1981                  extent.depth > 1,
1982                  NULL);
1983
1984      struct tu_bo *staging_bo;
1985      VkResult result = tu_get_scratch_bo(cmd->device,
1986                                          staging_layout.size,
1987                                          &staging_bo);
1988      if (result != VK_SUCCESS) {
1989         cmd->record_result = result;
1990         return;
1991      }
1992
1993      struct fdl6_view staging;
1994      const struct fdl_layout *staging_layout_ptr = &staging_layout;
1995      fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) {
1996         .iova = staging_bo->iova,
1997         .base_array_layer = 0,
1998         .layer_count = 1,
1999         .base_miplevel = 0,
2000         .level_count = info->srcSubresource.layerCount,
2001         .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2002         .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2003         .type = FDL_VIEW_TYPE_2D,
2004      }, false);
2005
2006      ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2007                 dst_image->layout[0].nr_samples);
2008      coords(ops, cs, &staging_offset, &src_offset, &extent);
2009
2010      for (uint32_t i = 0; i < layers_to_copy; i++) {
2011         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2012         ops->dst(cs, &staging, i, src_format);
2013         ops->run(cmd, cs);
2014      }
2015
2016      /* If the user performed this copy, a pipeline barrier would be required
2017       * here; since we're doing it internally, we have to flush ourselves.
2018       */
2019      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2020      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2021      tu_cs_emit_wfi(cs);
2022
2023      fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) {
2024         .iova = staging_bo->iova,
2025         .base_array_layer = 0,
2026         .layer_count = 1,
2027         .base_miplevel = 0,
2028         .level_count = info->srcSubresource.layerCount,
2029         .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2030         .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2031         .type = FDL_VIEW_TYPE_2D,
2032      }, false);
2033
2034      ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2035                 0, false, dst_image->layout[0].ubwc,
2036                 dst_image->layout[0].nr_samples);
2037      coords(ops, cs, &dst_offset, &staging_offset, &extent);
2038
2039      for (uint32_t i = 0; i < layers_to_copy; i++) {
2040         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2041         ops->dst(cs, &dst, i, dst_format);
2042         ops->run(cmd, cs);
2043      }
2044   } else {
2045      tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
2046      tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z);
2047
2048      ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
2049                 0, false, dst_image->layout[0].ubwc,
2050                 dst_image->layout[0].nr_samples);
2051      coords(ops, cs, &dst_offset, &src_offset, &extent);
2052
2053      for (uint32_t i = 0; i < layers_to_copy; i++) {
2054         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
2055         ops->dst(cs, &dst, i, format);
2056         ops->run(cmd, cs);
2057      }
2058   }
2059
2060   ops->teardown(cmd, cs);
2061}
2062
2063VKAPI_ATTR void VKAPI_CALL
2064tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
2065                    const VkCopyImageInfo2* pCopyImageInfo)
2066{
2067   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2068   TU_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
2069   TU_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
2070
2071   for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
2072      if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2073         VkImageCopy2 info = pCopyImageInfo->pRegions[i];
2074         u_foreach_bit(b, info.dstSubresource.aspectMask) {
2075            info.srcSubresource.aspectMask = BIT(b);
2076            info.dstSubresource.aspectMask = BIT(b);
2077            tu_copy_image_to_image(cmd, src_image, dst_image, &info);
2078         }
2079         continue;
2080      }
2081
2082      tu_copy_image_to_image(cmd, src_image, dst_image,
2083                             pCopyImageInfo->pRegions + i);
2084   }
2085
2086   if (dst_image->lrz_height) {
2087      tu_disable_lrz(cmd, &cmd->cs, dst_image);
2088   }
2089}
2090
2091static void
2092copy_buffer(struct tu_cmd_buffer *cmd,
2093            uint64_t dst_va,
2094            uint64_t src_va,
2095            uint64_t size,
2096            uint32_t block_size)
2097{
2098   const struct blit_ops *ops = &r2d_ops;
2099   struct tu_cs *cs = &cmd->cs;
2100   enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
2101   uint64_t blocks = size / block_size;
2102
2103   ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2104              VK_SAMPLE_COUNT_1_BIT);
2105
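   /* Base addresses are aligned down to 64 bytes, with the remainder turned
    * into an x offset in blocks; e.g. an address ending in 0x22 with
    * block_size 1 yields x = 34. Each pass is also capped so that x + width
    * stays within the 0x4000-texel limit used here.
    */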
2106   while (blocks) {
2107      uint32_t src_x = (src_va & 63) / block_size;
2108      uint32_t dst_x = (dst_va & 63) / block_size;
2109      uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
2110
2111      ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
2112      ops->dst_buffer(     cs, format, dst_va & ~63, 0, format);
2113      ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
2114      ops->run(cmd, cs);
2115
2116      src_va += width * block_size;
2117      dst_va += width * block_size;
2118      blocks -= width;
2119   }
2120
2121   ops->teardown(cmd, cs);
2122}
2123
2124VKAPI_ATTR void VKAPI_CALL
2125tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
2126                     const VkCopyBufferInfo2 *pCopyBufferInfo)
2127{
2128   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2129   TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
2130   TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
2131
2132   for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
2133      const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
2134      copy_buffer(cmd,
2135                  dst_buffer->iova + region->dstOffset,
2136                  src_buffer->iova + region->srcOffset,
2137                  region->size, 1);
2138   }
2139}
2140
2141VKAPI_ATTR void VKAPI_CALL
2142tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
2143                   VkBuffer dstBuffer,
2144                   VkDeviceSize dstOffset,
2145                   VkDeviceSize dataSize,
2146                   const void *pData)
2147{
2148   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2149   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2150
2151   struct tu_cs_memory tmp;
2152   VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
2153   if (result != VK_SUCCESS) {
2154      cmd->record_result = result;
2155      return;
2156   }
2157
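   /* Stage the data in GPU-visible command-stream memory; Vulkan requires
    * dataSize to be a multiple of 4, so copying in 4-byte blocks is safe.
    */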
2158   memcpy(tmp.map, pData, dataSize);
2159   copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4);
2160}
2161
2162VKAPI_ATTR void VKAPI_CALL
2163tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
2164                 VkBuffer dstBuffer,
2165                 VkDeviceSize dstOffset,
2166                 VkDeviceSize fillSize,
2167                 uint32_t data)
2168{
2169   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2170   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2171   const struct blit_ops *ops = &r2d_ops;
2172   struct tu_cs *cs = &cmd->cs;
2173
2174   if (fillSize == VK_WHOLE_SIZE)
2175      fillSize = buffer->size - dstOffset;
2176
2177   uint64_t dst_va = buffer->iova + dstOffset;
2178   uint32_t blocks = fillSize / 4;
2179
2180   ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2181              VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2182              VK_SAMPLE_COUNT_1_BIT);
2183   ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
2184
2185   while (blocks) {
2186      uint32_t dst_x = (dst_va & 63) / 4;
2187      uint32_t width = MIN2(blocks, 0x4000 - dst_x);
2188
2189      ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
2190      ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
2191      ops->run(cmd, cs);
2192
2193      dst_va += width * 4;
2194      blocks -= width;
2195   }
2196
2197   ops->teardown(cmd, cs);
2198}
2199
2200VKAPI_ATTR void VKAPI_CALL
2201tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
2202                       const VkResolveImageInfo2* pResolveImageInfo)
2203{
2204   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2205   TU_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
2206   TU_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
2207   const struct blit_ops *ops = &r2d_ops;
2208   struct tu_cs *cs = &cmd->cs;
2209
2210   enum pipe_format src_format =
2211      tu_vk_format_to_pipe_format(src_image->vk.format);
2212   enum pipe_format dst_format =
2213      tu_vk_format_to_pipe_format(dst_image->vk.format);
2214   ops->setup(cmd, cs, src_format, dst_format,
2215              VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
2216              VK_SAMPLE_COUNT_1_BIT);
2217
2218   for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
2219      const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
2220      uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
2221
2222      assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
2223      /* TODO: are other aspect masks possible? */
2224
2225      coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
2226
2227      struct fdl6_view dst, src;
2228      tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
2229      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
2230
2231      for (uint32_t layer = 0; layer < layers; layer++) {
2232         ops->src(cmd, cs, &src, layer, VK_FILTER_NEAREST, dst_format);
2233         ops->dst(cs, &dst, layer, src_format);
2234         ops->run(cmd, cs);
2235      }
2236   }
2237
2238   ops->teardown(cmd, cs);
2239}
2240
2241#define for_each_layer(layer, layer_mask, layers) \
2242   for (uint32_t layer = 0; \
2243        layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
2244        layer++) \
2245      if (!layer_mask || (layer_mask & BIT(layer)))
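/* e.g. layer_mask = 0b101 iterates layers 0 and 2, while layer_mask = 0
 * iterates layers [0, layers).
 */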
2246
2247static void
2248resolve_sysmem(struct tu_cmd_buffer *cmd,
2249               struct tu_cs *cs,
2250               VkFormat vk_src_format,
2251               VkFormat vk_dst_format,
2252               const struct tu_image_view *src,
2253               const struct tu_image_view *dst,
2254               uint32_t layer_mask,
2255               uint32_t layers,
2256               const VkRect2D *rect,
2257               bool src_separate_ds,
2258               bool dst_separate_ds)
2259{
2260   const struct blit_ops *ops = &r2d_ops;
2261
2262   trace_start_sysmem_resolve(&cmd->trace, cs);
2263
2264   enum pipe_format src_format = tu_vk_format_to_pipe_format(vk_src_format);
2265   enum pipe_format dst_format = tu_vk_format_to_pipe_format(vk_dst_format);
2266
2267   ops->setup(cmd, cs, src_format, dst_format,
2268              VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
2269              VK_SAMPLE_COUNT_1_BIT);
2270   ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
2271
2272   for_each_layer(i, layer_mask, layers) {
2273      if (src_separate_ds) {
2274         if (vk_src_format == VK_FORMAT_D32_SFLOAT) {
2275            r2d_src_depth(cmd, cs, src, i, VK_FILTER_NEAREST);
2276         } else {
2277            r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
2278         }
2279      } else {
2280         ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
2281      }
2282
2283      if (dst_separate_ds) {
2284         if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
2285            ops->dst_depth(cs, dst, i);
2286         } else {
2287            ops->dst_stencil(cs, dst, i);
2288         }
2289      } else {
2290         ops->dst(cs, &dst->view, i, src_format);
2291      }
2292
2293      ops->run(cmd, cs);
2294   }
2295
2296   ops->teardown(cmd, cs);
2297
2298   trace_end_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
2299}
2300
2301void
2302tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
2303                  struct tu_cs *cs,
2304                  const struct tu_image_view *src,
2305                  const struct tu_image_view *dst,
2306                  uint32_t layer_mask,
2307                  uint32_t layers,
2308                  const VkRect2D *rect)
2309{
2310   assert(src->image->vk.format == dst->image->vk.format ||
2311          (vk_format_is_depth_or_stencil(src->image->vk.format) &&
2312           vk_format_is_depth_or_stencil(dst->image->vk.format)));
2313
2314   bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
2315   bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
2316
2317   if (dst_separate_ds) {
2318      resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
2319                     src, dst, layer_mask, layers, rect,
2320                     src_separate_ds, dst_separate_ds);
2321      resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
2322                     src, dst, layer_mask, layers, rect,
2323                     src_separate_ds, dst_separate_ds);
2324   } else {
2325      resolve_sysmem(cmd, cs, src->image->vk.format, dst->image->vk.format,
2326                     src, dst, layer_mask, layers, rect,
2327                     src_separate_ds, dst_separate_ds);
2328   }
2329}
2330
2331static void
2332clear_image(struct tu_cmd_buffer *cmd,
2333            struct tu_image *image,
2334            const VkClearValue *clear_value,
2335            const VkImageSubresourceRange *range,
2336            VkImageAspectFlags aspect_mask)
2337{
2338   uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
2339   uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
2340   struct tu_cs *cs = &cmd->cs;
2341   enum pipe_format format;
2342   if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
2343      format = PIPE_FORMAT_R32_UINT;
2344   } else {
2345      format = tu6_plane_format(image->vk.format,
2346                                tu6_plane_index(image->vk.format,
2347                                                aspect_mask));
2348   }
2349
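   /* 3D images can't also have array layers; for them each level's depth
    * slices are iterated as layers in the loop below.
    */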
2350   if (image->layout[0].depth0 > 1) {
2351      assert(layer_count == 1);
2352      assert(range->baseArrayLayer == 0);
2353   }
2354
2355   const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;
2356
2357   ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
2358              image->layout[0].nr_samples);
2359   if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
2360      ops->clear_value(cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
2361   else
2362      ops->clear_value(cs, format, clear_value);
2363
2364   for (unsigned j = 0; j < level_count; j++) {
2365      if (image->layout[0].depth0 > 1)
2366         layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
2367
2368      ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
2369                     u_minify(image->layout[0].width0, range->baseMipLevel + j),
2370                     u_minify(image->layout[0].height0, range->baseMipLevel + j)
2371                  });
2372
2373      struct fdl6_view dst;
2374      tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
2375         .aspectMask = aspect_mask,
2376         .mipLevel = range->baseMipLevel + j,
2377         .baseArrayLayer = range->baseArrayLayer,
2378         .layerCount = 1,
2379      }, 0, false);
2380
2381      for (uint32_t i = 0; i < layer_count; i++) {
2382         ops->dst(cs, &dst, i, format);
2383         ops->run(cmd, cs);
2384      }
2385   }
2386
2387   ops->teardown(cmd, cs);
2388}
2389
2390VKAPI_ATTR void VKAPI_CALL
2391tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
2392                      VkImage image_h,
2393                      VkImageLayout imageLayout,
2394                      const VkClearColorValue *pColor,
2395                      uint32_t rangeCount,
2396                      const VkImageSubresourceRange *pRanges)
2397{
2398   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2399   TU_FROM_HANDLE(tu_image, image, image_h);
2400
2401   for (unsigned i = 0; i < rangeCount; i++)
2402      clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
2403}
2404
2405VKAPI_ATTR void VKAPI_CALL
2406tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
2407                             VkImage image_h,
2408                             VkImageLayout imageLayout,
2409                             const VkClearDepthStencilValue *pDepthStencil,
2410                             uint32_t rangeCount,
2411                             const VkImageSubresourceRange *pRanges)
2412{
2413   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2414   TU_FROM_HANDLE(tu_image, image, image_h);
2415
2416   for (unsigned i = 0; i < rangeCount; i++) {
2417      const VkImageSubresourceRange *range = &pRanges[i];
2418
2419      if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2420         /* can't clear both depth and stencil at once, split up the aspect mask */
2421         u_foreach_bit(b, range->aspectMask)
2422            clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
2423         continue;
2424      }
2425
2426      clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
2427   }
2428
2429   tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges);
2430}
2431
2432static void
2433tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2434                            uint32_t attachment_count,
2435                            const VkClearAttachment *attachments,
2436                            uint32_t rect_count,
2437                            const VkClearRect *rects)
2438{
2439   /* The shader path here is special: it avoids changing MRT/etc. state. */
2440   const struct tu_subpass *subpass = cmd->state.subpass;
2441   const uint32_t mrt_count = subpass->color_count;
2442   struct tu_cs *cs = &cmd->draw_cs;
2443   uint32_t clear_value[MAX_RTS][4];
2444   float z_clear_val = 0.0f;
2445   uint8_t s_clear_val = 0;
2446   uint32_t clear_rts = 0, clear_components = 0;
2447   bool z_clear = false;
2448   bool s_clear = false;
2449
2450   trace_start_sysmem_clear_all(&cmd->trace, cs);
2451
2452   for (uint32_t i = 0; i < attachment_count; i++) {
2453      uint32_t a;
2454      if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2455         uint32_t c = attachments[i].colorAttachment;
2456         a = subpass->color_attachments[c].attachment;
2457         if (a == VK_ATTACHMENT_UNUSED)
2458            continue;
2459
2460         clear_rts |= 1 << c;
2461         clear_components |= 0xf << (c * 4);
2462         memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2463      } else {
2464         a = subpass->depth_stencil_attachment.attachment;
2465         if (a == VK_ATTACHMENT_UNUSED)
2466            continue;
2467
2468         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2469            z_clear = true;
2470            z_clear_val = attachments[i].clearValue.depthStencil.depth;
2471         }
2472
2473         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2474            s_clear = true;
2475            s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2476         }
2477      }
2478   }
2479
2480   /* We may not know the multisample count if there are no attachments, so
2481    * just bail early to avoid corner cases later.
2482    */
2483   if (clear_rts == 0 && !z_clear && !s_clear)
2484      return;
2485
2486   /* Disable all draw states so they don't interfere.
2487    * TODO: use and re-use draw states.
2488    * We have to disable draw states individually to preserve
2489    * input attachment states, because a secondary command buffer
2490    * won't be able to restore them.
2491    */
2492   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
2493   for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
2494      if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
2495          i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
2496         continue;
2497      tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
2498                     CP_SET_DRAW_STATE__0_DISABLE);
2499      tu_cs_emit_qw(cs, 0);
2500   }
2501   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2502
2503   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2504   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2505                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2506                  0xfc000000);
2507   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2508
2509   r3d_common(cmd, cs, false, clear_rts, false, cmd->state.subpass->samples);
2510
2511   /* Disable sample counting in order to not affect occlusion query. */
2512   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
2513
2514   if (cmd->state.prim_generated_query_running_before_rp) {
2515      tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
2516   }
2517
2518   tu_cs_emit_regs(cs,
2519                   A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2520   tu_cs_emit_regs(cs,
2521                   A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2522
2523   tu_cs_emit_regs(cs,
2524                   A6XX_RB_FS_OUTPUT_CNTL0(),
2525                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2526
2527   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2528   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2529   for (uint32_t i = 0; i < mrt_count; i++) {
2530      tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2531            .component_enable = COND(clear_rts & (1 << i), 0xf)));
2532   }
2533
2534   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
2535   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
2536
2537   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2538   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2539         .z_test_enable = z_clear,
2540         .z_write_enable = z_clear,
2541         .zfunc = FUNC_ALWAYS));
2542   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2543   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2544         .stencil_enable = s_clear,
2545         .func = FUNC_ALWAYS,
2546         .zpass = STENCIL_REPLACE));
2547   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2548   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2549   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2550
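   /* Upload the packed clear colors as FS constants, one vec4 per cleared
    * RT, for the clear shader to output.
    */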
2551   unsigned num_rts = util_bitcount(clear_rts);
2552   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2553   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2554                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2555                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2556                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2557                  CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2558   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2559   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2560   u_foreach_bit(b, clear_rts)
2561      tu_cs_emit_array(cs, clear_value[b], 4);
2562
2563   for (uint32_t i = 0; i < rect_count; i++) {
2564      /* This should be true because of this valid usage for
2565       * vkCmdClearAttachments:
2566       *
2567       *    "If the render pass instance this is recorded in uses multiview,
2568       *    then baseArrayLayer must be zero and layerCount must be one"
2569       */
2570      assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
2571
2572      /* a630 doesn't support multiview masks, which means that we can't use
2573       * the normal multiview path without potentially recompiling a shader
2574       * on-demand or using a more complicated variant that takes the mask as
2575       * a const. Just use the layered path instead, since it shouldn't be
2576       * much worse.
2577       */
2578      for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
2579         r3d_coords_raw(cs, (float[]) {
2580            rects[i].rect.offset.x, rects[i].rect.offset.y,
2581            z_clear_val, uif(rects[i].baseArrayLayer + layer),
2582            rects[i].rect.offset.x + rects[i].rect.extent.width,
2583            rects[i].rect.offset.y + rects[i].rect.extent.height,
2584            z_clear_val, 1.0f,
2585         });
2586         r3d_run_vis(cmd, cs);
2587      }
2588   }
2589
2590   /* Re-enable sample counting. */
2591   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
2592
2593   if (cmd->state.prim_generated_query_running_before_rp) {
2594      tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
2595   }
2596
2597   trace_end_sysmem_clear_all(&cmd->trace,
2598                              cs, mrt_count, rect_count);
2599}
2600
2601static void
2602pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
2603{
2604   switch (format) {
2605   case PIPE_FORMAT_Z24X8_UNORM:
2606   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
2607      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2608                       val->depthStencil.stencil << 24;
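      /* e.g. depth = 0.5, stencil = 0x80 packs to 0x80800000 */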
2609      return;
2610   case PIPE_FORMAT_Z16_UNORM:
2611      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2612      return;
2613   case PIPE_FORMAT_Z32_FLOAT:
2614      clear_value[0] = fui(val->depthStencil.depth);
2615      return;
2616   case PIPE_FORMAT_S8_UINT:
2617      clear_value[0] = val->depthStencil.stencil;
2618      return;
2619   default:
2620      break;
2621   }
2622
2623   float tmp[4];
2624   memcpy(tmp, val->color.float32, 4 * sizeof(float));
2625   if (util_format_is_srgb(format)) {
2626      for (int i = 0; i < 3; i++)
2627         tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
2628   }
2629
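   /* PACK_F(type) packs a single 1x1 texel:
    * util_format_<type>_pack_rgba_float(dst, dst_stride, src, src_stride,
    * width, height).
    */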
2630#define PACK_F(type) util_format_##type##_pack_rgba_float \
2631   ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
2632   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
2633   case 4:
2634      PACK_F(r4g4b4a4_unorm);
2635      break;
2636   case 5:
2637      if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
2638         PACK_F(r5g6b5_unorm);
2639      else
2640         PACK_F(r5g5b5a1_unorm);
2641      break;
2642   case 8:
2643      if (util_format_is_snorm(format))
2644         PACK_F(r8g8b8a8_snorm);
2645      else if (util_format_is_unorm(format))
2646         PACK_F(r8g8b8a8_unorm);
2647      else
2648         pack_int8(clear_value, val->color.uint32);
2649      break;
2650   case 10:
2651      if (util_format_is_pure_integer(format))
2652         pack_int10_2(clear_value, val->color.uint32);
2653      else
2654         PACK_F(r10g10b10a2_unorm);
2655      break;
2656   case 11:
2657      clear_value[0] = float3_to_r11g11b10f(val->color.float32);
2658      break;
2659   case 16:
2660      if (util_format_is_snorm(format))
2661         PACK_F(r16g16b16a16_snorm);
2662      else if (util_format_is_unorm(format))
2663         PACK_F(r16g16b16a16_unorm);
2664      else if (util_format_is_float(format))
2665         PACK_F(r16g16b16a16_float);
2666      else
2667         pack_int16(clear_value, val->color.uint32);
2668      break;
2669   case 32:
2670      memcpy(clear_value, val->color.float32, 4 * sizeof(float));
2671      break;
2672   default:
2673      unreachable("unexpected channel size");
2674   }
2675#undef PACK_F
2676}
2677
2678static void
2679clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2680                      struct tu_cs *cs,
2681                      enum pipe_format format,
2682                      uint8_t clear_mask,
2683                      uint32_t gmem_offset,
2684                      const VkClearValue *value)
2685{
2686   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2687   tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
2688
2689   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
2690
2691   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2692   tu_cs_emit(cs, gmem_offset);
2693
2694   tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2695   tu_cs_emit(cs, 0);
2696
2697   uint32_t clear_vals[4] = {};
2698   pack_gmem_clear_value(value, format, clear_vals);
2699
2700   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2701   tu_cs_emit_array(cs, clear_vals, 4);
2702
2703   tu6_emit_event_write(cmd, cs, BLIT);
2704}
2705
2706static void
2707tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2708                              struct tu_cs *cs,
2709                              uint32_t attachment,
2710                              VkImageAspectFlags mask,
2711                              const VkClearValue *value)
2712{
2713   const struct tu_render_pass_attachment *att =
2714      &cmd->state.pass->attachments[attachment];
2715
2716   trace_start_gmem_clear(&cmd->trace, cs);
2717
2718   enum pipe_format format = tu_vk_format_to_pipe_format(att->format);
2719   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2720      if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2721         clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, tu_attachment_gmem_offset(cmd, att), value);
2722      if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2723         clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, tu_attachment_gmem_offset_stencil(cmd, att), value);
      /* balance the tracepoint opened above before the early return */
      trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
2724      return;
2725   }
2726
2727   clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask),
2728                         tu_attachment_gmem_offset(cmd, att), value);
2729
2730   trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
2731}
2732
2733static void
2734tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2735                          uint32_t attachment_count,
2736                          const VkClearAttachment *attachments,
2737                          uint32_t rect_count,
2738                          const VkClearRect *rects)
2739{
2740   const struct tu_subpass *subpass = cmd->state.subpass;
2741   struct tu_cs *cs = &cmd->draw_cs;
2742
2743   if (rect_count > 1)
2744      perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
2745
2746   for (unsigned i = 0; i < rect_count; i++) {
2747      unsigned x1 = rects[i].rect.offset.x;
2748      unsigned y1 = rects[i].rect.offset.y;
2749      unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2750      unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2751
2752      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2753      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2754      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2755
2756      for (unsigned j = 0; j < attachment_count; j++) {
2757         uint32_t a;
2758         if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2759            a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2760         else
2761            a = subpass->depth_stencil_attachment.attachment;
2762
2763         if (a == VK_ATTACHMENT_UNUSED)
2764            continue;
2765
2766         tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2767                                       &attachments[j].clearValue);
2768      }
2769   }
2770}
2771
2772VKAPI_ATTR void VKAPI_CALL
2773tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2774                       uint32_t attachmentCount,
2775                       const VkClearAttachment *pAttachments,
2776                       uint32_t rectCount,
2777                       const VkClearRect *pRects)
2778{
2779   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2780   struct tu_cs *cs = &cmd->draw_cs;
2781
2782   /* The sysmem path behaves like a draw. Note that we don't have a way of using
2783    * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec.
2784    */
2785   tu_emit_cache_flush_renderpass(cmd, cs);
2786
2787   for (uint32_t j = 0; j < attachmentCount; j++) {
2788      if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
2789         continue;
2790
2791      tu_lrz_disable_during_renderpass(cmd);
2792   }
2793
2794   /* vkCmdClearAttachments is supposed to respect the predicate if active. The
2795    * easiest way to do this is to always use the 3d path, which always works
2796    * even with GMEM because it's just a simple draw using the existing
2797    * attachment state.
2798    *
2799    * Similarly, we also use the 3D path when in a secondary command buffer that
2800    * doesn't know the GMEM layout that will be chosen by the primary.
2801    */
2802   if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
2803      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2804      return;
2805   }
2806
2807   /* If we could skip tile load/stores based on any draws intersecting them at
2808    * binning time, then emit the clear as a 3D draw so that it contributes to
2809    * that visibility.
2810    */
2811   const struct tu_subpass *subpass = cmd->state.subpass;
2812   for (uint32_t i = 0; i < attachmentCount; i++) {
2813      uint32_t a;
2814      if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2815         uint32_t c = pAttachments[i].colorAttachment;
2816         a = subpass->color_attachments[c].attachment;
2817      } else {
2818         a = subpass->depth_stencil_attachment.attachment;
2819      }
2820      if (a != VK_ATTACHMENT_UNUSED) {
2821         const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
2822         if (att->cond_load_allowed || att->cond_store_allowed) {
2823            tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2824            return;
2825         }
2826      }
2827   }
2828
2829   /* Otherwise, emit 2D blits for gmem rendering. */
2830   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2831   tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2832   tu_cond_exec_end(cs);
2833
2834   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2835   tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2836   tu_cond_exec_end(cs);
2837}

static void
clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        VkFormat vk_format,
                        VkImageAspectFlags clear_mask,
                        const VkClearValue *value,
                        uint32_t a,
                        bool separate_ds)
{
   enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
   const struct blit_ops *ops = &r2d_ops;
   if (cmd->state.pass->attachments[a].samples > 1)
      ops = &r3d_ops;

   trace_start_sysmem_clear(&cmd->trace, cs);

   ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
              cmd->state.pass->attachments[a].samples);
   ops->coords(cs, &cmd->state.render_area.offset, NULL,
               &cmd->state.render_area.extent);
   ops->clear_value(cs, format, value);

   for_each_layer(i, clear_views, fb->layers) {
      if (separate_ds) {
         if (vk_format == VK_FORMAT_D32_SFLOAT) {
            ops->dst_depth(cs, iview, i);
         } else {
            ops->dst_stencil(cs, iview, i);
         }
      } else {
         ops->dst(cs, &iview->view, i, format);
      }
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_sysmem_clear(&cmd->trace, cs,
                          vk_format, ops == &r3d_ops,
                          cmd->state.pass->attachments[a].samples);
}

void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a,
                           const VkClearValue *value)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 value, a, true);
      }
      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 value, a, true);
      }
   } else {
      clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
                              value, a, false);
   }

   /* The spec doesn't explicitly say, but presumably the initial renderpass
    * clear is considered part of the renderpass, so barriers aren't required
    * inside the subpass/renderpass.  We therefore need to flush CCU color
    * into CCU depth here, just like with vkCmdClearAttachments(). Note that
    * because this only happens at the beginning of a renderpass, and
    * renderpass writes are considered "incoherent", we shouldn't have to
    * worry about syncing depth into color beforehand as depth should already
    * be flushed.
    */
   if (vk_format_is_depth_or_stencil(attachment->format)) {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
   } else {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
   }

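   /* On parts with the CCU flush bug, wait for idle so the flushes above
    * have landed before anything else executes.
    */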
   if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug)
      tu_cs_emit_wfi(cs);
}

void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         const VkClearValue *value)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask, value);
}

static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
             bool resolve,
             bool separate_stencil)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
      .unk0 = !resolve,
      .gmem = !resolve,
      .sample_0 = vk_format_is_int(attachment->format) ||
         vk_format_is_depth_or_stencil(attachment->format)));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         tu_cs_emit(cs, tu_image_view_depth(iview, RB_BLIT_DST_INFO));
         tu_cs_emit_qw(cs, iview->depth_base_addr);
         tu_cs_emit(cs, iview->depth_PITCH);

         tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
         tu_cs_image_flag_ref(cs, &iview->view, 0);
      } else {
         tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
         tu_cs_emit_qw(cs, iview->stencil_base_addr);
         tu_cs_emit(cs, iview->stencil_PITCH);
      }
   } else {
      tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO);
      tu_cs_image_ref_2d(cs, &iview->view, 0, false);

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
      tu_cs_image_flag_ref(cs, &iview->view, 0);
   }

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment)));
   } else {
      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment)));
   }

   tu6_emit_event_write(cmd, cs, BLIT);
}

static bool
blit_can_resolve(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);

   /* The blit event can only do a resolve for simple cases: averaging samples
    * as unsigned integers or choosing just one sample.
    */
   if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
      return false;

   /* Can't do formats with larger channel sizes.
    * Note: this includes all float formats.
    * Note2: single-channel integer formats seem OK.
    */
   if (desc->channel[0].size > 10)
      return false;

   switch (format) {
   /* For unknown reasons the blit event can't MSAA-resolve these formats when
    * tiled, likely because they have a different layout from other cpp=2
    * formats.
    */
   case VK_FORMAT_R8G8_UNORM:
   case VK_FORMAT_R8G8_UINT:
   case VK_FORMAT_R8G8_SINT:
   /* TODO: this one should be able to work? */
   case VK_FORMAT_D24_UNORM_S8_UINT:
      return false;
   default:
      break;
   }

   return true;
}

static void
tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs, bool load)
{
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));

   if (!unlikely(cmd->device->physical_device->instance->debug_flags &
                 TU_DEBUG_LOG_SKIP_GMEM_OPS))
      return;

   uint64_t result_iova;
   if (load)
      result_iova = global_iova(cmd, dbg_gmem_taken_loads);
   else
      result_iova = global_iova(cmd, dbg_gmem_taken_stores);

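   /* Bump the taken-loads/stores counter using CP_MEM_TO_MEM with the
    * dbg_one constant. Since this is emitted inside the PRED_TEST cond_exec,
    * it only counts loads/stores that actually execute.
    */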
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
}

static void
tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
                            struct tu_cs *cs, bool load)
{
   tu_cond_exec_end(cs);

   if (!unlikely(cmd->device->physical_device->instance->debug_flags &
                 TU_DEBUG_LOG_SKIP_GMEM_OPS))
      return;

   uint64_t result_iova;
   if (load)
      result_iova = global_iova(cmd, dbg_gmem_total_loads);
   else
      result_iova = global_iova(cmd, dbg_gmem_total_stores);

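   /* Same counter bump as above, but emitted after the cond_exec ends, so it
    * counts every load/store regardless of the predicate.
    */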
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
}

void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool cond_exec_allowed,
                        bool force_load)
{
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   bool load_common = attachment->load || force_load;
   bool load_stencil =
      attachment->load_stencil ||
      (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);

   if (!load_common && !load_stencil)
      return;

   trace_start_gmem_load(&cmd->trace, cs);

   /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
    * to be only partially cleared, and since the clear is done with a 2D blit
    * it doesn't produce geometry visible to binning, so we have to load
    * unconditionally.
    *
    * To simplify the conditions, treat a partially cleared separate DS as
    * fully cleared and don't emit the cond_exec.
    */
   bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
   if (cond_exec)
      tu_begin_load_store_cond_exec(cmd, cs, true);

   if (load_common)
      tu_emit_blit(cmd, cs, iview, attachment, false, false);

   if (load_stencil)
      tu_emit_blit(cmd, cs, iview, attachment, false, true);

   if (cond_exec)
      tu_end_load_store_cond_exec(cmd, cs, true);

   trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
}

static void
store_cp_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t samples,
              bool separate_stencil,
              enum pipe_format src_format,
              enum pipe_format dst_format,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r2d_setup_common(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
                    iview->view.ubwc_enabled, true);

   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         r2d_dst_depth(cs, iview, 0);
      } else {
         r2d_dst_stencil(cs, iview, 0);
      }
   } else {
      r2d_dst(cs, &iview->view, 0, src_format);
   }

   enum a6xx_format fmt = tu6_format_texture(src_format, TILE6_2).fmt;
   fixup_src_format(&src_format, dst_format, &fmt);

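   /* Source the blit from the attachment's contents in GMEM: the base
    * address is gmem_base plus the attachment's GMEM offset, and the pitch
    * is one tile row (tile0.width * cpp).
    */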
   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = fmt,
                      .color_swap = WZYX,
                      .tile_mode = TILE6_2,
                      .srgb = util_format_is_srgb(src_format),
                      .samples = tu_msaa_samples(samples),
                      .samples_average = !util_format_is_pure_integer(dst_format) &&
                                         !util_format_is_depth_or_stencil(dst_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   /* note: src size does not matter when not scaling */
                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
                   A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.tiling->tile0.width * cpp));

   /* sync GMEM writes with CACHE. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}

static void
store_3d_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t dst_samples,
              bool separate_stencil,
              enum pipe_format src_format,
              enum pipe_format dst_format,
              const VkRect2D *render_area,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
    * aren't set until we know whether we're HW binning or not, and we want to
    * avoid a dependence on that here to be able to store attachments before
    * the end of the renderpass in the future. Use the scratch space to
    * save/restore them dynamically.
    */
   tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
   tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
                  CP_REG_TO_SCRATCH_0_SCRATCH(0) |
                  CP_REG_TO_SCRATCH_0_CNT(1 - 1));
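   /* RB_BIN_CONTROL and GRAS_BIN_CONTROL are programmed with the same value,
    * so one saved scratch register is enough to restore both below.
    */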

   r3d_setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
             iview->view.ubwc_enabled, dst_samples);

   r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         r3d_dst_depth(cs, iview, 0);
      } else {
         r3d_dst_stencil(cs, iview, 0);
      }
   } else {
      r3d_dst(cs, &iview->view, 0, src_format);
   }

   r3d_src_gmem(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);

   /* sync GMEM writes with CACHE. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   r3d_run(cmd, cs);

   r3d_teardown(cmd, cs);

   /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here. The 3d blit path
    * writes to depth images as a color RT, so there's no need to flush depth.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);

   /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));

   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));
}

static bool
tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const VkRect2D *render_area = &cmd->state.render_area;

   /* Unaligned stores are incredibly rare in the CTS, so we have to force
    * them in order to test this path. */
   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE))
      return true;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width;
   uint32_t y2 = y1 + render_area->extent.height;
   /* x2/y2 can be unaligned if equal to the size of the image, since it will
    * write into padding space. The one exception is linear levels, which
    * don't have the required y padding in the layout (except for the last
    * level).
    */
   bool need_y2_align =
      y2 != iview->view.height || iview->view.need_y2_align;

   return (x1 % phys_dev->info->gmem_align_w ||
           (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
           y1 % phys_dev->info->gmem_align_h ||
           (y2 % phys_dev->info->gmem_align_h && need_y2_align));
}

/* Choose the GMEM layout (use the CCU space or not) based on whether the
 * current attachments will need it.  This has to happen at vkBeginRenderPass()
 * time because tu_attachment_store_unaligned() looks at the image views, which
 * are only available at that point.  This should match the logic for the
 * !unaligned case in tu_store_gmem_attachment().
 */
void
tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
{
   cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;

   for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
      if (!cmd->state.attachments[i])
         continue;

      struct tu_render_pass_attachment *att =
         &cmd->state.pass->attachments[i];
      if ((att->store || att->store_stencil) &&
          tu_attachment_store_unaligned(cmd, i))
         cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
      if (att->will_be_resolved && !blit_can_resolve(att->format))
         cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
   }

   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
}

void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a,
                         bool cond_exec_allowed)
{
   const VkRect2D *render_area = &cmd->state.render_area;
   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
   const struct tu_image_view *iview = cmd->state.attachments[a];
   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];

   if (!dst->store && !dst->store_stencil)
      return;

   /* An unconditional store should happen only if the attachment was cleared,
    * which could have happened either by load_op or via vkCmdClearAttachments.
    */
   bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
   if (cond_exec) {
      tu_begin_load_store_cond_exec(cmd, cs, false);
   }

   bool unaligned = tu_attachment_store_unaligned(cmd, a);

   /* D32_SFLOAT_S8_UINT is a special format: it has two planes, one for
    * depth and the other for stencil. When resolving a MSAA
    * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
    */
   bool resolve_d32s8_s8 =
      src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   /* The fast path doesn't support picking out the last component of a D24S8
    * texture reinterpreted as RGBA8_UNORM.
    */
   bool resolve_d24s8_s8 =
      src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   bool store_common = dst->store && !resolve_d32s8_s8;
   bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;

   trace_start_gmem_store(&cmd->trace, cs);

   /* use fast path when render area is aligned, except for unsupported resolve cases */
   if (!unaligned && !resolve_d24s8_s8 &&
       (a == gmem_a || blit_can_resolve(dst->format))) {
      if (store_common)
         tu_emit_blit(cmd, cs, iview, src, true, false);
      if (store_separate_stencil)
         tu_emit_blit(cmd, cs, iview, src, true, true);

      if (cond_exec) {
         tu_end_load_store_cond_exec(cmd, cs, false);
      }

      trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
      return;
   }

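   /* The fallback paths below go through the CCU, so they depend on
    * tu_choose_gmem_layout() having picked the layout that keeps GMEM
    * allocations out of the CCU's space.
    */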
   assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);

   enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format);
   if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      src_format = PIPE_FORMAT_Z32_FLOAT;

   enum pipe_format dst_format = tu_vk_format_to_pipe_format(dst->format);
   if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      dst_format = PIPE_FORMAT_Z32_FLOAT;

   if (dst->samples > 1) {
      /* If we hit this path, we have to disable draw states after every tile
       * instead of once at the end of the renderpass, so that they aren't
       * executed when calling CP_DRAW.
       *
       * TODO: store a flag somewhere so we don't do this more than once and
       * don't do it after the renderpass when this happens.
       */
      if (store_common || store_separate_stencil)
         tu_disable_draw_states(cmd, cs);

      if (store_common) {
         store_3d_blit(cmd, cs, iview, dst->samples, false, src_format,
                       dst_format, render_area, tu_attachment_gmem_offset(cmd, src), src->cpp);
      }
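      /* The separate stencil plane is S8_UINT, so its GMEM cpp is one byte
       * per sample, hence src->samples is passed as cpp here (and in the 2D
       * path below).
       */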
      if (store_separate_stencil) {
         store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
                       PIPE_FORMAT_S8_UINT, render_area,
                       tu_attachment_gmem_offset_stencil(cmd, src), src->samples);
      }
   } else {
      r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

      if (store_common) {
         store_cp_blit(cmd, cs, iview, src->samples, false, src_format,
                       dst_format, tu_attachment_gmem_offset(cmd, src), src->cpp);
      }
      if (store_separate_stencil) {
         store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
                       PIPE_FORMAT_S8_UINT, tu_attachment_gmem_offset_stencil(cmd, src), src->samples);
      }
   }

   if (cond_exec) {
      tu_end_load_store_cond_exec(cmd, cs, false);
   }

   trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
}