/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_clear_blit.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_formats.h"
#include "tu_image.h"
#include "tu_tracepoints.h"

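/* Rounds a [0,1] float to an n-bit unorm value: e.g. 1.0f at 8 bits packs
 * to 0xff, and 0.5f scales to 127.5, which _mesa_lroundevenf rounds to 128
 * (ties go to even).
 */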
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

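/* Picks the 2D engine's intermediate format from the red channel width,
 * e.g. R10G10B10A2_UNORM (10-bit red channel) resolves to R2D_FLOAT16 and
 * R32G32B32A32_UINT to R2D_INT32.
 */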
static enum a6xx_2d_ifmt
format_to_ifmt(enum pipe_format format)
{
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
      return R2D_FLOAT32;
   if (format == PIPE_FORMAT_S8_UINT)
      return R2D_INT8;
   if (format == PIPE_FORMAT_A8_UNORM)
      return R2D_UNORM8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = util_format_is_pure_integer(format);
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (util_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
      return 0;
   }
}

static void
r2d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst->x,                     .y = dst->y),
      A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));

   if (!src)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src->x),
                   A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src->y),
                   A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
}

static void
r2d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
   case PIPE_FORMAT_Z24X8_UNORM:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
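      /* The engine presumably consumes only the low byte of each solid-color
       * channel, so the packed 24-bit depth is split across the r/g/b byte
       * lanes below (e.g. depth 1.0 packs to 0xffffff -> 0xff in each lane).
       */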
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_R9G9B9E5_FLOAT:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!util_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = util_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
             format == PIPE_FORMAT_R11G11B10_FLOAT);

      for (unsigned i = 0; i < desc->nr_channels; i++) {
         const struct util_format_channel_description *ch = &desc->channel[i];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}

static void
fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
                 enum a6xx_format *fmt)
{
   /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
    * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
    * We also have to move stencil, which is normally in the .w channel, into
    * the right channel. Reinterpreting the S8 texture as A8_UNORM solves both
    * problems, and avoids using a swap, which seems to sometimes not work
    * with a D24S8 source, or a texture swizzle which is only supported with
    * the 3d path. Sometimes this blit happens on already-constructed
    * fdl6_views, e.g. for sysmem resolves, so this has to happen as a fixup.
    */
   if (*src_format == PIPE_FORMAT_S8_UINT &&
       (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *fmt = FMT6_A8_UNORM;
      *src_format = PIPE_FORMAT_A8_UNORM;
   }
}

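/* The mirror of fixup_src_format: an S8 destination written from a D24S8
 * source is likewise reinterpreted as A8_UNORM, for the same reasons.
 */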
static void
fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
                 enum a6xx_format *fmt)
{
   if (*dst_format == PIPE_FORMAT_S8_UINT &&
       (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *dst_format = PIPE_FORMAT_A8_UNORM;
      *fmt = FMT6_A8_UNORM;
   }
}

static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   enum a6xx_format fmt = (src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);

   src_info =
      (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
      A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer,
              VkFilter filter)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, iview->depth_PITCH << 9);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, iview->stencil_PITCH << 9);
}

static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = color_format,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
                   A6XX_SP_PS_2D_SRC(.qword = va),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
}

static void
r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t dst_info = iview->RB_2D_DST_INFO;
   enum a6xx_format fmt = dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK;
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);

   dst_info =
         (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, dst_info);
   tu_cs_image_ref_2d(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   tu_cs_emit(cs, iview->depth_PITCH);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, iview->stencil_PITCH);
}

static void
r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR);
   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);
   fmt.fmt = color_fmt;

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = fmt.fmt,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   enum a6xx_format fmt = tu6_base_format(dst_format);
   fixup_dst_format(src_format, &dst_format, &fmt);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);

   uint32_t unknown_8c01 = 0;

   if ((dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       dst_format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) {
      fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   /* note: the only format with partial clearing is D24S8 */
   if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .scissor = scissor,
         .rotate = blit_param,
         .solid_color = clear,
         .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .color_format = fmt,
         .mask = 0xf,
         .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
      fmt = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
         .sint = util_format_is_pure_sint(dst_format),
         .uint = util_format_is_pure_uint(dst_format),
         .color_format = fmt,
         .srgb = util_format_is_srgb(dst_format),
         .mask = 0xf));
}

static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
   }

   r2d_setup_common(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
}

/* r3d_ = shader path operations */

static nir_ssa_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_uniform(b, components, 32, nir_imm_int(b, 0),
                           .base = base);
}

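/* Constant layout shared with the 3d blit path (uploaded by
 * r3d_coords_raw/r3d_coord_z below): c0.xy = dst x0/y0, c0.zw = src x0/y0,
 * c1.xy = dst x1/y1, c1.zw = src x1/y1, c2.x = src z ("z scale" path only).
 * load_const's base is in dwords, so e.g. base 4 reads from c1.x.
 */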
static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_ssa_def *vert0_pos = load_const(b, 0, 2);
   nir_ssa_def *vert1_pos = load_const(b, 4, 2);
   nir_ssa_def *vertex = nir_load_vertex_id(b);

   nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     nir_imm_float(b, 0.0),
                     nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_ssa_def *vert0_coords = load_const(b, 2, 2);
   nir_ssa_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with "z scale" blit path which uses a 3d texture */
   nir_ssa_def *z_coord = load_const(b, 8, 1);

   nir_ssa_def *coords = nir_bcsel(b, nir_i2b1(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}

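/* Clear-path VS constant layout: c0.xy and c1.xy are the rect corners,
 * c0.z the clear depth and c0.w the destination layer.
 */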
static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_ssa_def *vert0_pos = load_const(b, 0, 2);
   nir_ssa_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_ssa_def *depth = load_const(b, 2, 1);
   nir_ssa_def *vertex = nir_load_vertex_id(b);

   nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_ssa_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}

static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->dest.ssa, 0xf);

   return b->shader;
}

/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(coord);
   tex->coord_components = 2;

   tex->src[1].src_type = nir_tex_src_ms_index;
   tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b));

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->dest.ssa, 0xf);

   return b->shader;
}

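/* The clear color for RT i is read from const vec4 slot ci (load_const
 * base 4 * i), matching the per-RT values the clear path uploads.
 */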
static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      nir_ssa_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}

static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   ir3_finalize_nir(dev->compiler, nir);

   struct ir3_shader *sh =
      ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) {
                              .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                              .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                              .reserved_user_consts = align(consts, 4),
                          }, NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo->map;

   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = sh;
   dev->global_shader_variants[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo->iova +
      gb_offset(shaders[*offset]);
   *offset += align(so->info.sizedwords, 32);
}

void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(), 0, &offset, GLOBAL_SH_FS_COPY_MS);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     GLOBAL_SH_FS_CLEAR0 + num_rts);
   }
}

void
tu_destroy_clear_blit_shaders(struct tu_device *dev)
{
   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
      if (dev->global_shaders[i])
         ir3_shader_destroy(dev->global_shaders[i]);
   }
}

static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   enum global_shader vs_id =
      blit ? GLOBAL_SH_VS_BLIT : GLOBAL_SH_VS_CLEAR;

   struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale)
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   else if (samples != VK_SAMPLE_COUNT_1_BIT)
      fs_id = GLOBAL_SH_FS_COPY_MS;

   unsigned num_rts = util_bitcount(rts_mask);
   if (!blit)
      fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts;

   struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .gfx_ibo = true,
         .cs_ibo = true,
         .gfx_shared_const = true,
         .gfx_bindless = 0x1f,
         .cs_bindless = 0x1f));

   tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs);

   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());

   if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) {
      /* Copy what the blob does here. This will emit an extra 0x3f
       * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
       * this is working around yet.
       */
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
      tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
      tu_cs_emit(cs, 0);
   } else {
      tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL());
   }
   tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());

   tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs, 0);

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_fs_inputs(cs, fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .persp_division_disable = 1,
                      .vp_xform_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .clip_disable = 1));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());
   tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid));
      }
   }

   cmd->state.line_mode = RECTANGULAR;
   tu6_emit_msaa(cs, samples, cmd->state.line_mode);
}

static void
r3d_coords_raw(struct tu_cs *cs, const float *coords)
{
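   /* NUM_UNIT for ST6_CONSTANTS is in vec4 units: the 8 dwords emitted
    * below fill c0-c1 of the blit VS (see the constant layout above).
    */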
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(2));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
}

/* z coordinate for "z scale" blit path which uses a 3d texture */
static void
r3d_coord_z(struct tu_cs *cs, float z)
{
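   /* DST_OFF is likewise in vec4 units: offset 2 writes c2, whose .x the
    * zscale blit VS reads (load_const base 8); the trailing zeros pad out
    * the vec4.
    */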
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit(cs, fui(z));
   tu_cs_emit(cs, 0);
   tu_cs_emit(cs, 0);
   tu_cs_emit(cs, 0);
}

static void
r3d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   int32_t src_x1 = src ? src->x : 0;
   int32_t src_y1 = src ? src->y : 0;
   r3d_coords_raw(cs, (float[]) {
      dst->x,                 dst->y,
      src_x1,                 src_y1,
      dst->x + extent->width, dst->y + extent->height,
      src_x1 + extent->width, src_y1 + extent->height,
   });
}

static void
r3d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
   } break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      tu_cs_emit(cs, fui(val->depthStencil.depth));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   case PIPE_FORMAT_S8_UINT:
      tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   default:
      /* as color formats use clear value as-is */
      assert(!util_format_is_depth_or_stencil(format));
      tu_cs_emit_array(cs, val->color.uint32, 4);
      break;
   }
}

static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;
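   /* (dwords 4-5 of the descriptor hold the base address and dwords 7-8 the
    * UBWC flags address, hence the offsets patched above)
    */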
918 
919    texture.map[A6XX_TEX_CONST_DWORDS + 0] =
920       A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
921       A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
922       A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
923       A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
924       A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
925       0x60000; /* XXX used by blob, doesn't seem necessary */
926    texture.map[A6XX_TEX_CONST_DWORDS + 1] =
927       A6XX_TEX_SAMP_1_UNNORM_COORDS |
928       A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
929    texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
930    texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
931 
932    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
933    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
934                CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
935                CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
936                CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
937                CP_LOAD_STATE6_0_NUM_UNIT(1));
938    tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
939 
940    tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));
941 
942    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
943    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
944       CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
945       CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
946       CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
947       CP_LOAD_STATE6_0_NUM_UNIT(1));
948    tu_cs_emit_qw(cs, texture.iova);
949 
950    tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
951    tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
952 }
953 
954 static void
r3d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, VkFilter filter, enum pipe_format dst_format)955 r3d_src(struct tu_cmd_buffer *cmd,
956         struct tu_cs *cs,
957         const struct fdl6_view *iview,
958         uint32_t layer,
959         VkFilter filter,
960         enum pipe_format dst_format)
961 {
962    uint32_t desc[A6XX_TEX_CONST_DWORDS];
963    memcpy(desc, iview->descriptor, sizeof(desc));
964 
965    enum a6xx_format fmt = (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >>
966          A6XX_TEX_CONST_0_FMT__SHIFT;
967    enum pipe_format src_format = iview->format;
968    fixup_src_format(&src_format, dst_format, &fmt);
969    desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
970       A6XX_TEX_CONST_0_FMT(fmt);
971 
972    r3d_src_common(cmd, cs, desc,
973                   iview->layer_size * layer,
974                   iview->ubwc_layer_size * layer,
975                   filter);
976 }
977 
978 static void
r3d_src_buffer(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, uint32_t width, uint32_t height, enum pipe_format dst_format)979 r3d_src_buffer(struct tu_cmd_buffer *cmd,
980                struct tu_cs *cs,
981                enum pipe_format format,
982                uint64_t va, uint32_t pitch,
983                uint32_t width, uint32_t height,
984                enum pipe_format dst_format)
985 {
986    uint32_t desc[A6XX_TEX_CONST_DWORDS];
987 
988    struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR);
989    enum a6xx_format color_format = fmt.fmt;
990    fixup_src_format(&format, dst_format, &color_format);
991 
992    desc[0] =
993       COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
994       A6XX_TEX_CONST_0_FMT(color_format) |
995       A6XX_TEX_CONST_0_SWAP(fmt.swap) |
996       A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
997       A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
998       A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
999       A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1000    desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
1001    desc[2] =
1002       A6XX_TEX_CONST_2_PITCH(pitch) |
1003       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1004    desc[3] = 0;
1005    desc[4] = va;
1006    desc[5] = va >> 32;
1007    for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1008       desc[i] = 0;
1009 
1010    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1011 }
1012 
1013 static void
r3d_src_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, enum pipe_format format, enum pipe_format dst_format, uint32_t gmem_offset, uint32_t cpp)1014 r3d_src_gmem(struct tu_cmd_buffer *cmd,
1015              struct tu_cs *cs,
1016              const struct tu_image_view *iview,
1017              enum pipe_format format,
1018              enum pipe_format dst_format,
1019              uint32_t gmem_offset,
1020              uint32_t cpp)
1021 {
1022    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1023    memcpy(desc, iview->view.descriptor, sizeof(desc));
1024 
1025    enum a6xx_format fmt = tu6_format_texture(format, TILE6_LINEAR).fmt;
1026    fixup_src_format(&format, dst_format, &fmt);
1027 
1028    /* patch the format so that depth/stencil get the right format and swizzle */
1029    desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1030                 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1031                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1032    desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
1033                A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1034                A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1035                A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1036                A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1037 
1038    /* patched for gmem */
1039    desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1040    desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1041    desc[2] =
1042       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1043       A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
1044    desc[3] = 0;
1045    desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1046    desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
1047    for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1048       desc[i] = 0;
1049 
1050    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1051 }
1052 
1053 static void
r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, enum pipe_format src_format)1054 r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1055         enum pipe_format src_format)
1056 {
1057    uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;
1058 
1059    enum a6xx_format fmt = mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK;
1060    enum pipe_format dst_format = iview->format;
1061    fixup_dst_format(src_format, &dst_format, &fmt);
1062    mrt_buf_info =
1063       (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
1064       A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
1065    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1066    tu_cs_emit(cs, mrt_buf_info);
1067    tu_cs_image_ref(cs, iview, layer);
1068    tu_cs_emit(cs, 0);
1069 
1070    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1071    tu_cs_image_flag_ref(cs, iview, layer);
1072 
1073    /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
1074     * FMT6_NV12_Y.
1075     */
1076    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));
1077 
1078    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
1079 }
1080 
1081 static void
r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)1082 r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1083 {
1084    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1085    tu_cs_emit(cs, tu_image_view_depth(iview, RB_MRT_BUF_INFO));
1086    tu_cs_image_depth_ref(cs, iview, layer);
1087    tu_cs_emit(cs, 0);
1088 
1089    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1090    tu_cs_image_flag_ref(cs, &iview->view, layer);
1091 
1092    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->view.ubwc_enabled));
1093 }
1094 
1095 static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)1096 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1097 {
1098    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1099    tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
1100    tu_cs_image_stencil_ref(cs, iview, layer);
1101    tu_cs_emit(cs, 0);
1102 
1103    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
1104 }
1105 
1106 static void
r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, enum pipe_format src_format)1107 r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1108                enum pipe_format src_format)
1109 {
1110    struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR);
1111 
1112    enum a6xx_format color_fmt = fmt.fmt;
1113    fixup_dst_format(src_format, &format, &color_fmt);
1114 
1115    tu_cs_emit_regs(cs,
1116                    A6XX_RB_MRT_BUF_INFO(0, .color_format = color_fmt, .color_swap = fmt.swap),
1117                    A6XX_RB_MRT_PITCH(0, pitch),
1118                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1119                    A6XX_RB_MRT_BASE(0, .qword = va),
1120                    A6XX_RB_MRT_BASE_GMEM(0, 0));
1121 
1122    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
1123 }
1124 
1125 static uint8_t
aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)1126 aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1127 {
1128    uint8_t mask = 0xf;
1129    assert(aspect_mask);
1130    /* note: the only format with partial writing is D24S8,
1131     * clear/blit uses the _AS_R8G8B8A8 format to access it
1132     */
1133    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1134       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1135          mask = 0x7;
1136       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1137          mask = 0x8;
1138    }
1139    return mask;
1140 }
1141 
1142 static void
r3d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format src_format, enum pipe_format dst_format, VkImageAspectFlags aspect_mask, unsigned blit_param, bool clear, bool ubwc, VkSampleCountFlagBits samples)1143 r3d_setup(struct tu_cmd_buffer *cmd,
1144           struct tu_cs *cs,
1145           enum pipe_format src_format,
1146           enum pipe_format dst_format,
1147           VkImageAspectFlags aspect_mask,
1148           unsigned blit_param,
1149           bool clear,
1150           bool ubwc,
1151           VkSampleCountFlagBits samples)
1152 {
1153    enum a6xx_format fmt = tu6_base_format(dst_format);
1154    fixup_dst_format(src_format, &dst_format, &fmt);
1155 
1156    if ((dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
1157         dst_format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) {
1158       fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
1159    }
1160 
1161    if (!cmd->state.pass) {
1162       tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
1163       tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1164    }
1165 
1166    tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
1167    tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
1168 
1169    r3d_common(cmd, cs, !clear, 1, blit_param, samples);
1170 
1171    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1172    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1173                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1174                   0xfc000000);
1175    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
1176 
1177    tu_cs_emit_regs(cs,
1178                    A6XX_RB_FS_OUTPUT_CNTL0(),
1179                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1180 
1181    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1182    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1183 
1184    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1185    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1186    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1187    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1188    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1189    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1190    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1191 
1192    tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
1193    tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
1194 
1195    tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1196                         .color_format = fmt,
1197                         .color_sint = util_format_is_pure_sint(dst_format),
1198                         .color_uint = util_format_is_pure_uint(dst_format)));
1199 
1200    tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1201       .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1202    tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1203    tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1204 
1205    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1206    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1207 
1208    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1209                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1210 
1211    /* Disable sample counting in order to not affect occlusion query. */
1212    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1213 
1214    if (cmd->state.prim_generated_query_running_before_rp) {
1215       tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
1216    }
1217 
1218    if (cmd->state.predication_active) {
1219       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1220       tu_cs_emit(cs, 0);
1221    }
1222 }
1223 
1224 static void
r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)1225 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1226 {
1227    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1228    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1229                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1230                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1231    tu_cs_emit(cs, 1); /* instance count */
1232    tu_cs_emit(cs, 2); /* vertex count */
1233 }
1234 
1235 static void
r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)1236 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1237 {
1238    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1239    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1240                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1241                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1242    tu_cs_emit(cs, 1); /* instance count */
1243    tu_cs_emit(cs, 2); /* vertex count */
1244 }
1245 
1246 static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)1247 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1248 {
1249    if (cmd->state.predication_active) {
1250       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1251       tu_cs_emit(cs, 1);
1252    }
1253 
1254    /* Re-enable sample counting. */
1255    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1256 
1257    if (cmd->state.prim_generated_query_running_before_rp) {
1258       tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
1259    }
1260 }
1261 
1262 /* blit ops - common interface for 2d/shader paths */
1263 
1264 struct blit_ops {
1265    void (*coords)(struct tu_cs *cs,
1266                   const VkOffset2D *dst,
1267                   const VkOffset2D *src,
1268                   const VkExtent2D *extent);
1269    void (*clear_value)(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val);
1270    void (*src)(
1271         struct tu_cmd_buffer *cmd,
1272         struct tu_cs *cs,
1273         const struct fdl6_view *iview,
1274         uint32_t layer,
1275         VkFilter filter,
1276         enum pipe_format dst_format);
1277    void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1278                       enum pipe_format format,
1279                       uint64_t va, uint32_t pitch,
1280                       uint32_t width, uint32_t height,
1281                       enum pipe_format dst_format);
1282    void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1283                enum pipe_format src_format);
1284    void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1285    void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1286    void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1287                       enum pipe_format src_format);
1288    void (*setup)(struct tu_cmd_buffer *cmd,
1289                  struct tu_cs *cs,
1290                  enum pipe_format src_format,
1291                  enum pipe_format dst_format,
1292                  VkImageAspectFlags aspect_mask,
1293                  unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1294                  bool clear,
1295                  bool ubwc,
1296                  VkSampleCountFlagBits samples);
1297    void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1298    void (*teardown)(struct tu_cmd_buffer *cmd,
1299                     struct tu_cs *cs);
1300 };
1301 
1302 static const struct blit_ops r2d_ops = {
1303    .coords = r2d_coords,
1304    .clear_value = r2d_clear_value,
1305    .src = r2d_src,
1306    .src_buffer = r2d_src_buffer,
1307    .dst = r2d_dst,
1308    .dst_depth = r2d_dst_depth,
1309    .dst_stencil = r2d_dst_stencil,
1310    .dst_buffer = r2d_dst_buffer,
1311    .setup = r2d_setup,
1312    .run = r2d_run,
1313    .teardown = r2d_teardown,
1314 };
1315 
1316 static const struct blit_ops r3d_ops = {
1317    .coords = r3d_coords,
1318    .clear_value = r3d_clear_value,
1319    .src = r3d_src,
1320    .src_buffer = r3d_src_buffer,
1321    .dst = r3d_dst,
1322    .dst_depth = r3d_dst_depth,
1323    .dst_stencil = r3d_dst_stencil,
1324    .dst_buffer = r3d_dst_buffer,
1325    .setup = r3d_setup,
1326    .run = r3d_run,
1327    .teardown = r3d_teardown,
1328 };
1329 
1330 /* passthrough set coords from 3D extents */
1331 static void
coords(const struct blit_ops *ops, struct tu_cs *cs, const VkOffset3D *dst, const VkOffset3D *src, const VkExtent3D *extent)1332 coords(const struct blit_ops *ops,
1333        struct tu_cs *cs,
1334        const VkOffset3D *dst,
1335        const VkOffset3D *src,
1336        const VkExtent3D *extent)
1337 {
1338    ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1339 }
1340 
1341 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1342  * to be a bit careful because we have to pick a format with matching UBWC
1343  * compression behavior, so no just returning R8_UINT/R16_UINT/R32_UINT for
1344  * everything.
1345  */
1346 static enum pipe_format
copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)1347 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1348 {
1349    if (vk_format_is_compressed(vk_format)) {
1350       switch (vk_format_get_blocksize(vk_format)) {
1351       case 1: return PIPE_FORMAT_R8_UINT;
1352       case 2: return PIPE_FORMAT_R16_UINT;
1353       case 4: return PIPE_FORMAT_R32_UINT;
1354       case 8: return PIPE_FORMAT_R32G32_UINT;
1355       case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
1356       default:
1357          unreachable("unhandled format size");
1358       }
1359    }
1360 
1361    enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
1362 
1363    /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
1364     * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1365     * (also -1.0), when we're supposed to be memcpying the bits. See
1366     * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1367     */
1368    format = util_format_snorm_to_unorm(format);
1369 
1370    switch (format) {
1371    case PIPE_FORMAT_R9G9B9E5_FLOAT:
1372       return PIPE_FORMAT_R32_UINT;
1373 
1374    case PIPE_FORMAT_G8_B8R8_420_UNORM:
1375       if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
1376          return PIPE_FORMAT_R8G8_UNORM;
1377       else
1378          return PIPE_FORMAT_Y8_UNORM;
1379    case PIPE_FORMAT_G8_B8_R8_420_UNORM:
1380       return PIPE_FORMAT_R8_UNORM;
1381 
1382    case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1383       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1384          return PIPE_FORMAT_S8_UINT;
1385       assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
1386       return PIPE_FORMAT_Z32_FLOAT;
1387 
1388    default:
1389       return format;
1390    }
1391 }
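
/* Illustrative examples of the mapping above (values follow directly from
 * the code, the call sites are hypothetical):
 *
 *    copy_format(VK_FORMAT_BC1_RGB_UNORM_BLOCK, ...)    8-byte blocks
 *       -> PIPE_FORMAT_R32G32_UINT
 *    copy_format(VK_FORMAT_D32_SFLOAT_S8_UINT,
 *                VK_IMAGE_ASPECT_STENCIL_BIT)
 *       -> PIPE_FORMAT_S8_UINT
 *    copy_format(VK_FORMAT_R8G8B8A8_SNORM, ...)
 *       -> PIPE_FORMAT_R8G8B8A8_UNORM  (snorm copied as unorm, see above)
 */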
1392 
1393 void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image *image, const VkClearValue *value)1394 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
1395               struct tu_cs *cs,
1396               struct tu_image *image,
1397               const VkClearValue *value)
1398 {
1399    const struct blit_ops *ops = &r2d_ops;
1400 
1401    /* It is assumed that the LRZ cache has been invalidated at this point,
1402     * so that the writes here become visible to LRZ.
1403     *
1404     * LRZ writes go through the UCHE cache, so flush UCHE before changing
1405     * LRZ via CCU. There is no need to invalidate the CCU since we are
1406     * presumably writing whole cache lines, which we assume to be 64 bytes.
1407     */
1408    tu6_emit_event_write(cmd, &cmd->cs, CACHE_FLUSH_TS);
1409 
1410    ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
1411               VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
1412               VK_SAMPLE_COUNT_1_BIT);
1413    ops->clear_value(cs, PIPE_FORMAT_Z16_UNORM, value);
1414    ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
1415                    image->iova + image->lrz_offset,
1416                    image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
1417    ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
1418    ops->run(cmd, cs);
1419    ops->teardown(cmd, cs);
1420 
1421    /* The clear writes via CCU color in the PS stage, while LRZ is read via
1422     * UCHE in the earlier GRAS stage.
1423     */
1424    cmd->state.cache.flush_bits |=
1425       TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
1426       TU_CMD_FLAG_WAIT_FOR_IDLE;
1427 }
1428 
1429 void
tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image *image)1430 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
1431                  struct tu_cs *cs,
1432                  struct tu_image *image)
1433 {
1434    const struct blit_ops *ops = &r2d_ops;
1435    VkClearValue clear = { .color = { .uint32[0] = 0xffffffff } };
1436 
1437    /* The LRZ fast-clear buffer is always allocated with a size of 512 bytes. */
1438    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
1439               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
1440               VK_SAMPLE_COUNT_1_BIT);
1441    ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &clear);
1442    ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
1443                    image->iova + image->lrz_fc_offset, 512,
1444                    PIPE_FORMAT_R32_UINT);
1445    ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {128, 1});
1446    ops->run(cmd, cs);
1447    ops->teardown(cmd, cs);
1448 }
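
/* Arithmetic behind the {128, 1} extent above: the clear writes 128 R32_UINT
 * texels in a single row, and 128 * sizeof(uint32_t) == 512 bytes, exactly
 * the size of the fast-clear buffer.
 */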
1449 
1450 static void
tu_image_view_copy_blit(struct fdl6_view *iview, struct tu_image *image, enum pipe_format format, const VkImageSubresourceLayers *subres, uint32_t layer, bool z_scale)1451 tu_image_view_copy_blit(struct fdl6_view *iview,
1452                         struct tu_image *image,
1453                         enum pipe_format format,
1454                         const VkImageSubresourceLayers *subres,
1455                         uint32_t layer,
1456                         bool z_scale)
1457 {
1458    VkImageAspectFlags aspect_mask = subres->aspectMask;
1459 
1460    /* always use the AS_R8G8B8A8 format for these */
1461    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
1462        format == PIPE_FORMAT_Z24X8_UNORM) {
1463       aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1464    }
1465 
1466    const struct fdl_layout *layout =
1467       &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
1468 
1469    fdl6_view_init(iview, &layout, &(struct fdl_view_args) {
1470       .iova = image->iova,
1471       .base_array_layer = subres->baseArrayLayer + layer,
1472       .layer_count = 1,
1473       .base_miplevel = subres->mipLevel,
1474       .level_count = 1,
1475       .format = tu_format_for_aspect(format, aspect_mask),
1476       .swiz = {
1477          PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
1478       },
1479       .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
1480    }, false);
1481 }
1482 
1483 static void
tu_image_view_copy(struct fdl6_view *iview, struct tu_image *image, enum pipe_format format, const VkImageSubresourceLayers *subres, uint32_t layer)1484 tu_image_view_copy(struct fdl6_view *iview,
1485                    struct tu_image *image,
1486                    enum pipe_format format,
1487                    const VkImageSubresourceLayers *subres,
1488                    uint32_t layer)
1489 {
1490    tu_image_view_copy_blit(iview, image, format, subres, layer, false);
1491 }
1492 
1493 static void
tu_image_view_blit(struct fdl6_view *iview, struct tu_image *image, const VkImageSubresourceLayers *subres, uint32_t layer)1494 tu_image_view_blit(struct fdl6_view *iview,
1495                    struct tu_image *image,
1496                    const VkImageSubresourceLayers *subres,
1497                    uint32_t layer)
1498 {
1499    enum pipe_format format =
1500       tu6_plane_format(image->vk.format, tu6_plane_index(image->vk.format,
1501                                                          subres->aspectMask));
1502    tu_image_view_copy_blit(iview, image, format, subres, layer, false);
1503 }
1504 
1505 static void
tu6_blit_image(struct tu_cmd_buffer *cmd, struct tu_image *src_image, struct tu_image *dst_image, const VkImageBlit2 *info, VkFilter filter)1506 tu6_blit_image(struct tu_cmd_buffer *cmd,
1507                struct tu_image *src_image,
1508                struct tu_image *dst_image,
1509                const VkImageBlit2 *info,
1510                VkFilter filter)
1511 {
1512    const struct blit_ops *ops = &r2d_ops;
1513    struct tu_cs *cs = &cmd->cs;
1514    bool z_scale = false;
1515    uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
1516 
1517    /* The 2D blit path can't mirror from coordinates alone; encode it as a rotation */
1518    static const enum a6xx_rotation rotate[2][2] = {
1519       {ROTATE_0, ROTATE_HFLIP},
1520       {ROTATE_VFLIP, ROTATE_180},
1521    };
1522 
1523    bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1524                    (info->dstOffsets[1].x < info->dstOffsets[0].x);
1525    bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1526                    (info->dstOffsets[1].y < info->dstOffsets[0].y);
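
   /* Example: a blit with srcOffsets x = {100, 0} (reversed) and dstOffsets
    * x = {0, 100} (in order) gives mirror_x = (true != false) = true, so the
    * rotation below becomes ROTATE_HFLIP (or ROTATE_180 if y is also
    * mirrored).
    */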
1527 
1528    int32_t src0_z = info->srcOffsets[0].z;
1529    int32_t src1_z = info->srcOffsets[1].z;
1530 
1531    if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
1532         info->dstOffsets[1].z - info->dstOffsets[0].z) ||
1533        info->srcOffsets[1].z < info->srcOffsets[0].z) {
1534       z_scale = true;
1535    }
1536 
1537    if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
1538       layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
1539       src0_z = info->srcOffsets[1].z;
1540       src1_z = info->srcOffsets[0].z;
1541    }
1542 
1543    if (info->dstSubresource.layerCount > 1) {
1544       assert(layers <= 1);
1545       layers = info->dstSubresource.layerCount;
1546    }
1547 
1548    /* BC1_RGB_* formats need to have their last component overridden with 1
1549     * when sampling, which is normally handled with the texture descriptor
1550     * swizzle. The 2d path can't handle that, so use the 3d path.
1551     *
1552     * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1553     * the 2d path.
1554     */
1555 
1556    unsigned blit_param = rotate[mirror_y][mirror_x];
1557    if (dst_image->layout[0].nr_samples > 1 ||
1558        src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1559        src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1560        filter == VK_FILTER_CUBIC_EXT ||
1561        z_scale) {
1562       ops = &r3d_ops;
1563       blit_param = z_scale;
1564    }
1565 
1566    /* use the right format in setup() for D32_S8
1567     * TODO: this probably should use a helper
1568     */
1569    enum pipe_format src_format =
1570       tu6_plane_format(src_image->vk.format,
1571                        tu6_plane_index(src_image->vk.format,
1572                                        info->srcSubresource.aspectMask));
1573    enum pipe_format dst_format =
1574       tu6_plane_format(dst_image->vk.format,
1575                        tu6_plane_index(dst_image->vk.format,
1576                                        info->dstSubresource.aspectMask));
1577    trace_start_blit(&cmd->trace, cs);
1578 
1579    ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
1580               blit_param, false, dst_image->layout[0].ubwc,
1581               dst_image->layout[0].nr_samples);
1582 
1583    if (ops == &r3d_ops) {
1584       r3d_coords_raw(cs, (float[]) {
1585          info->dstOffsets[0].x, info->dstOffsets[0].y,
1586          info->srcOffsets[0].x, info->srcOffsets[0].y,
1587          info->dstOffsets[1].x, info->dstOffsets[1].y,
1588          info->srcOffsets[1].x, info->srcOffsets[1].y
1589       });
1590    } else {
1591       tu_cs_emit_regs(cs,
1592          A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1593                              .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1594          A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1595                              .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1596       tu_cs_emit_regs(cs,
1597          A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1598          A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1599          A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1600          A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1601    }
1602 
1603    struct fdl6_view dst, src;
1604    tu_image_view_blit(&dst, dst_image, &info->dstSubresource,
1605                       MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
1606 
1607    if (z_scale) {
1608       tu_image_view_copy_blit(&src, src_image, src_format,
1609                               &info->srcSubresource, 0, true);
1610       ops->src(cmd, cs, &src, 0, filter, dst_format);
1611    } else {
1612       tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1613    }
1614 
1615    for (uint32_t i = 0; i < layers; i++) {
1616       if (z_scale) {
1617          float t = ((float) i + 0.5f) / (float) layers;
1618          r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z);
1619       } else {
1620          ops->src(cmd, cs, &src, i, filter, dst_format);
1621       }
1622       ops->dst(cs, &dst, i, src_format);
1623       ops->run(cmd, cs);
1624    }
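
   /* In the z-scaled path above, the source z coordinate is sampled at layer
    * centers: e.g. with layers = 4, src0_z = 0, src1_z = 8, the loop samples
    * z = 1.0, 3.0, 5.0, 7.0 (t = 0.125, 0.375, 0.625, 0.875).
    */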
1625 
1626    ops->teardown(cmd, cs);
1627 
1628    trace_end_blit(&cmd->trace, cs,
1629                   ops == &r3d_ops,
1630                   src_image->vk.format,
1631                   dst_image->vk.format,
1632                   layers);
1633 }
1634 
1635 VKAPI_ATTR void VKAPI_CALL
tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, const VkBlitImageInfo2* pBlitImageInfo)1636 tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
1637                     const VkBlitImageInfo2* pBlitImageInfo)
1638 
1639 {
1640    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1641    TU_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
1642    TU_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
1643 
1644    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
1645       /* can't blit both depth and stencil at once with D32_S8
1646        * TODO: more advanced 3D blit path to support it instead?
1647        */
1648       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1649           dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1650          VkImageBlit2 region = pBlitImageInfo->pRegions[i];
1651          u_foreach_bit(b, region.dstSubresource.aspectMask) {
1652             region.srcSubresource.aspectMask = BIT(b);
1653             region.dstSubresource.aspectMask = BIT(b);
1654             tu6_blit_image(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
1655          }
1656          continue;
1657       }
1658       tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
1659                      pBlitImageInfo->filter);
1660    }
1661 
1662    if (dst_image->lrz_height) {
1663       tu_disable_lrz(cmd, &cmd->cs, dst_image);
1664    }
1665 }
1666 
1667 static void
copy_compressed(VkFormat format, VkOffset3D *offset, VkExtent3D *extent, uint32_t *width, uint32_t *height)1668 copy_compressed(VkFormat format,
1669                 VkOffset3D *offset,
1670                 VkExtent3D *extent,
1671                 uint32_t *width,
1672                 uint32_t *height)
1673 {
1674    if (!vk_format_is_compressed(format))
1675       return;
1676 
1677    uint32_t block_width = vk_format_get_blockwidth(format);
1678    uint32_t block_height = vk_format_get_blockheight(format);
1679 
1680    offset->x /= block_width;
1681    offset->y /= block_height;
1682 
1683    if (extent) {
1684       extent->width = DIV_ROUND_UP(extent->width, block_width);
1685       extent->height = DIV_ROUND_UP(extent->height, block_height);
1686    }
1687    if (width)
1688       *width = DIV_ROUND_UP(*width, block_width);
1689    if (height)
1690       *height = DIV_ROUND_UP(*height, block_height);
1691 }
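
/* Worked example: for a BC1 image (4x4 texel blocks), offset (8, 4) maps to
 * block coordinates (2, 1), and a 10x10 texel extent rounds up to 3x3 blocks
 * via DIV_ROUND_UP, since partially covered blocks still have to be copied.
 */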
1692 
1693 static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, struct tu_buffer *src_buffer, struct tu_image *dst_image, const VkBufferImageCopy2 *info)1694 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1695                         struct tu_buffer *src_buffer,
1696                         struct tu_image *dst_image,
1697                         const VkBufferImageCopy2 *info)
1698 {
1699    struct tu_cs *cs = &cmd->cs;
1700    uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1701    enum pipe_format src_format =
1702       copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
1703    enum pipe_format dst_format =
1704       copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
1705    const struct blit_ops *ops = &r2d_ops;
1706 
1707    /* special case for buffer to stencil */
1708    if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
1709        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1710       src_format = PIPE_FORMAT_S8_UINT;
1711    }
1712 
1713    /* note: could use "R8_UNORM" when no UBWC */
1714    if (src_format == PIPE_FORMAT_Y8_UNORM)
1715       ops = &r3d_ops;
1716 
1717    VkOffset3D offset = info->imageOffset;
1718    VkExtent3D extent = info->imageExtent;
1719    uint32_t src_width = info->bufferRowLength ?: extent.width;
1720    uint32_t src_height = info->bufferImageHeight ?: extent.height;
1721 
1722    copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
1723 
1724    uint32_t pitch = src_width * util_format_get_blocksize(src_format);
1725    uint32_t layer_size = src_height * pitch;
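
   /* Example of the arithmetic above (hypothetical values): copying into a
    * 100x50 R8G8B8A8 image with bufferRowLength = 128 gives
    * pitch = 128 * 4 = 512 bytes and layer_size = 50 * 512 = 25600 bytes.
    */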
1726 
1727    ops->setup(cmd, cs, src_format, dst_format,
1728               info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc,
1729               dst_image->layout[0].nr_samples);
1730 
1731    struct fdl6_view dst;
1732    tu_image_view_copy(&dst, dst_image, dst_format, &info->imageSubresource, offset.z);
1733 
1734    for (uint32_t i = 0; i < layers; i++) {
1735       ops->dst(cs, &dst, i, src_format);
1736 
1737       uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
1738       if ((src_va & 63) || (pitch & 63)) {
1739          for (uint32_t y = 0; y < extent.height; y++) {
1740             uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
1741             ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1742                             x + extent.width, 1, dst_format);
1743             ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y},  &(VkOffset2D){x},
1744                         &(VkExtent2D) {extent.width, 1});
1745             ops->run(cmd, cs);
1746             src_va += pitch;
1747          }
1748       } else {
1749          ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height, dst_format);
1750          coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1751          ops->run(cmd, cs);
1752       }
1753    }
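
   /* A note on the per-row fallback above: the 2D blit source apparently
    * can't handle a base address or pitch that isn't 64-byte aligned, so a
    * misaligned src_va is rounded down to a 64-byte boundary and the blit is
    * offset by the remainder in texels instead. E.g. with 4-byte texels and
    * src_va % 64 == 16: base = src_va & ~63 and x = 16 / 4 = 4.
    */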
1754 
1755    ops->teardown(cmd, cs);
1756 }
1757 
1758 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)1759 tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
1760                             const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
1761 {
1762    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1763    TU_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
1764    TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
1765 
1766    for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
1767       tu_copy_buffer_to_image(cmd, src_buffer, dst_image,
1768                               pCopyBufferToImageInfo->pRegions + i);
1769 
1770    if (dst_image->lrz_height) {
1771       tu_disable_lrz(cmd, &cmd->cs, dst_image);
1772    }
1773 }
1774 
1775 static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, struct tu_image *src_image, struct tu_buffer *dst_buffer, const VkBufferImageCopy2 *info)1776 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1777                         struct tu_image *src_image,
1778                         struct tu_buffer *dst_buffer,
1779                         const VkBufferImageCopy2 *info)
1780 {
1781    struct tu_cs *cs = &cmd->cs;
1782    uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1783    enum pipe_format dst_format =
1784       copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
1785    enum pipe_format src_format =
1786       copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
1787    const struct blit_ops *ops = &r2d_ops;
1788 
1789    if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
1790        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1791       dst_format = PIPE_FORMAT_S8_UINT;
1792    }
1793 
1794    /* note: could use "R8_UNORM" when no UBWC */
1795    if (dst_format == PIPE_FORMAT_Y8_UNORM)
1796       ops = &r3d_ops;
1797 
1798    VkOffset3D offset = info->imageOffset;
1799    VkExtent3D extent = info->imageExtent;
1800    uint32_t dst_width = info->bufferRowLength ?: extent.width;
1801    uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1802 
1803    copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
1804 
1805    uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
1806    uint32_t layer_size = pitch * dst_height;
1807 
1808    ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1809               VK_SAMPLE_COUNT_1_BIT);
1810 
1811    struct fdl6_view src;
1812    tu_image_view_copy(&src, src_image, src_format, &info->imageSubresource, offset.z);
1813 
1814    for (uint32_t i = 0; i < layers; i++) {
1815       ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
1816 
1817       uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
1818       if ((dst_va & 63) || (pitch & 63)) {
1819          for (uint32_t y = 0; y < extent.height; y++) {
1820             uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
1821             ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
1822             ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1823                         &(VkExtent2D) {extent.width, 1});
1824             ops->run(cmd, cs);
1825             dst_va += pitch;
1826          }
1827       } else {
1828          ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
1829          coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1830          ops->run(cmd, cs);
1831       }
1832    }
1833 
1834    ops->teardown(cmd, cs);
1835 }
1836 
1837 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)1838 tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
1839                             const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)
1840 {
1841    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1842    TU_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
1843    TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
1844 
1845    for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
1846       tu_copy_image_to_buffer(cmd, src_image, dst_buffer,
1847                               pCopyImageToBufferInfo->pRegions + i);
1848 }
1849 
1850 /* Tiled formats don't support swapping, which means that we can't support
1851  * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1852  * formats like B5G5R5A1 have a separate linear-only format when sampling.
1853  * Currently we fake support for tiled swapped formats and use the unswapped
1854  * format instead, but this means that reinterpreting copies to and from
1855  * swapped formats can't be performed correctly unless we can swizzle the
1856  * components by reinterpreting the other image as the "correct" swapped
1857  * format, i.e. only when the other image is linear.
1858  */
1859 
1860 static bool
is_swapped_format(enum pipe_format format)1861 is_swapped_format(enum pipe_format format)
1862 {
1863    struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1864    struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1865    return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1866 }
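
/* For example, B8G8R8A8_UNORM is expected to report true here: its linear
 * descriptor carries the non-WZYX component swap mentioned above, which the
 * tiled descriptor can't express, so the two descriptors differ. (Just an
 * example, not an exhaustive list.)
 */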
1867 
1868 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1869  * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1870  * versa). This should mirror the logic in fdl6_layout.
1871  */
1872 static bool
image_is_r8g8(struct tu_image *image)1873 image_is_r8g8(struct tu_image *image)
1874 {
1875    return image->layout[0].cpp == 2 &&
1876       vk_format_get_nr_components(image->vk.format) == 2;
1877 }
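
/* E.g. an R8G8_UNORM image (cpp == 2, two components) is treated as R8G8
 * here, while R16_UNORM (cpp == 2, one component) is not, so the two can't
 * be reinterpreted as each other even though their texel sizes match.
 */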
1878 
1879 static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd, struct tu_image *src_image, struct tu_image *dst_image, const VkImageCopy2 *info)1880 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1881                        struct tu_image *src_image,
1882                        struct tu_image *dst_image,
1883                        const VkImageCopy2 *info)
1884 {
1885    const struct blit_ops *ops = &r2d_ops;
1886    struct tu_cs *cs = &cmd->cs;
1887 
1888    if (dst_image->layout[0].nr_samples > 1)
1889       ops = &r3d_ops;
1890 
1891    enum pipe_format format = PIPE_FORMAT_NONE;
1892    VkOffset3D src_offset = info->srcOffset;
1893    VkOffset3D dst_offset = info->dstOffset;
1894    VkExtent3D extent = info->extent;
1895    uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount);
1896 
1897    /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1898     * Images":
1899     *
1900     *    When copying between compressed and uncompressed formats the extent
1901     *    members represent the texel dimensions of the source image and not
1902     *    the destination. When copying from a compressed image to an
1903     *    uncompressed image the image texel dimensions written to the
1904     *    uncompressed image will be source extent divided by the compressed
1905     *    texel block dimensions. When copying from an uncompressed image to a
1906     *    compressed image the image texel dimensions written to the compressed
1907     *    image will be the source extent multiplied by the compressed texel
1908     *    block dimensions.
1909     *
1910     * This means we only have to adjust the extent if the source image is
1911     * compressed.
1912     */
1913    copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
1914    copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
1915 
1916    enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
1917    enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
1918 
1919    /* note: could use "R8_UNORM" when no UBWC */
1920    if (dst_format == PIPE_FORMAT_Y8_UNORM ||
1921        src_format == PIPE_FORMAT_Y8_UNORM)
1922       ops = &r3d_ops;
1923 
1924    bool use_staging_blit = false;
1925 
1926    if (src_format == dst_format) {
1927       /* Images that share a format can always be copied directly because it's
1928        * the same as a blit.
1929        */
1930       format = src_format;
1931    } else if (!src_image->layout[0].tile_mode) {
1932       /* If an image is linear, we can always safely reinterpret it with the
1933        * other image's format and then do a regular blit.
1934        */
1935       format = dst_format;
1936    } else if (!dst_image->layout[0].tile_mode) {
1937       format = src_format;
1938    } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1939       /* We can't currently copy r8g8 images to/from other cpp=2 images,
1940        * due to the different tile layout.
1941        */
1942       use_staging_blit = true;
1943    } else if (is_swapped_format(src_format) ||
1944               is_swapped_format(dst_format)) {
1945       /* If either format has a non-identity swap, then we can't copy
1946        * to/from it.
1947        */
1948       use_staging_blit = true;
1949    } else if (!src_image->layout[0].ubwc) {
1950       format = dst_format;
1951    } else if (!dst_image->layout[0].ubwc) {
1952       format = src_format;
1953    } else {
1954       /* Both formats use UBWC and so neither can be reinterpreted.
1955        * TODO: We could do an in-place decompression of the dst instead.
1956        */
1957       perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
1958       use_staging_blit = true;
1959    }
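
   /* Summary of the decision chain above (derived from the code, not an
    * authoritative table):
    *
    *    same format             -> direct copy
    *    either image linear     -> reinterpret with the other image's format
    *    r8g8 vs non-r8g8        -> staging blit (tile layout mismatch)
    *    either format swapped   -> staging blit (no tiled swap support)
    *    either image non-UBWC   -> reinterpret with the other image's format
    *    both UBWC               -> staging blit
    */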
1960 
1961    struct fdl6_view dst, src;
1962 
1963    if (use_staging_blit) {
1964       tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
1965       tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
1966 
1967       struct fdl_layout staging_layout = { 0 };
1968       VkOffset3D staging_offset = { 0 };
1969 
1970       staging_layout.tile_mode = TILE6_LINEAR;
1971       staging_layout.ubwc = false;
1972 
1973       fdl6_layout(&staging_layout,
1974                   src_format,
1975                   src_image->layout[0].nr_samples,
1976                   extent.width,
1977                   extent.height,
1978                   extent.depth,
1979                   1,
1980                   info->srcSubresource.layerCount,
1981                   extent.depth > 1,
1982                   NULL);
1983 
1984       struct tu_bo *staging_bo;
1985       VkResult result = tu_get_scratch_bo(cmd->device,
1986                                           staging_layout.size,
1987                                           &staging_bo);
1988       if (result != VK_SUCCESS) {
1989          cmd->record_result = result;
1990          return;
1991       }
1992 
1993       struct fdl6_view staging;
1994       const struct fdl_layout *staging_layout_ptr = &staging_layout;
1995       fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) {
1996          .iova = staging_bo->iova,
1997          .base_array_layer = 0,
1998          .layer_count = 1,
1999          .base_miplevel = 0,
2000          .level_count = info->srcSubresource.layerCount,
2001          .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2002          .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2003          .type = FDL_VIEW_TYPE_2D,
2004       }, false);
2005 
2006       ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2007                  dst_image->layout[0].nr_samples);
2008       coords(ops, cs, &staging_offset, &src_offset, &extent);
2009 
2010       for (uint32_t i = 0; i < layers_to_copy; i++) {
2011          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2012          ops->dst(cs, &staging, i, src_format);
2013          ops->run(cmd, cs);
2014       }
2015 
2016       /* If the user performed this copy there would be a pipeline barrier
2017        * here; since we're doing it internally, we have to flush ourselves.
2018        */
2019       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2020       tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2021       tu_cs_emit_wfi(cs);
2022 
2023       fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) {
2024          .iova = staging_bo->iova,
2025          .base_array_layer = 0,
2026          .layer_count = 1,
2027          .base_miplevel = 0,
2028          .level_count = info->srcSubresource.layerCount,
2029          .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2030          .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2031          .type = FDL_VIEW_TYPE_2D,
2032       }, false);
2033 
2034       ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2035                  0, false, dst_image->layout[0].ubwc,
2036                  dst_image->layout[0].nr_samples);
2037       coords(ops, cs, &dst_offset, &staging_offset, &extent);
2038 
2039       for (uint32_t i = 0; i < layers_to_copy; i++) {
2040          ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2041          ops->dst(cs, &dst, i, dst_format);
2042          ops->run(cmd, cs);
2043       }
2044    } else {
2045       tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
2046       tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z);
2047 
2048       ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
2049                  0, false, dst_image->layout[0].ubwc,
2050                  dst_image->layout[0].nr_samples);
2051       coords(ops, cs, &dst_offset, &src_offset, &extent);
2052 
2053       for (uint32_t i = 0; i < layers_to_copy; i++) {
2054          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
2055          ops->dst(cs, &dst, i, format);
2056          ops->run(cmd, cs);
2057       }
2058    }
2059 
2060    ops->teardown(cmd, cs);
2061 }
2062 
2063 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, const VkCopyImageInfo2* pCopyImageInfo)2064 tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
2065                     const VkCopyImageInfo2* pCopyImageInfo)
2066 {
2067    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2068    TU_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
2069    TU_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
2070 
2071    for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
2072       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2073          VkImageCopy2 info = pCopyImageInfo->pRegions[i];
2074          u_foreach_bit(b, info.dstSubresource.aspectMask) {
2075             info.srcSubresource.aspectMask = BIT(b);
2076             info.dstSubresource.aspectMask = BIT(b);
2077             tu_copy_image_to_image(cmd, src_image, dst_image, &info);
2078          }
2079          continue;
2080       }
2081 
2082       tu_copy_image_to_image(cmd, src_image, dst_image,
2083                              pCopyImageInfo->pRegions + i);
2084    }
2085 
2086    if (dst_image->lrz_height) {
2087       tu_disable_lrz(cmd, &cmd->cs, dst_image);
2088    }
2089 }
2090 
2091 static void
copy_buffer(struct tu_cmd_buffer *cmd, uint64_t dst_va, uint64_t src_va, uint64_t size, uint32_t block_size)2092 copy_buffer(struct tu_cmd_buffer *cmd,
2093             uint64_t dst_va,
2094             uint64_t src_va,
2095             uint64_t size,
2096             uint32_t block_size)
2097 {
2098    const struct blit_ops *ops = &r2d_ops;
2099    struct tu_cs *cs = &cmd->cs;
2100    enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
2101    uint64_t blocks = size / block_size;
2102 
2103    ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2104               VK_SAMPLE_COUNT_1_BIT);
2105 
2106    while (blocks) {
2107       uint32_t src_x = (src_va & 63) / block_size;
2108       uint32_t dst_x = (dst_va & 63) / block_size;
2109       uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
2110 
2111       ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
2112       ops->dst_buffer(     cs, format, dst_va & ~63, 0, format);
2113       ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
2114       ops->run(cmd, cs);
2115 
2116       src_va += width * block_size;
2117       dst_va += width * block_size;
2118       blocks -= width;
2119    }
2120 
2121    ops->teardown(cmd, cs);
2122 }
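
/* Sketch of the chunking above: each iteration blits a single row of at most
 * 0x4000 (16384) texels, shrunk further so that neither the source nor the
 * destination row starts past that limit once the sub-64-byte offset is
 * folded into the x coordinate (src_x/dst_x). E.g. copying 100 bytes with
 * block_size = 1 and src_va % 64 == 3 uses src_x = 3 and finishes in a
 * single 100-texel pass.
 */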
2123 
2124 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCopyBufferInfo)2125 tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
2126                      const VkCopyBufferInfo2 *pCopyBufferInfo)
2127 {
2128    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2129    TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
2130    TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
2131 
2132    for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
2133       const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
2134       copy_buffer(cmd,
2135                   dst_buffer->iova + region->dstOffset,
2136                   src_buffer->iova + region->srcOffset,
2137                   region->size, 1);
2138    }
2139 }
2140 
2141 VKAPI_ATTR void VKAPI_CALL
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize dataSize, const void *pData)2142 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
2143                    VkBuffer dstBuffer,
2144                    VkDeviceSize dstOffset,
2145                    VkDeviceSize dataSize,
2146                    const void *pData)
2147 {
2148    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2149    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2150 
2151    struct tu_cs_memory tmp;
2152    VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
2153    if (result != VK_SUCCESS) {
2154       cmd->record_result = result;
2155       return;
2156    }
2157 
2158    memcpy(tmp.map, pData, dataSize);
2159    copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4);
2160 }
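
/* Note on the allocation above: the tu_cs_alloc call requests
 * DIV_ROUND_UP(dataSize, 64) blocks of 64 / 4 = 16 dwords each, i.e. the
 * scratch space is rounded up to whole 64-byte chunks (dataSize = 100 would
 * allocate 128 bytes), which presumably also keeps tmp.iova 64-byte aligned
 * for copy_buffer's alignment handling.
 */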
2161 
2162 VKAPI_ATTR void VKAPI_CALL
tu_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize fillSize, uint32_t data)2163 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
2164                  VkBuffer dstBuffer,
2165                  VkDeviceSize dstOffset,
2166                  VkDeviceSize fillSize,
2167                  uint32_t data)
2168 {
2169    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2170    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2171    const struct blit_ops *ops = &r2d_ops;
2172    struct tu_cs *cs = &cmd->cs;
2173 
2174    if (fillSize == VK_WHOLE_SIZE)
2175       fillSize = buffer->size - dstOffset;
2176 
2177    uint64_t dst_va = buffer->iova + dstOffset;
2178    uint32_t blocks = fillSize / 4;
2179 
2180    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2181               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2182               VK_SAMPLE_COUNT_1_BIT);
2183    ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
2184 
2185    while (blocks) {
2186       uint32_t dst_x = (dst_va & 63) / 4;
2187       uint32_t width = MIN2(blocks, 0x4000 - dst_x);
2188 
2189       ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
2190       ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
2191       ops->run(cmd, cs);
2192 
2193       dst_va += width * 4;
2194       blocks -= width;
2195    }
2196 
2197    ops->teardown(cmd, cs);
2198 }
2199 
2200 VKAPI_ATTR void VKAPI_CALL
tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, const VkResolveImageInfo2* pResolveImageInfo)2201 tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
2202                        const VkResolveImageInfo2* pResolveImageInfo)
2203 {
2204    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2205    TU_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
2206    TU_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
2207    const struct blit_ops *ops = &r2d_ops;
2208    struct tu_cs *cs = &cmd->cs;
2209 
2210    enum pipe_format src_format =
2211       tu_vk_format_to_pipe_format(src_image->vk.format);
2212    enum pipe_format dst_format =
2213       tu_vk_format_to_pipe_format(dst_image->vk.format);
2214    ops->setup(cmd, cs, src_format, dst_format,
2215               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
2216               VK_SAMPLE_COUNT_1_BIT);
2217 
2218    for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
2219       const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
2220       uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
2221 
2222       assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
2223       /* TODO: aspect masks possible? */
2224 
2225       coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
2226 
2227       struct fdl6_view dst, src;
2228       tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
2229       tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
2230 
2231       for (uint32_t i = 0; i < layers; i++) {
2232          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2233          ops->dst(cs, &dst, i, src_format);
2234          ops->run(cmd, cs);
2235       }
2236    }
2237 
2238    ops->teardown(cmd, cs);
2239 }
2240 
2241 #define for_each_layer(layer, layer_mask, layers) \
2242    for (uint32_t layer = 0; \
2243         layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
2244         layer++) \
2245       if (!layer_mask || (layer_mask & BIT(layer)))
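
/* Example expansions of the macro above (hypothetical masks): with
 * layer_mask = 0x5 (0b101) the loop body runs for layers 0 and 2 only; with
 * layer_mask = 0 it runs for every layer in [0, layers).
 */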
2246 
2247 static void
resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_src_format, VkFormat vk_dst_format, const struct tu_image_view *src, const struct tu_image_view *dst, uint32_t layer_mask, uint32_t layers, const VkRect2D *rect, bool src_separate_ds, bool dst_separate_ds)2248 resolve_sysmem(struct tu_cmd_buffer *cmd,
2249                struct tu_cs *cs,
2250                VkFormat vk_src_format,
2251                VkFormat vk_dst_format,
2252                const struct tu_image_view *src,
2253                const struct tu_image_view *dst,
2254                uint32_t layer_mask,
2255                uint32_t layers,
2256                const VkRect2D *rect,
2257                bool src_separate_ds,
2258                bool dst_separate_ds)
2259 {
2260    const struct blit_ops *ops = &r2d_ops;
2261 
2262    trace_start_sysmem_resolve(&cmd->trace, cs);
2263 
2264    enum pipe_format src_format = tu_vk_format_to_pipe_format(vk_src_format);
2265    enum pipe_format dst_format = tu_vk_format_to_pipe_format(vk_dst_format);
2266 
2267    ops->setup(cmd, cs, src_format, dst_format,
2268               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
2269               VK_SAMPLE_COUNT_1_BIT);
2270    ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
2271 
2272    for_each_layer(i, layer_mask, layers) {
2273       if (src_separate_ds) {
2274          if (vk_src_format == VK_FORMAT_D32_SFLOAT) {
2275             r2d_src_depth(cmd, cs, src, i, VK_FILTER_NEAREST);
2276          } else {
2277             r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
2278          }
2279       } else {
2280          ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
2281       }
2282 
2283       if (dst_separate_ds) {
2284          if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
2285             ops->dst_depth(cs, dst, i);
2286          } else {
2287             ops->dst_stencil(cs, dst, i);
2288          }
2289       } else {
2290          ops->dst(cs, &dst->view, i, src_format);
2291       }
2292 
2293       ops->run(cmd, cs);
2294    }
2295 
2296    ops->teardown(cmd, cs);
2297 
2298    trace_end_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
2299 }
2300 
2301 void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *src, const struct tu_image_view *dst, uint32_t layer_mask, uint32_t layers, const VkRect2D *rect)2302 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
2303                   struct tu_cs *cs,
2304                   const struct tu_image_view *src,
2305                   const struct tu_image_view *dst,
2306                   uint32_t layer_mask,
2307                   uint32_t layers,
2308                   const VkRect2D *rect)
2309 {
2310    assert(src->image->vk.format == dst->image->vk.format ||
2311           (vk_format_is_depth_or_stencil(src->image->vk.format) &&
2312            vk_format_is_depth_or_stencil(dst->image->vk.format)));
2313 
2314    bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
2315    bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
2316 
2317    if (dst_separate_ds) {
2318       resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
2319                      src, dst, layer_mask, layers, rect,
2320                      src_separate_ds, dst_separate_ds);
2321       resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
2322                      src, dst, layer_mask, layers, rect,
2323                      src_separate_ds, dst_separate_ds);
2324    } else {
2325       resolve_sysmem(cmd, cs, src->image->vk.format, dst->image->vk.format,
2326                      src, dst, layer_mask, layers, rect,
2327                      src_separate_ds, dst_separate_ds);
2328    }
2329 }
2330 
2331 static void
clear_image(struct tu_cmd_buffer *cmd, struct tu_image *image, const VkClearValue *clear_value, const VkImageSubresourceRange *range, VkImageAspectFlags aspect_mask)2332 clear_image(struct tu_cmd_buffer *cmd,
2333             struct tu_image *image,
2334             const VkClearValue *clear_value,
2335             const VkImageSubresourceRange *range,
2336             VkImageAspectFlags aspect_mask)
2337 {
2338    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
2339    uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
2340    struct tu_cs *cs = &cmd->cs;
2341    enum pipe_format format;
2342    if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
2343       format = PIPE_FORMAT_R32_UINT;
2344    } else {
2345       format = tu6_plane_format(image->vk.format,
2346                                 tu6_plane_index(image->vk.format,
2347                                                 aspect_mask));
2348    }
2349 
2350    if (image->layout[0].depth0 > 1) {
2351       assert(layer_count == 1);
2352       assert(range->baseArrayLayer == 0);
2353    }
2354 
2355    const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;
2356 
2357    ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
2358               image->layout[0].nr_samples);
2359    if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
2360       ops->clear_value(cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
2361    else
2362       ops->clear_value(cs, format, clear_value);
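
   /* The asymmetry above is deliberate: the destination view is R32_UINT
    * (presumably because rgb9e5 isn't directly renderable here), while the
    * clear value is packed as R9G9B9E5_FLOAT, so the raw 32-bit pattern
    * written out is the correctly encoded rgb9e5 texel.
    */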
2363 
2364    for (unsigned j = 0; j < level_count; j++) {
2365       if (image->layout[0].depth0 > 1)
2366          layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
2367 
2368       ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
2369                      u_minify(image->layout[0].width0, range->baseMipLevel + j),
2370                      u_minify(image->layout[0].height0, range->baseMipLevel + j)
2371                   });
2372 
2373       struct fdl6_view dst;
2374       tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
2375          .aspectMask = aspect_mask,
2376          .mipLevel = range->baseMipLevel + j,
2377          .baseArrayLayer = range->baseArrayLayer,
2378          .layerCount = 1,
2379       }, 0, false);
2380 
2381       for (uint32_t i = 0; i < layer_count; i++) {
2382          ops->dst(cs, &dst, i, format);
2383          ops->run(cmd, cs);
2384       }
2385    }
2386 
2387    ops->teardown(cmd, cs);
2388 }
2389 
2390 VKAPI_ATTR void VKAPI_CALL
tu_CmdClearColorImage(VkCommandBuffer commandBuffer, VkImage image_h, VkImageLayout imageLayout, const VkClearColorValue *pColor, uint32_t rangeCount, const VkImageSubresourceRange *pRanges)2391 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
2392                       VkImage image_h,
2393                       VkImageLayout imageLayout,
2394                       const VkClearColorValue *pColor,
2395                       uint32_t rangeCount,
2396                       const VkImageSubresourceRange *pRanges)
2397 {
2398    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2399    TU_FROM_HANDLE(tu_image, image, image_h);
2400 
2401    for (unsigned i = 0; i < rangeCount; i++)
2402       clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
2403 }
2404 
2405 VKAPI_ATTR void VKAPI_CALL
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, VkImage image_h, VkImageLayout imageLayout, const VkClearDepthStencilValue *pDepthStencil, uint32_t rangeCount, const VkImageSubresourceRange *pRanges)2406 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
2407                              VkImage image_h,
2408                              VkImageLayout imageLayout,
2409                              const VkClearDepthStencilValue *pDepthStencil,
2410                              uint32_t rangeCount,
2411                              const VkImageSubresourceRange *pRanges)
2412 {
2413    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2414    TU_FROM_HANDLE(tu_image, image, image_h);
2415 
2416    for (unsigned i = 0; i < rangeCount; i++) {
2417       const VkImageSubresourceRange *range = &pRanges[i];
2418 
2419       if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2420          /* can't clear both depth and stencil at once, split up the aspect mask */
2421          u_foreach_bit(b, range->aspectMask)
2422             clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
2423          continue;
2424       }
2425 
2426       clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
2427    }
2428 
2429    tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges);
2430 }
2431 
2432 static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, uint32_t attachment_count, const VkClearAttachment *attachments, uint32_t rect_count, const VkClearRect *rects)2433 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2434                             uint32_t attachment_count,
2435                             const VkClearAttachment *attachments,
2436                             uint32_t rect_count,
2437                             const VkClearRect *rects)
2438 {
2439    /* The shader path here is special: it avoids changing MRT/etc. state. */
2440    const struct tu_subpass *subpass = cmd->state.subpass;
2441    const uint32_t mrt_count = subpass->color_count;
2442    struct tu_cs *cs = &cmd->draw_cs;
2443    uint32_t clear_value[MAX_RTS][4];
2444    float z_clear_val = 0.0f;
2445    uint8_t s_clear_val = 0;
2446    uint32_t clear_rts = 0, clear_components = 0;
2447    bool z_clear = false;
2448    bool s_clear = false;
2449 
2450    trace_start_sysmem_clear_all(&cmd->trace, cs);
2451 
2452    for (uint32_t i = 0; i < attachment_count; i++) {
2453       uint32_t a;
2454       if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2455          uint32_t c = attachments[i].colorAttachment;
2456          a = subpass->color_attachments[c].attachment;
2457          if (a == VK_ATTACHMENT_UNUSED)
2458             continue;
2459 
2460          clear_rts |= 1 << c;
2461          clear_components |= 0xf << (c * 4);
2462          memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2463       } else {
2464          a = subpass->depth_stencil_attachment.attachment;
2465          if (a == VK_ATTACHMENT_UNUSED)
2466             continue;
2467 
2468          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2469             z_clear = true;
2470             z_clear_val = attachments[i].clearValue.depthStencil.depth;
2471          }
2472 
2473          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2474             s_clear = true;
2475             s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2476          }
2477       }
2478    }
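
   /* Example of the masks built above: clearing color attachment c = 2 sets
    * bit 2 (0x4) in clear_rts and 0xf << 8 = 0xf00 in clear_components
    * (four component-enable bits per MRT).
    */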
2479 
2480    /* We may not know the multisample count if there are no attachments, so
2481     * just bail early to avoid corner cases later.
2482     */
2483    if (clear_rts == 0 && !z_clear && !s_clear)
2484       return;
2485 
2486    /* Disable all draw states so they don't interfere.
2487     * TODO: use and re-use draw states.
2488     * We have to disable the draw states individually to preserve the
2489     * input attachment states, because a secondary command buffer
2490     * won't be able to restore them.
2491     */
2492    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
2493    for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
2494       if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
2495           i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
2496          continue;
2497       tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
2498                      CP_SET_DRAW_STATE__0_DISABLE);
2499       tu_cs_emit_qw(cs, 0);
2500    }
2501    cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2502 
2503    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2504    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2505                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2506                   0xfc000000);
2507    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2508 
2509    r3d_common(cmd, cs, false, clear_rts, false, cmd->state.subpass->samples);
2510 
2511    /* Disable sample counting so as not to affect occlusion queries. */
2512    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
2513 
2514    if (cmd->state.prim_generated_query_running_before_rp) {
2515       tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
2516    }
2517 
2518    tu_cs_emit_regs(cs,
2519                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2520    tu_cs_emit_regs(cs,
2521                    A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2522 
2523    tu_cs_emit_regs(cs,
2524                    A6XX_RB_FS_OUTPUT_CNTL0(),
2525                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2526 
2527    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2528    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2529    for (uint32_t i = 0; i < mrt_count; i++) {
2530       tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2531             .component_enable = COND(clear_rts & (1 << i), 0xf)));
2532    }
2533 
2534    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
2535    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
2536 
2537    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2538    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2539          .z_test_enable = z_clear,
2540          .z_write_enable = z_clear,
2541          .zfunc = FUNC_ALWAYS));
2542    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2543    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2544          .stencil_enable = s_clear,
2545          .func = FUNC_ALWAYS,
2546          .zpass = STENCIL_REPLACE));
2547    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2548    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2549    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2550 
   unsigned num_rts = util_bitcount(clear_rts);
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   u_foreach_bit(b, clear_rts)
      tu_cs_emit_array(cs, clear_value[b], 4);

   for (uint32_t i = 0; i < rect_count; i++) {
      /* This should be true because of this valid usage for
       * vkCmdClearAttachments:
       *
       *    "If the render pass instance this is recorded in uses multiview,
       *    then baseArrayLayer must be zero and layerCount must be one"
       */
      assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);

      /* a630 doesn't support multiview masks, which means that we can't use
       * the normal multiview path without potentially recompiling a shader
       * on-demand or using a more complicated variant that takes the mask as
       * a const. Just use the layered path instead, since it shouldn't be
       * much worse.
       */
      for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
         r3d_coords_raw(cs, (float[]) {
            rects[i].rect.offset.x, rects[i].rect.offset.y,
            z_clear_val, uif(rects[i].baseArrayLayer + layer),
            rects[i].rect.offset.x + rects[i].rect.extent.width,
            rects[i].rect.offset.y + rects[i].rect.extent.height,
            z_clear_val, 1.0f,
         });
         r3d_run_vis(cmd, cs);
      }
   }

   /* Re-enable sample counting. */
   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));

   if (cmd->state.prim_generated_query_running_before_rp) {
      tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
   }

   trace_end_sysmem_clear_all(&cmd->trace,
                              cs, mrt_count, rect_count);
}

static void
pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
{
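   /* Depth/stencil values have fixed packings in GMEM: */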
   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
                       val->depthStencil.stencil << 24;
      return;
   case PIPE_FORMAT_Z16_UNORM:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
      return;
   case PIPE_FORMAT_Z32_FLOAT:
      clear_value[0] = fui(val->depthStencil.depth);
      return;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      return;
   default:
      break;
   }

   float tmp[4];
   memcpy(tmp, val->color.float32, 4 * sizeof(float));
   if (util_format_is_srgb(format)) {
      for (int i = 0; i < 3; i++)
         tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
   }

#define PACK_F(type) util_format_##type##_pack_rgba_float \
   ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
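   /* e.g. PACK_F(r5g6b5_unorm) expands to
    * util_format_r5g6b5_unorm_pack_rgba_float((uint8_t *)&clear_value[0], 0, tmp, 0, 1, 1),
    * which packs a single rgba float texel into the bytes at clear_value.
    */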
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4:
      PACK_F(r4g4b4a4_unorm);
      break;
   case 5:
      if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
         PACK_F(r5g6b5_unorm);
      else
         PACK_F(r5g5b5a1_unorm);
      break;
   case 8:
      if (util_format_is_snorm(format))
         PACK_F(r8g8b8a8_snorm);
      else if (util_format_is_unorm(format))
         PACK_F(r8g8b8a8_unorm);
      else
         pack_int8(clear_value, val->color.uint32);
      break;
   case 10:
      if (util_format_is_pure_integer(format))
         pack_int10_2(clear_value, val->color.uint32);
      else
         PACK_F(r10g10b10a2_unorm);
      break;
   case 11:
      clear_value[0] = float3_to_r11g11b10f(val->color.float32);
      break;
   case 16:
      if (util_format_is_snorm(format))
         PACK_F(r16g16b16a16_snorm);
      else if (util_format_is_unorm(format))
         PACK_F(r16g16b16a16_unorm);
      else if (util_format_is_float(format))
         PACK_F(r16g16b16a16_float);
      else
         pack_int16(clear_value, val->color.uint32);
      break;
   case 32:
      memcpy(clear_value, val->color.float32, 4 * sizeof(float));
      break;
   default:
      unreachable("unexpected channel size");
   }
#undef PACK_F
}

static void
clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
                      enum pipe_format format,
                      uint8_t clear_mask,
                      uint32_t gmem_offset,
                      const VkClearValue *value)
{
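   /* GMEM clears go through the BLIT event: program the destination format,
    * GMEM base and clear value, then fire the event.
    */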
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
   tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
   tu_cs_emit(cs, gmem_offset);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
   tu_cs_emit(cs, 0);

   uint32_t clear_vals[4] = {};
   pack_gmem_clear_value(value, format, clear_vals);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
   tu_cs_emit_array(cs, clear_vals, 4);

   tu6_emit_event_write(cmd, cs, BLIT);
}

static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              uint32_t attachment,
                              VkImageAspectFlags mask,
                              const VkClearValue *value)
{
   const struct tu_render_pass_attachment *att =
      &cmd->state.pass->attachments[attachment];

   trace_start_gmem_clear(&cmd->trace, cs);

   /* D32S8 is stored as two separate planes in GMEM; clear each requested
    * aspect using that plane's format and offset.
    */
   enum pipe_format format = tu_vk_format_to_pipe_format(att->format);
   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
         clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, tu_attachment_gmem_offset(cmd, att), value);
      if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
         clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, tu_attachment_gmem_offset_stencil(cmd, att), value);
   } else {
      clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask),
                            tu_attachment_gmem_offset(cmd, att), value);
   }

   trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
}

static void
tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
                          uint32_t attachment_count,
                          const VkClearAttachment *attachments,
                          uint32_t rect_count,
                          const VkClearRect *rects)
{
   const struct tu_subpass *subpass = cmd->state.subpass;
   struct tu_cs *cs = &cmd->draw_cs;

   if (rect_count > 1)
      perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");

   for (unsigned i = 0; i < rect_count; i++) {
      unsigned x1 = rects[i].rect.offset.x;
      unsigned y1 = rects[i].rect.offset.y;
      unsigned x2 = x1 + rects[i].rect.extent.width - 1;
      unsigned y2 = y1 + rects[i].rect.extent.height - 1;

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));

      for (unsigned j = 0; j < attachment_count; j++) {
         uint32_t a;
         if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
            a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
         else
            a = subpass->depth_stencil_attachment.attachment;

         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
                                       &attachments[j].clearValue);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
                       uint32_t attachmentCount,
                       const VkClearAttachment *pAttachments,
                       uint32_t rectCount,
                       const VkClearRect *pRects)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* The sysmem path behaves like a draw. Note that we don't have a way of
    * using different flushes for sysmem/gmem, so this needs to be outside of
    * the cond_exec.
    */
   tu_emit_cache_flush_renderpass(cmd, cs);

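   /* Clears of the depth aspect disable LRZ for the rest of the renderpass,
    * presumably because a clear emitted as a draw/blit isn't reflected in the
    * LRZ buffer:
    */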
   for (uint32_t j = 0; j < attachmentCount; j++) {
      if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
         continue;

      tu_lrz_disable_during_renderpass(cmd);
   }

   /* vkCmdClearAttachments is supposed to respect the predicate if active. The
    * easiest way to do this is to always use the 3d path, which always works
    * even with GMEM because it's just a simple draw using the existing
    * attachment state.
    *
    * Similarly, we also use the 3D path when in a secondary command buffer that
    * doesn't know the GMEM layout that will be chosen by the primary.
    */
   if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
      return;
   }

   /* If we could skip tile load/stores based on any draws intersecting them at
    * binning time, then emit the clear as a 3D draw so that it contributes to
    * that visibility.
    */
   const struct tu_subpass *subpass = cmd->state.subpass;
   for (uint32_t i = 0; i < attachmentCount; i++) {
      uint32_t a;
      if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
         uint32_t c = pAttachments[i].colorAttachment;
         a = subpass->color_attachments[c].attachment;
      } else {
         a = subpass->depth_stencil_attachment.attachment;
      }
      if (a != VK_ATTACHMENT_UNUSED) {
         const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
         if (att->cond_load_allowed || att->cond_store_allowed) {
            tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
            return;
         }
      }
   }

   /* Otherwise, emit 2D blits for gmem rendering. */
   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
   tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
   tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
   tu_cond_exec_end(cs);
}

static void
clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        VkFormat vk_format,
                        VkImageAspectFlags clear_mask,
                        const VkClearValue *value,
                        uint32_t a,
                        bool separate_ds)
{
   enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
   const struct blit_ops *ops = &r2d_ops;
   if (cmd->state.pass->attachments[a].samples > 1)
      ops = &r3d_ops;

   trace_start_sysmem_clear(&cmd->trace, cs);

   ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
              cmd->state.pass->attachments[a].samples);
   ops->coords(cs, &cmd->state.render_area.offset, NULL,
               &cmd->state.render_area.extent);
   ops->clear_value(cs, format, value);

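   /* Emit one blit per layer/view being cleared: */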
   for_each_layer(i, clear_views, fb->layers) {
      if (separate_ds) {
         if (vk_format == VK_FORMAT_D32_SFLOAT) {
            ops->dst_depth(cs, iview, i);
         } else {
            ops->dst_stencil(cs, iview, i);
         }
      } else {
         ops->dst(cs, &iview->view, i, format);
      }
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_sysmem_clear(&cmd->trace, cs,
                          vk_format, ops == &r3d_ops,
                          cmd->state.pass->attachments[a].samples);
}

void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a,
                           const VkClearValue *value)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 value, a, true);
      }
      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 value, a, true);
      }
   } else {
      clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
                              value, a, false);
   }

   /* The spec doesn't explicitly say, but presumably the initial renderpass
    * clear is considered part of the renderpass, and therefore barriers
    * aren't required inside the subpass/renderpass.  Therefore we need to
    * flush CCU color into CCU depth here, just like with
    * vkCmdClearAttachments(). Note that because this only happens at the
    * beginning of a renderpass, and renderpass writes are considered
    * "incoherent", we shouldn't have to worry about syncing depth into color
    * beforehand as depth should already be flushed.
    */
   if (vk_format_is_depth_or_stencil(attachment->format)) {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
   } else {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
   }

   if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug)
      tu_cs_emit_wfi(cs);
}

void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         const VkClearValue *value)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

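   /* Program the attachment's sample count for the blit event before
    * emitting the clear:
    */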
   tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask, value);
}

static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
             bool resolve,
             bool separate_stencil)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
      .unk0 = !resolve,
      .gmem = !resolve,
      .sample_0 = vk_format_is_int(attachment->format) ||
         vk_format_is_depth_or_stencil(attachment->format)));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         tu_cs_emit(cs, tu_image_view_depth(iview, RB_BLIT_DST_INFO));
         tu_cs_emit_qw(cs, iview->depth_base_addr);
         tu_cs_emit(cs, iview->depth_PITCH);

         tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
         tu_cs_image_flag_ref(cs, &iview->view, 0);
      } else {
         tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
         tu_cs_emit_qw(cs, iview->stencil_base_addr);
         tu_cs_emit(cs, iview->stencil_PITCH);
      }
   } else {
      tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO);
      tu_cs_image_ref_2d(cs, &iview->view, 0, false);

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
      tu_cs_image_flag_ref(cs, &iview->view, 0);
   }

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment)));
   } else {
      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment)));
   }

   tu6_emit_event_write(cmd, cs, BLIT);
}

static bool
blit_can_resolve(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);

   /* The blit event can only do resolves in simple cases: averaging samples
    * as unsigned integers or choosing only one sample.
    */
   if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
      return false;

   /* It can't do formats with larger channel sizes.
    * Note: this includes all float formats.
    * Note2: single channel integer formats seem OK.
    */
   if (desc->channel[0].size > 10)
      return false;

   switch (format) {
   /* For unknown reasons the blit event can't MSAA-resolve these formats when
    * tiled, likely related to these formats having a different layout from
    * other cpp=2 formats.
    */
   case VK_FORMAT_R8G8_UNORM:
   case VK_FORMAT_R8G8_UINT:
   case VK_FORMAT_R8G8_SINT:
   /* TODO: this one should be able to work? */
   case VK_FORMAT_D24_UNORM_S8_UINT:
      return false;
   default:
      break;
   }

   return true;
}

static void
tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs, bool load)
{
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));

   if (!unlikely(cmd->device->physical_device->instance->debug_flags &
                 TU_DEBUG_LOG_SKIP_GMEM_OPS))
      return;

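   /* With TU_DEBUG_LOG_SKIP_GMEM_OPS, also update the "taken" counter. This
    * sits inside the predicate, so it only executes when the load/store does;
    * tu_end_load_store_cond_exec() updates the matching "total" counter
    * unconditionally after the cond_exec.
    */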
   uint64_t result_iova;
   if (load)
      result_iova = global_iova(cmd, dbg_gmem_taken_loads);
   else
      result_iova = global_iova(cmd, dbg_gmem_taken_stores);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
}

static void
tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
                            struct tu_cs *cs, bool load)
{
   tu_cond_exec_end(cs);

   if (!unlikely(cmd->device->physical_device->instance->debug_flags &
                 TU_DEBUG_LOG_SKIP_GMEM_OPS))
      return;

   uint64_t result_iova;
   if (load)
      result_iova = global_iova(cmd, dbg_gmem_total_loads);
   else
      result_iova = global_iova(cmd, dbg_gmem_total_stores);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
}

void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool cond_exec_allowed,
                        bool force_load)
{
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   bool load_common = attachment->load || force_load;
   bool load_stencil =
      attachment->load_stencil ||
      (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);

   if (!load_common && !load_stencil)
      return;

   trace_start_gmem_load(&cmd->trace, cs);

   /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
    * to be only partially cleared, and since the clear is done by a 2D blit
    * that doesn't produce geometry, we have to load unconditionally.
    *
    * To simplify the conditions, treat a partially cleared separate DS as
    * fully cleared and don't emit the cond_exec.
    */
   bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
   if (cond_exec)
      tu_begin_load_store_cond_exec(cmd, cs, true);

   if (load_common)
      tu_emit_blit(cmd, cs, iview, attachment, false, false);

   if (load_stencil)
      tu_emit_blit(cmd, cs, iview, attachment, false, true);

   if (cond_exec)
      tu_end_load_store_cond_exec(cmd, cs, true);

   trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
}

static void
store_cp_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t samples,
              bool separate_stencil,
              enum pipe_format src_format,
              enum pipe_format dst_format,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r2d_setup_common(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
                    iview->view.ubwc_enabled, true);

   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         r2d_dst_depth(cs, iview, 0);
      } else {
         r2d_dst_stencil(cs, iview, 0);
      }
   } else {
      r2d_dst(cs, &iview->view, 0, src_format);
   }

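   /* The 2D engine reads the GMEM contents back as a tiled (TILE6_2) image;
    * pick the matching hw format and apply any src/dst format fixups:
    */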
   enum a6xx_format fmt = tu6_format_texture(src_format, TILE6_2).fmt;
   fixup_src_format(&src_format, dst_format, &fmt);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = fmt,
                      .color_swap = WZYX,
                      .tile_mode = TILE6_2,
                      .srgb = util_format_is_srgb(src_format),
                      .samples = tu_msaa_samples(samples),
                      .samples_average = !util_format_is_pure_integer(dst_format) &&
                                         !util_format_is_depth_or_stencil(dst_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   /* note: src size does not matter when not scaling */
                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
                   A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.tiling->tile0.width * cpp));

   /* sync GMEM writes with CACHE. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}

static void
store_3d_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t dst_samples,
              bool separate_stencil,
              enum pipe_format src_format,
              enum pipe_format dst_format,
              const VkRect2D *render_area,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
    * aren't set until we know whether we're HW binning or not, and we want to
    * avoid a dependence on that here to be able to store attachments before
    * the end of the renderpass in the future. Use the scratch space to
    * save/restore them dynamically.
    */
   tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
   tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
                  CP_REG_TO_SCRATCH_0_SCRATCH(0) |
                  CP_REG_TO_SCRATCH_0_CNT(1 - 1));

   r3d_setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
             iview->view.ubwc_enabled, dst_samples);

   r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         r3d_dst_depth(cs, iview, 0);
      } else {
         r3d_dst_stencil(cs, iview, 0);
      }
   } else {
      r3d_dst(cs, &iview->view, 0, src_format);
   }

   r3d_src_gmem(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);

   /* sync GMEM writes with CACHE. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   r3d_run(cmd, cs);

   r3d_teardown(cmd, cs);

   /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here. The 3d blit path
    * writes to depth images as a color RT, so there's no need to flush depth.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);

   /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));

   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));
}

static bool
tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const VkRect2D *render_area = &cmd->state.render_area;

   /* Unaligned stores are incredibly rare in the CTS, so we have to force
    * them to get test coverage.
    */
   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE))
      return true;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width;
   uint32_t y2 = y1 + render_area->extent.height;
   /* x2/y2 can be unaligned if equal to the size of the image, since it will
    * write into padding space. The one exception is linear levels which don't
    * have the required y padding in the layout (except for the last level)
    */
   bool need_y2_align =
      y2 != iview->view.height || iview->view.need_y2_align;

   return (x1 % phys_dev->info->gmem_align_w ||
           (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
           y1 % phys_dev->info->gmem_align_h ||
           (y2 % phys_dev->info->gmem_align_h && need_y2_align));
}

/* Choose the GMEM layout (use the CCU space or not) based on whether the
 * current attachments will need it.  This has to happen at vkBeginRenderPass()
 * time because tu_attachment_store_unaligned() looks at the image views, which
 * are only available at that point.  This should match the logic for the
 * !unaligned case in tu_store_gmem_attachment().
 */
void
tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
{
   cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;

   for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
      if (!cmd->state.attachments[i])
         continue;

      struct tu_render_pass_attachment *att =
         &cmd->state.pass->attachments[i];
      if ((att->store || att->store_stencil) &&
          tu_attachment_store_unaligned(cmd, i))
         cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
      if (att->will_be_resolved && !blit_can_resolve(att->format))
         cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
   }

   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
}

void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a,
                         bool cond_exec_allowed)
{
   const VkRect2D *render_area = &cmd->state.render_area;
   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
   const struct tu_image_view *iview = cmd->state.attachments[a];
   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];

   if (!dst->store && !dst->store_stencil)
      return;

   /* An unconditional store should happen only if the attachment was cleared,
    * which could have happened either by load_op or via vkCmdClearAttachments.
    */
   bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
   if (cond_exec) {
      tu_begin_load_store_cond_exec(cmd, cs, false);
   }

   bool unaligned = tu_attachment_store_unaligned(cmd, a);

   /* D32_SFLOAT_S8_UINT is a rather special format: it has two planes,
    * one for depth and one for stencil. When resolving a MSAA
    * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
    */
   bool resolve_d32s8_s8 =
      src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   /* The fast path doesn't support picking out the last component of a D24S8
    * texture reinterpreted as RGBA8_UNORM.
    */
   bool resolve_d24s8_s8 =
      src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

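   /* When resolving D32S8 to S8 only the stencil plane is stored, which is
    * handled by the separate-stencil path:
    */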
   bool store_common = dst->store && !resolve_d32s8_s8;
   bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;

   trace_start_gmem_store(&cmd->trace, cs);

   /* use fast path when render area is aligned, except for unsupported resolve cases */
   if (!unaligned && !resolve_d24s8_s8 &&
       (a == gmem_a || blit_can_resolve(dst->format))) {
      if (store_common)
         tu_emit_blit(cmd, cs, iview, src, true, false);
      if (store_separate_stencil)
         tu_emit_blit(cmd, cs, iview, src, true, true);

      if (cond_exec) {
         tu_end_load_store_cond_exec(cmd, cs, false);
      }

      trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
      return;
   }

   assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);

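   /* For separate-stencil D32S8 the planes are stored individually, so the
    * depth plane is read and written as plain Z32_FLOAT:
    */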
   enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format);
   if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      src_format = PIPE_FORMAT_Z32_FLOAT;

   enum pipe_format dst_format = tu_vk_format_to_pipe_format(dst->format);
   if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      dst_format = PIPE_FORMAT_Z32_FLOAT;

   if (dst->samples > 1) {
      /* If we hit this path, we have to disable draw states after every tile
       * instead of once at the end of the renderpass, so that they aren't
       * executed when calling CP_DRAW.
       *
       * TODO: store a flag somewhere so we don't do this more than once and
       * don't do it after the renderpass when this happens.
       */
      if (store_common || store_separate_stencil)
         tu_disable_draw_states(cmd, cs);

      if (store_common) {
         store_3d_blit(cmd, cs, iview, dst->samples, false, src_format,
                       dst_format, render_area, tu_attachment_gmem_offset(cmd, src), src->cpp);
      }
      if (store_separate_stencil) {
         store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
                       PIPE_FORMAT_S8_UINT, render_area,
                       tu_attachment_gmem_offset_stencil(cmd, src), src->samples);
      }
   } else {
      r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

      if (store_common) {
         store_cp_blit(cmd, cs, iview, src->samples, false, src_format,
                       dst_format, tu_attachment_gmem_offset(cmd, src), src->cpp);
      }
      if (store_separate_stencil) {
         store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
                       PIPE_FORMAT_S8_UINT, tu_attachment_gmem_offset_stencil(cmd, src), src->samples);
      }
   }

   if (cond_exec) {
      tu_end_load_store_cond_exec(cmd, cs, false);
   }

   trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
}