1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_clear_blit.h"
10
11 #include "ir3/ir3_nir.h"
12
13 #include "util/format_r11g11b10f.h"
14 #include "util/format_rgb9e5.h"
15 #include "util/format_srgb.h"
16 #include "util/half_float.h"
17 #include "compiler/nir/nir_builder.h"
18
19 #include "tu_cmd_buffer.h"
20 #include "tu_cs.h"
21 #include "tu_formats.h"
22 #include "tu_image.h"
23 #include "tu_tracepoints.h"
24
25 static uint32_t
tu_pack_float32_for_unorm(float val, int bits)26 tu_pack_float32_for_unorm(float val, int bits)
27 {
28 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
29 }
30
31 /* r2d_ = BLIT_OP_SCALE operations */
32
33 static enum a6xx_2d_ifmt
format_to_ifmt(enum pipe_format format)34 format_to_ifmt(enum pipe_format format)
35 {
36 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
37 format == PIPE_FORMAT_Z24X8_UNORM)
38 return R2D_UNORM8;
39
40 /* get_component_bits doesn't work with depth/stencil formats: */
41 if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
42 return R2D_FLOAT32;
43 if (format == PIPE_FORMAT_S8_UINT)
44 return R2D_INT8;
45 if (format == PIPE_FORMAT_A8_UNORM)
46 return R2D_UNORM8;
47
48 /* use the size of the red channel to find the corresponding "ifmt" */
49 bool is_int = util_format_is_pure_integer(format);
50 switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
51 case 4: case 5: case 8:
52 return is_int ? R2D_INT8 : R2D_UNORM8;
53 case 10: case 11:
54 return is_int ? R2D_INT16 : R2D_FLOAT16;
55 case 16:
56 if (util_format_is_float(format))
57 return R2D_FLOAT16;
58 return is_int ? R2D_INT16 : R2D_FLOAT32;
59 case 32:
60 return is_int ? R2D_INT32 : R2D_FLOAT32;
61 default:
62 unreachable("bad format");
63 return 0;
64 }
65 }
66
67 static void
r2d_coords(struct tu_cs *cs, const VkOffset2D *dst, const VkOffset2D *src, const VkExtent2D *extent)68 r2d_coords(struct tu_cs *cs,
69 const VkOffset2D *dst,
70 const VkOffset2D *src,
71 const VkExtent2D *extent)
72 {
73 tu_cs_emit_regs(cs,
74 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
75 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
76
77 if (!src)
78 return;
79
80 tu_cs_emit_regs(cs,
81 A6XX_GRAS_2D_SRC_TL_X(src->x),
82 A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
83 A6XX_GRAS_2D_SRC_TL_Y(src->y),
84 A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
85 }
86
87 static void
r2d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)88 r2d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
89 {
90 uint32_t clear_value[4] = {};
91
92 switch (format) {
93 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
94 case PIPE_FORMAT_Z24X8_UNORM:
95 /* cleared as r8g8b8a8_unorm using special format */
96 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
97 clear_value[1] = clear_value[0] >> 8;
98 clear_value[2] = clear_value[0] >> 16;
99 clear_value[3] = val->depthStencil.stencil;
100 break;
101 case PIPE_FORMAT_Z16_UNORM:
102 case PIPE_FORMAT_Z32_FLOAT:
103 /* R2D_FLOAT32 */
104 clear_value[0] = fui(val->depthStencil.depth);
105 break;
106 case PIPE_FORMAT_S8_UINT:
107 clear_value[0] = val->depthStencil.stencil;
108 break;
109 case PIPE_FORMAT_R9G9B9E5_FLOAT:
110 /* cleared as UINT32 */
111 clear_value[0] = float3_to_rgb9e5(val->color.float32);
112 break;
113 default:
114 assert(!util_format_is_depth_or_stencil(format));
115 const struct util_format_description *desc = util_format_description(format);
116 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
117
118 assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
119 format == PIPE_FORMAT_R11G11B10_FLOAT);
120
121 for (unsigned i = 0; i < desc->nr_channels; i++) {
122 const struct util_format_channel_description *ch = &desc->channel[i];
123 if (ifmt == R2D_UNORM8) {
124 float linear = val->color.float32[i];
125 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
126 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
127
128 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
129 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
130 else
131 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
132 } else if (ifmt == R2D_FLOAT16) {
133 clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
134 } else {
135 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
136 ifmt == R2D_INT16 || ifmt == R2D_INT8);
137 clear_value[i] = val->color.uint32[i];
138 }
139 }
140 break;
141 }
142
143 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
144 tu_cs_emit_array(cs, clear_value, 4);
145 }
146
147 static void
fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format, enum a6xx_format *fmt)148 fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
149 enum a6xx_format *fmt)
150 {
151 /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
152 * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
153 * We also have to move stencil, which is normally in the .w channel, into
154 * the right channel. Reintepreting the S8 texture as A8_UNORM solves both
155 * problems, and avoids using a swap, which seems to sometimes not work
156 * with a D24S8 source, or a texture swizzle which is only supported with
157 * the 3d path. Sometimes this blit happens on already-constructed
158 * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
159 */
160 if (*src_format == PIPE_FORMAT_S8_UINT &&
161 (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
162 dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
163 *fmt = FMT6_A8_UNORM;
164 *src_format = PIPE_FORMAT_A8_UNORM;
165 }
166 }
167
168 static void
fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format, enum a6xx_format *fmt)169 fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
170 enum a6xx_format *fmt)
171 {
172 if (*dst_format == PIPE_FORMAT_S8_UINT &&
173 (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
174 src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
175 *dst_format = PIPE_FORMAT_A8_UNORM;
176 *fmt = FMT6_A8_UNORM;
177 }
178 }
179
180 static void
r2d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, VkFilter filter, enum pipe_format dst_format)181 r2d_src(struct tu_cmd_buffer *cmd,
182 struct tu_cs *cs,
183 const struct fdl6_view *iview,
184 uint32_t layer,
185 VkFilter filter,
186 enum pipe_format dst_format)
187 {
188 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
189 if (filter != VK_FILTER_NEAREST)
190 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
191
192 enum a6xx_format fmt = (src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
193 enum pipe_format src_format = iview->format;
194 fixup_src_format(&src_format, dst_format, &fmt);
195
196 src_info =
197 (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
198 A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);
199
200 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
201 tu_cs_emit(cs, src_info);
202 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
203 tu_cs_image_ref_2d(cs, iview, layer, true);
204
205 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
206 tu_cs_image_flag_ref(cs, iview, layer);
207 }
208
209 static void
r2d_src_depth(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, VkFilter filter)210 r2d_src_depth(struct tu_cmd_buffer *cmd,
211 struct tu_cs *cs,
212 const struct tu_image_view *iview,
213 uint32_t layer,
214 VkFilter filter)
215 {
216 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
217 tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
218 tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
219 tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
220 /* SP_PS_2D_SRC_PITCH has shifted pitch field */
221 tu_cs_emit(cs, iview->depth_PITCH << 9);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
224 tu_cs_image_flag_ref(cs, &iview->view, layer);
225 }
226
227 static void
r2d_src_stencil(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, VkFilter filter)228 r2d_src_stencil(struct tu_cmd_buffer *cmd,
229 struct tu_cs *cs,
230 const struct tu_image_view *iview,
231 uint32_t layer,
232 VkFilter filter)
233 {
234 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
235 tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
236 tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
237 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
238 /* SP_PS_2D_SRC_PITCH has shifted pitch field */
239 tu_cs_emit(cs, iview->stencil_PITCH << 9);
240 }
241
242 static void
r2d_src_buffer(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, uint32_t width, uint32_t height, enum pipe_format dst_format)243 r2d_src_buffer(struct tu_cmd_buffer *cmd,
244 struct tu_cs *cs,
245 enum pipe_format format,
246 uint64_t va, uint32_t pitch,
247 uint32_t width, uint32_t height,
248 enum pipe_format dst_format)
249 {
250 struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR);
251 enum a6xx_format color_format = fmt.fmt;
252 fixup_src_format(&format, dst_format, &color_format);
253
254 tu_cs_emit_regs(cs,
255 A6XX_SP_PS_2D_SRC_INFO(
256 .color_format = color_format,
257 .color_swap = fmt.swap,
258 .srgb = util_format_is_srgb(format),
259 .unk20 = 1,
260 .unk22 = 1),
261 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
262 A6XX_SP_PS_2D_SRC(.qword = va),
263 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
264 }
265
266 static void
r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, enum pipe_format src_format)267 r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
268 enum pipe_format src_format)
269 {
270 uint32_t dst_info = iview->RB_2D_DST_INFO;
271 enum a6xx_format fmt = dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK;
272 enum pipe_format dst_format = iview->format;
273 fixup_dst_format(src_format, &dst_format, &fmt);
274
275 dst_info =
276 (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
277 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
278 tu_cs_emit(cs, dst_info);
279 tu_cs_image_ref_2d(cs, iview, layer, false);
280
281 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
282 tu_cs_image_flag_ref(cs, iview, layer);
283 }
284
285 static void
r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)286 r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
287 {
288 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
289 tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
290 tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
291 tu_cs_emit(cs, iview->depth_PITCH);
292
293 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
294 tu_cs_image_flag_ref(cs, &iview->view, layer);
295 }
296
297 static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)298 r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
299 {
300 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
301 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
302 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
303 tu_cs_emit(cs, iview->stencil_PITCH);
304 }
305
306 static void
r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, enum pipe_format src_format)307 r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
308 enum pipe_format src_format)
309 {
310 struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR);
311 enum a6xx_format color_fmt = fmt.fmt;
312 fixup_dst_format(src_format, &format, &color_fmt);
313 fmt.fmt = color_fmt;
314
315 tu_cs_emit_regs(cs,
316 A6XX_RB_2D_DST_INFO(
317 .color_format = fmt.fmt,
318 .color_swap = fmt.swap,
319 .srgb = util_format_is_srgb(format)),
320 A6XX_RB_2D_DST(.qword = va),
321 A6XX_RB_2D_DST_PITCH(pitch));
322 }
323
324 static void
r2d_setup_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format src_format, enum pipe_format dst_format, VkImageAspectFlags aspect_mask, unsigned blit_param, bool clear, bool ubwc, bool scissor)325 r2d_setup_common(struct tu_cmd_buffer *cmd,
326 struct tu_cs *cs,
327 enum pipe_format src_format,
328 enum pipe_format dst_format,
329 VkImageAspectFlags aspect_mask,
330 unsigned blit_param,
331 bool clear,
332 bool ubwc,
333 bool scissor)
334 {
335 enum a6xx_format fmt = tu6_base_format(dst_format);
336 fixup_dst_format(src_format, &dst_format, &fmt);
337 enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);
338
339 uint32_t unknown_8c01 = 0;
340
341 if ((dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
342 dst_format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) {
343 fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
344 }
345
346 /* note: the only format with partial clearing is D24S8 */
347 if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
348 /* preserve stencil channel */
349 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
350 unknown_8c01 = 0x08000041;
351 /* preserve depth channels */
352 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
353 unknown_8c01 = 0x00084001;
354 }
355
356 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
357 tu_cs_emit(cs, unknown_8c01);
358
359 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
360 .scissor = scissor,
361 .rotate = blit_param,
362 .solid_color = clear,
363 .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
364 .color_format = fmt,
365 .mask = 0xf,
366 .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
367 ).value;
368
369 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
370 tu_cs_emit(cs, blit_cntl);
371
372 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
373 tu_cs_emit(cs, blit_cntl);
374
375 if (fmt == FMT6_10_10_10_2_UNORM_DEST)
376 fmt = FMT6_16_16_16_16_FLOAT;
377
378 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
379 .sint = util_format_is_pure_sint(dst_format),
380 .uint = util_format_is_pure_uint(dst_format),
381 .color_format = fmt,
382 .srgb = util_format_is_srgb(dst_format),
383 .mask = 0xf));
384 }
385
386 static void
r2d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format src_format, enum pipe_format dst_format, VkImageAspectFlags aspect_mask, unsigned blit_param, bool clear, bool ubwc, VkSampleCountFlagBits samples)387 r2d_setup(struct tu_cmd_buffer *cmd,
388 struct tu_cs *cs,
389 enum pipe_format src_format,
390 enum pipe_format dst_format,
391 VkImageAspectFlags aspect_mask,
392 unsigned blit_param,
393 bool clear,
394 bool ubwc,
395 VkSampleCountFlagBits samples)
396 {
397 assert(samples == VK_SAMPLE_COUNT_1_BIT);
398
399 if (!cmd->state.pass) {
400 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
401 }
402
403 r2d_setup_common(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
404 }
405
406 static void
r2d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)407 r2d_teardown(struct tu_cmd_buffer *cmd,
408 struct tu_cs *cs)
409 {
410 /* nothing to do here */
411 }
412
413 static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)414 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
415 {
416 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
417 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
418 }
419
420 /* r3d_ = shader path operations */
421
422 static nir_ssa_def *
load_const(nir_builder *b, unsigned base, unsigned components)423 load_const(nir_builder *b, unsigned base, unsigned components)
424 {
425 return nir_load_uniform(b, components, 32, nir_imm_int(b, 0),
426 .base = base);
427 }
428
429 static nir_shader *
build_blit_vs_shader(void)430 build_blit_vs_shader(void)
431 {
432 nir_builder _b =
433 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
434 nir_builder *b = &_b;
435
436 nir_variable *out_pos =
437 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
438 "gl_Position");
439 out_pos->data.location = VARYING_SLOT_POS;
440
441 nir_ssa_def *vert0_pos = load_const(b, 0, 2);
442 nir_ssa_def *vert1_pos = load_const(b, 4, 2);
443 nir_ssa_def *vertex = nir_load_vertex_id(b);
444
445 nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
446 pos = nir_vec4(b, nir_channel(b, pos, 0),
447 nir_channel(b, pos, 1),
448 nir_imm_float(b, 0.0),
449 nir_imm_float(b, 1.0));
450
451 nir_store_var(b, out_pos, pos, 0xf);
452
453 nir_variable *out_coords =
454 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
455 "coords");
456 out_coords->data.location = VARYING_SLOT_VAR0;
457
458 nir_ssa_def *vert0_coords = load_const(b, 2, 2);
459 nir_ssa_def *vert1_coords = load_const(b, 6, 2);
460
461 /* Only used with "z scale" blit path which uses a 3d texture */
462 nir_ssa_def *z_coord = load_const(b, 8, 1);
463
464 nir_ssa_def *coords = nir_bcsel(b, nir_i2b1(b, vertex), vert1_coords, vert0_coords);
465 coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
466 z_coord);
467
468 nir_store_var(b, out_coords, coords, 0x7);
469
470 return b->shader;
471 }
472
473 static nir_shader *
build_clear_vs_shader(void)474 build_clear_vs_shader(void)
475 {
476 nir_builder _b =
477 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
478 nir_builder *b = &_b;
479
480 nir_variable *out_pos =
481 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
482 "gl_Position");
483 out_pos->data.location = VARYING_SLOT_POS;
484
485 nir_ssa_def *vert0_pos = load_const(b, 0, 2);
486 nir_ssa_def *vert1_pos = load_const(b, 4, 2);
487 /* c0.z is used to clear depth */
488 nir_ssa_def *depth = load_const(b, 2, 1);
489 nir_ssa_def *vertex = nir_load_vertex_id(b);
490
491 nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
492 pos = nir_vec4(b, nir_channel(b, pos, 0),
493 nir_channel(b, pos, 1),
494 depth, nir_imm_float(b, 1.0));
495
496 nir_store_var(b, out_pos, pos, 0xf);
497
498 nir_variable *out_layer =
499 nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
500 "gl_Layer");
501 out_layer->data.location = VARYING_SLOT_LAYER;
502 nir_ssa_def *layer = load_const(b, 3, 1);
503 nir_store_var(b, out_layer, layer, 1);
504
505 return b->shader;
506 }
507
508 static nir_shader *
build_blit_fs_shader(bool zscale)509 build_blit_fs_shader(bool zscale)
510 {
511 nir_builder _b =
512 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
513 zscale ? "zscale blit fs" : "blit fs");
514 nir_builder *b = &_b;
515
516 nir_variable *out_color =
517 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
518 "color0");
519 out_color->data.location = FRAG_RESULT_DATA0;
520
521 unsigned coord_components = zscale ? 3 : 2;
522 nir_variable *in_coords =
523 nir_variable_create(b->shader, nir_var_shader_in,
524 glsl_vec_type(coord_components),
525 "coords");
526 in_coords->data.location = VARYING_SLOT_VAR0;
527
528 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
529 /* Note: since we're just copying data, we rely on the HW ignoring the
530 * dest_type.
531 */
532 tex->dest_type = nir_type_int32;
533 tex->is_array = false;
534 tex->is_shadow = false;
535 tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;
536
537 tex->texture_index = 0;
538 tex->sampler_index = 0;
539
540 b->shader->info.num_textures = 1;
541 BITSET_SET(b->shader->info.textures_used, 0);
542
543 tex->src[0].src_type = nir_tex_src_coord;
544 tex->src[0].src = nir_src_for_ssa(nir_load_var(b, in_coords));
545 tex->coord_components = coord_components;
546
547 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
548 nir_builder_instr_insert(b, &tex->instr);
549
550 nir_store_var(b, out_color, &tex->dest.ssa, 0xf);
551
552 return b->shader;
553 }
554
555 /* We can only read multisample textures via txf_ms, so we need a separate
556 * variant for them.
557 */
558 static nir_shader *
build_ms_copy_fs_shader(void)559 build_ms_copy_fs_shader(void)
560 {
561 nir_builder _b =
562 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
563 "multisample copy fs");
564 nir_builder *b = &_b;
565
566 nir_variable *out_color =
567 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
568 "color0");
569 out_color->data.location = FRAG_RESULT_DATA0;
570
571 nir_variable *in_coords =
572 nir_variable_create(b->shader, nir_var_shader_in,
573 glsl_vec_type(2),
574 "coords");
575 in_coords->data.location = VARYING_SLOT_VAR0;
576
577 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
578
579 tex->op = nir_texop_txf_ms;
580
581 /* Note: since we're just copying data, we rely on the HW ignoring the
582 * dest_type.
583 */
584 tex->dest_type = nir_type_int32;
585 tex->is_array = false;
586 tex->is_shadow = false;
587 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
588
589 tex->texture_index = 0;
590 tex->sampler_index = 0;
591
592 b->shader->info.num_textures = 1;
593 BITSET_SET(b->shader->info.textures_used, 0);
594 BITSET_SET(b->shader->info.textures_used_by_txf, 0);
595
596 nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));
597
598 tex->src[0].src_type = nir_tex_src_coord;
599 tex->src[0].src = nir_src_for_ssa(coord);
600 tex->coord_components = 2;
601
602 tex->src[1].src_type = nir_tex_src_ms_index;
603 tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b));
604
605 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
606 nir_builder_instr_insert(b, &tex->instr);
607
608 nir_store_var(b, out_color, &tex->dest.ssa, 0xf);
609
610 return b->shader;
611 }
612
613 static nir_shader *
build_clear_fs_shader(unsigned mrts)614 build_clear_fs_shader(unsigned mrts)
615 {
616 nir_builder _b =
617 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
618 "mrt%u clear fs", mrts);
619 nir_builder *b = &_b;
620
621 for (unsigned i = 0; i < mrts; i++) {
622 nir_variable *out_color =
623 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
624 "color");
625 out_color->data.location = FRAG_RESULT_DATA0 + i;
626
627 nir_ssa_def *color = load_const(b, 4 * i, 4);
628 nir_store_var(b, out_color, color, 0xf);
629 }
630
631 return b->shader;
632 }
633
634 static void
compile_shader(struct tu_device *dev, struct nir_shader *nir, unsigned consts, unsigned *offset, enum global_shader idx)635 compile_shader(struct tu_device *dev, struct nir_shader *nir,
636 unsigned consts, unsigned *offset, enum global_shader idx)
637 {
638 nir->options = ir3_get_compiler_options(dev->compiler);
639
640 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
641 nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
642
643 ir3_finalize_nir(dev->compiler, nir);
644
645 struct ir3_shader *sh =
646 ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) {
647 .api_wavesize = IR3_SINGLE_OR_DOUBLE,
648 .real_wavesize = IR3_SINGLE_OR_DOUBLE,
649 .reserved_user_consts = align(consts, 4),
650 }, NULL);
651
652 struct ir3_shader_key key = {};
653 bool created;
654 struct ir3_shader_variant *so =
655 ir3_shader_get_variant(sh, &key, false, false, &created);
656
657 struct tu6_global *global = dev->global_bo->map;
658
659 assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
660 dev->global_shaders[idx] = sh;
661 dev->global_shader_variants[idx] = so;
662 memcpy(&global->shaders[*offset], so->bin,
663 sizeof(uint32_t) * so->info.sizedwords);
664 dev->global_shader_va[idx] = dev->global_bo->iova +
665 gb_offset(shaders[*offset]);
666 *offset += align(so->info.sizedwords, 32);
667 }
668
669 void
tu_init_clear_blit_shaders(struct tu_device *dev)670 tu_init_clear_blit_shaders(struct tu_device *dev)
671 {
672 unsigned offset = 0;
673 compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
674 compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
675 compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
676 compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
677 compile_shader(dev, build_ms_copy_fs_shader(), 0, &offset, GLOBAL_SH_FS_COPY_MS);
678
679 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
680 compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
681 GLOBAL_SH_FS_CLEAR0 + num_rts);
682 }
683 }
684
685 void
tu_destroy_clear_blit_shaders(struct tu_device *dev)686 tu_destroy_clear_blit_shaders(struct tu_device *dev)
687 {
688 for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
689 if (dev->global_shaders[i])
690 ir3_shader_destroy(dev->global_shaders[i]);
691 }
692 }
693
694 static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)695 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
696 uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
697 {
698 enum global_shader vs_id =
699 blit ? GLOBAL_SH_VS_BLIT : GLOBAL_SH_VS_CLEAR;
700
701 struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
702 uint64_t vs_iova = cmd->device->global_shader_va[vs_id];
703
704 enum global_shader fs_id = GLOBAL_SH_FS_BLIT;
705
706 if (z_scale)
707 fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
708 else if (samples != VK_SAMPLE_COUNT_1_BIT)
709 fs_id = GLOBAL_SH_FS_COPY_MS;
710
711 unsigned num_rts = util_bitcount(rts_mask);
712 if (!blit)
713 fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts;
714
715 struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
716 uint64_t fs_iova = cmd->device->global_shader_va[fs_id];
717
718 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
719 .vs_state = true,
720 .hs_state = true,
721 .ds_state = true,
722 .gs_state = true,
723 .fs_state = true,
724 .cs_state = true,
725 .gfx_ibo = true,
726 .cs_ibo = true,
727 .gfx_shared_const = true,
728 .gfx_bindless = 0x1f,
729 .cs_bindless = 0x1f));
730
731 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs);
732 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL);
733 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL);
734 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL);
735 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs);
736
737 struct tu_pvtmem_config pvtmem = {};
738 tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
739 tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
740
741 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
742 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
743
744 if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) {
745 /* Copy what the blob does here. This will emit an extra 0x3f
746 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
747 * this is working around yet.
748 */
749 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
750 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
751 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
752 tu_cs_emit(cs, 0);
753 } else {
754 tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL());
755 }
756 tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());
757
758 tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs, 0);
759
760 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
761 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
762 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
763
764 tu6_emit_fs_inputs(cs, fs);
765
766 tu_cs_emit_regs(cs,
767 A6XX_GRAS_CL_CNTL(
768 .persp_division_disable = 1,
769 .vp_xform_disable = 1,
770 .vp_clip_code_ignore = 1,
771 .clip_disable = 1));
772 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
773
774 tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());
775 tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
776
777 tu_cs_emit_regs(cs,
778 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
779 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
780 tu_cs_emit_regs(cs,
781 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
782 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
783
784 tu_cs_emit_regs(cs,
785 A6XX_VFD_INDEX_OFFSET(),
786 A6XX_VFD_INSTANCE_START_OFFSET());
787
788 if (rts_mask) {
789 unsigned rts_count = util_last_bit(rts_mask);
790 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
791 unsigned rt = 0;
792 for (unsigned i = 0; i < rts_count; i++) {
793 unsigned regid = 0;
794 if (rts_mask & (1u << i))
795 regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
796 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid));
797 }
798 }
799
800 cmd->state.line_mode = RECTANGULAR;
801 tu6_emit_msaa(cs, samples, cmd->state.line_mode);
802 }
803
804 static void
r3d_coords_raw(struct tu_cs *cs, const float *coords)805 r3d_coords_raw(struct tu_cs *cs, const float *coords)
806 {
807 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
808 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
809 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
810 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
811 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
812 CP_LOAD_STATE6_0_NUM_UNIT(2));
813 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
814 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
815 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
816 }
817
818 /* z coordinate for "z scale" blit path which uses a 3d texture */
819 static void
r3d_coord_z(struct tu_cs *cs, float z)820 r3d_coord_z(struct tu_cs *cs, float z)
821 {
822 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 4);
823 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) |
824 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
825 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
826 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
827 CP_LOAD_STATE6_0_NUM_UNIT(1));
828 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
829 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
830 tu_cs_emit(cs, fui(z));
831 tu_cs_emit(cs, 0);
832 tu_cs_emit(cs, 0);
833 tu_cs_emit(cs, 0);
834 }
835
836 static void
r3d_coords(struct tu_cs *cs, const VkOffset2D *dst, const VkOffset2D *src, const VkExtent2D *extent)837 r3d_coords(struct tu_cs *cs,
838 const VkOffset2D *dst,
839 const VkOffset2D *src,
840 const VkExtent2D *extent)
841 {
842 int32_t src_x1 = src ? src->x : 0;
843 int32_t src_y1 = src ? src->y : 0;
844 r3d_coords_raw(cs, (float[]) {
845 dst->x, dst->y,
846 src_x1, src_y1,
847 dst->x + extent->width, dst->y + extent->height,
848 src_x1 + extent->width, src_y1 + extent->height,
849 });
850 }
851
852 static void
r3d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)853 r3d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
854 {
855 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
856 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
857 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
858 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
859 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
860 CP_LOAD_STATE6_0_NUM_UNIT(1));
861 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
862 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
863 switch (format) {
864 case PIPE_FORMAT_Z24X8_UNORM:
865 case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
866 /* cleared as r8g8b8a8_unorm using special format */
867 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
868 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
869 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
870 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
871 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
872 } break;
873 case PIPE_FORMAT_Z16_UNORM:
874 case PIPE_FORMAT_Z32_FLOAT:
875 tu_cs_emit(cs, fui(val->depthStencil.depth));
876 tu_cs_emit(cs, 0);
877 tu_cs_emit(cs, 0);
878 tu_cs_emit(cs, 0);
879 break;
880 case PIPE_FORMAT_S8_UINT:
881 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
882 tu_cs_emit(cs, 0);
883 tu_cs_emit(cs, 0);
884 tu_cs_emit(cs, 0);
885 break;
886 default:
887 /* as color formats use clear value as-is */
888 assert(!util_format_is_depth_or_stencil(format));
889 tu_cs_emit_array(cs, val->color.uint32, 4);
890 break;
891 }
892 }
893
894 static void
r3d_src_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const uint32_t *tex_const, uint32_t offset_base, uint32_t offset_ubwc, VkFilter filter)895 r3d_src_common(struct tu_cmd_buffer *cmd,
896 struct tu_cs *cs,
897 const uint32_t *tex_const,
898 uint32_t offset_base,
899 uint32_t offset_ubwc,
900 VkFilter filter)
901 {
902 struct tu_cs_memory texture = { };
903 VkResult result = tu_cs_alloc(&cmd->sub_cs,
904 2, /* allocate space for a sampler too */
905 A6XX_TEX_CONST_DWORDS, &texture);
906 if (result != VK_SUCCESS) {
907 cmd->record_result = result;
908 return;
909 }
910
911 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
912
913 /* patch addresses for layer offset */
914 *(uint64_t*) (texture.map + 4) += offset_base;
915 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
916 texture.map[7] = ubwc_addr;
917 texture.map[8] = ubwc_addr >> 32;
918
919 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
920 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
921 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
922 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
923 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
924 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
925 0x60000; /* XXX used by blob, doesn't seem necessary */
926 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
927 A6XX_TEX_SAMP_1_UNNORM_COORDS |
928 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
929 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
930 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
931
932 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
933 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
934 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
935 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
936 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
937 CP_LOAD_STATE6_0_NUM_UNIT(1));
938 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
939
940 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));
941
942 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
943 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
944 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
945 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
946 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
947 CP_LOAD_STATE6_0_NUM_UNIT(1));
948 tu_cs_emit_qw(cs, texture.iova);
949
950 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
951 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
952 }
953
954 static void
r3d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, VkFilter filter, enum pipe_format dst_format)955 r3d_src(struct tu_cmd_buffer *cmd,
956 struct tu_cs *cs,
957 const struct fdl6_view *iview,
958 uint32_t layer,
959 VkFilter filter,
960 enum pipe_format dst_format)
961 {
962 uint32_t desc[A6XX_TEX_CONST_DWORDS];
963 memcpy(desc, iview->descriptor, sizeof(desc));
964
965 enum a6xx_format fmt = (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >>
966 A6XX_TEX_CONST_0_FMT__SHIFT;
967 enum pipe_format src_format = iview->format;
968 fixup_src_format(&src_format, dst_format, &fmt);
969 desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
970 A6XX_TEX_CONST_0_FMT(fmt);
971
972 r3d_src_common(cmd, cs, desc,
973 iview->layer_size * layer,
974 iview->ubwc_layer_size * layer,
975 filter);
976 }
977
978 static void
r3d_src_buffer(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, uint32_t width, uint32_t height, enum pipe_format dst_format)979 r3d_src_buffer(struct tu_cmd_buffer *cmd,
980 struct tu_cs *cs,
981 enum pipe_format format,
982 uint64_t va, uint32_t pitch,
983 uint32_t width, uint32_t height,
984 enum pipe_format dst_format)
985 {
986 uint32_t desc[A6XX_TEX_CONST_DWORDS];
987
988 struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR);
989 enum a6xx_format color_format = fmt.fmt;
990 fixup_src_format(&format, dst_format, &color_format);
991
992 desc[0] =
993 COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
994 A6XX_TEX_CONST_0_FMT(color_format) |
995 A6XX_TEX_CONST_0_SWAP(fmt.swap) |
996 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
997 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
998 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
999 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1000 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
1001 desc[2] =
1002 A6XX_TEX_CONST_2_PITCH(pitch) |
1003 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1004 desc[3] = 0;
1005 desc[4] = va;
1006 desc[5] = va >> 32;
1007 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1008 desc[i] = 0;
1009
1010 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1011 }
1012
1013 static void
r3d_src_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, enum pipe_format format, enum pipe_format dst_format, uint32_t gmem_offset, uint32_t cpp)1014 r3d_src_gmem(struct tu_cmd_buffer *cmd,
1015 struct tu_cs *cs,
1016 const struct tu_image_view *iview,
1017 enum pipe_format format,
1018 enum pipe_format dst_format,
1019 uint32_t gmem_offset,
1020 uint32_t cpp)
1021 {
1022 uint32_t desc[A6XX_TEX_CONST_DWORDS];
1023 memcpy(desc, iview->view.descriptor, sizeof(desc));
1024
1025 enum a6xx_format fmt = tu6_format_texture(format, TILE6_LINEAR).fmt;
1026 fixup_src_format(&format, dst_format, &fmt);
1027
1028 /* patch the format so that depth/stencil get the right format and swizzle */
1029 desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1030 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1031 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1032 desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
1033 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1034 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1035 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1036 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1037
1038 /* patched for gmem */
1039 desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1040 desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1041 desc[2] =
1042 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1043 A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
1044 desc[3] = 0;
1045 desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1046 desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
1047 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1048 desc[i] = 0;
1049
1050 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1051 }
1052
1053 static void
r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, enum pipe_format src_format)1054 r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1055 enum pipe_format src_format)
1056 {
1057 uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;
1058
1059 enum a6xx_format fmt = mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK;
1060 enum pipe_format dst_format = iview->format;
1061 fixup_dst_format(src_format, &dst_format, &fmt);
1062 mrt_buf_info =
1063 (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
1064 A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
1065 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1066 tu_cs_emit(cs, mrt_buf_info);
1067 tu_cs_image_ref(cs, iview, layer);
1068 tu_cs_emit(cs, 0);
1069
1070 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1071 tu_cs_image_flag_ref(cs, iview, layer);
1072
1073 /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
1074 * FMT6_NV12_Y.
1075 */
1076 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));
1077
1078 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
1079 }
1080
1081 static void
r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)1082 r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1083 {
1084 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1085 tu_cs_emit(cs, tu_image_view_depth(iview, RB_MRT_BUF_INFO));
1086 tu_cs_image_depth_ref(cs, iview, layer);
1087 tu_cs_emit(cs, 0);
1088
1089 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1090 tu_cs_image_flag_ref(cs, &iview->view, layer);
1091
1092 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->view.ubwc_enabled));
1093 }
1094
1095 static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)1096 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1097 {
1098 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
1099 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
1100 tu_cs_image_stencil_ref(cs, iview, layer);
1101 tu_cs_emit(cs, 0);
1102
1103 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
1104 }
1105
1106 static void
r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, enum pipe_format src_format)1107 r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1108 enum pipe_format src_format)
1109 {
1110 struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR);
1111
1112 enum a6xx_format color_fmt = fmt.fmt;
1113 fixup_dst_format(src_format, &format, &color_fmt);
1114
1115 tu_cs_emit_regs(cs,
1116 A6XX_RB_MRT_BUF_INFO(0, .color_format = color_fmt, .color_swap = fmt.swap),
1117 A6XX_RB_MRT_PITCH(0, pitch),
1118 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1119 A6XX_RB_MRT_BASE(0, .qword = va),
1120 A6XX_RB_MRT_BASE_GMEM(0, 0));
1121
1122 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
1123 }
1124
1125 static uint8_t
aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)1126 aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1127 {
1128 uint8_t mask = 0xf;
1129 assert(aspect_mask);
1130 /* note: the only format with partial writing is D24S8,
1131 * clear/blit uses the _AS_R8G8B8A8 format to access it
1132 */
1133 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1134 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1135 mask = 0x7;
1136 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1137 mask = 0x8;
1138 }
1139 return mask;
1140 }
1141
1142 static void
r3d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format src_format, enum pipe_format dst_format, VkImageAspectFlags aspect_mask, unsigned blit_param, bool clear, bool ubwc, VkSampleCountFlagBits samples)1143 r3d_setup(struct tu_cmd_buffer *cmd,
1144 struct tu_cs *cs,
1145 enum pipe_format src_format,
1146 enum pipe_format dst_format,
1147 VkImageAspectFlags aspect_mask,
1148 unsigned blit_param,
1149 bool clear,
1150 bool ubwc,
1151 VkSampleCountFlagBits samples)
1152 {
1153 enum a6xx_format fmt = tu6_base_format(dst_format);
1154 fixup_dst_format(src_format, &dst_format, &fmt);
1155
1156 if ((dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
1157 dst_format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) {
1158 fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
1159 }
1160
1161 if (!cmd->state.pass) {
1162 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
1163 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1164 }
1165
1166 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
1167 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
1168
1169 r3d_common(cmd, cs, !clear, 1, blit_param, samples);
1170
1171 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1172 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1173 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1174 0xfc000000);
1175 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
1176
1177 tu_cs_emit_regs(cs,
1178 A6XX_RB_FS_OUTPUT_CNTL0(),
1179 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1180
1181 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1182 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1183
1184 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1185 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1186 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1187 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1188 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1189 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1190 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1191
1192 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
1193 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
1194
1195 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1196 .color_format = fmt,
1197 .color_sint = util_format_is_pure_sint(dst_format),
1198 .color_uint = util_format_is_pure_uint(dst_format)));
1199
1200 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1201 .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1202 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1203 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1204
1205 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1206 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1207
1208 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1209 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1210
1211 /* Disable sample counting in order to not affect occlusion query. */
1212 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1213
1214 if (cmd->state.prim_generated_query_running_before_rp) {
1215 tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
1216 }
1217
1218 if (cmd->state.predication_active) {
1219 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1220 tu_cs_emit(cs, 0);
1221 }
1222 }
1223
1224 static void
r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)1225 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1226 {
1227 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1228 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1229 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1230 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1231 tu_cs_emit(cs, 1); /* instance count */
1232 tu_cs_emit(cs, 2); /* vertex count */
1233 }
1234
1235 static void
r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)1236 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1237 {
1238 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1239 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1240 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1241 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1242 tu_cs_emit(cs, 1); /* instance count */
1243 tu_cs_emit(cs, 2); /* vertex count */
1244 }
1245
1246 static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)1247 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1248 {
1249 if (cmd->state.predication_active) {
1250 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1251 tu_cs_emit(cs, 1);
1252 }
1253
1254 /* Re-enable sample counting. */
1255 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1256
1257 if (cmd->state.prim_generated_query_running_before_rp) {
1258 tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
1259 }
1260 }
1261
1262 /* blit ops - common interface for 2d/shader paths */
1263
1264 struct blit_ops {
1265 void (*coords)(struct tu_cs *cs,
1266 const VkOffset2D *dst,
1267 const VkOffset2D *src,
1268 const VkExtent2D *extent);
1269 void (*clear_value)(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val);
1270 void (*src)(
1271 struct tu_cmd_buffer *cmd,
1272 struct tu_cs *cs,
1273 const struct fdl6_view *iview,
1274 uint32_t layer,
1275 VkFilter filter,
1276 enum pipe_format dst_format);
1277 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1278 enum pipe_format format,
1279 uint64_t va, uint32_t pitch,
1280 uint32_t width, uint32_t height,
1281 enum pipe_format dst_format);
1282 void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1283 enum pipe_format src_format);
1284 void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1285 void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1286 void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1287 enum pipe_format src_format);
1288 void (*setup)(struct tu_cmd_buffer *cmd,
1289 struct tu_cs *cs,
1290 enum pipe_format src_format,
1291 enum pipe_format dst_format,
1292 VkImageAspectFlags aspect_mask,
1293 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1294 bool clear,
1295 bool ubwc,
1296 VkSampleCountFlagBits samples);
1297 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1298 void (*teardown)(struct tu_cmd_buffer *cmd,
1299 struct tu_cs *cs);
1300 };
1301
1302 static const struct blit_ops r2d_ops = {
1303 .coords = r2d_coords,
1304 .clear_value = r2d_clear_value,
1305 .src = r2d_src,
1306 .src_buffer = r2d_src_buffer,
1307 .dst = r2d_dst,
1308 .dst_depth = r2d_dst_depth,
1309 .dst_stencil = r2d_dst_stencil,
1310 .dst_buffer = r2d_dst_buffer,
1311 .setup = r2d_setup,
1312 .run = r2d_run,
1313 .teardown = r2d_teardown,
1314 };
1315
1316 static const struct blit_ops r3d_ops = {
1317 .coords = r3d_coords,
1318 .clear_value = r3d_clear_value,
1319 .src = r3d_src,
1320 .src_buffer = r3d_src_buffer,
1321 .dst = r3d_dst,
1322 .dst_depth = r3d_dst_depth,
1323 .dst_stencil = r3d_dst_stencil,
1324 .dst_buffer = r3d_dst_buffer,
1325 .setup = r3d_setup,
1326 .run = r3d_run,
1327 .teardown = r3d_teardown,
1328 };
1329
1330 /* passthrough set coords from 3D extents */
1331 static void
coords(const struct blit_ops *ops, struct tu_cs *cs, const VkOffset3D *dst, const VkOffset3D *src, const VkExtent3D *extent)1332 coords(const struct blit_ops *ops,
1333 struct tu_cs *cs,
1334 const VkOffset3D *dst,
1335 const VkOffset3D *src,
1336 const VkExtent3D *extent)
1337 {
1338 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1339 }
1340
1341 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1342 * to be a bit careful because we have to pick a format with matching UBWC
1343 * compression behavior, so no just returning R8_UINT/R16_UINT/R32_UINT for
1344 * everything.
1345 */
1346 static enum pipe_format
copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)1347 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1348 {
1349 if (vk_format_is_compressed(vk_format)) {
1350 switch (vk_format_get_blocksize(vk_format)) {
1351 case 1: return PIPE_FORMAT_R8_UINT;
1352 case 2: return PIPE_FORMAT_R16_UINT;
1353 case 4: return PIPE_FORMAT_R32_UINT;
1354 case 8: return PIPE_FORMAT_R32G32_UINT;
1355 case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
1356 default:
1357 unreachable("unhandled format size");
1358 }
1359 }
1360
1361 enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
1362
1363 /* For SNORM formats, copy them as the equivalent UNORM format. If we treat
1364 * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1365 * (also -1.0), when we're supposed to be memcpying the bits. See
1366 * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1367 */
1368 format = util_format_snorm_to_unorm(format);
1369
1370 switch (format) {
1371 case PIPE_FORMAT_R9G9B9E5_FLOAT:
1372 return PIPE_FORMAT_R32_UINT;
1373
1374 case PIPE_FORMAT_G8_B8R8_420_UNORM:
1375 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
1376 return PIPE_FORMAT_R8G8_UNORM;
1377 else
1378 return PIPE_FORMAT_Y8_UNORM;
1379 case PIPE_FORMAT_G8_B8_R8_420_UNORM:
1380 return PIPE_FORMAT_R8_UNORM;
1381
1382 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1383 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1384 return PIPE_FORMAT_S8_UINT;
1385 assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
1386 return PIPE_FORMAT_Z32_FLOAT;
1387
1388 default:
1389 return format;
1390 }
1391 }
1392
1393 void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image *image, const VkClearValue *value)1394 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
1395 struct tu_cs *cs,
1396 struct tu_image *image,
1397 const VkClearValue *value)
1398 {
1399 const struct blit_ops *ops = &r2d_ops;
1400
1401 /* It is assumed that LRZ cache is invalidated at this point for
1402 * the writes here to become visible to LRZ.
1403 *
1404 * LRZ writes are going through UCHE cache, flush UCHE before changing
1405 * LRZ via CCU. Don't need to invalidate CCU since we are presumably
1406 * writing whole cache lines we assume to be 64 bytes.
1407 */
1408 tu6_emit_event_write(cmd, &cmd->cs, CACHE_FLUSH_TS);
1409
1410 ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
1411 VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
1412 VK_SAMPLE_COUNT_1_BIT);
1413 ops->clear_value(cs, PIPE_FORMAT_Z16_UNORM, value);
1414 ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
1415 image->iova + image->lrz_offset,
1416 image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
1417 ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
1418 ops->run(cmd, cs);
1419 ops->teardown(cmd, cs);
1420
1421 /* Clearing writes via CCU color in the PS stage, and LRZ is read via
1422 * UCHE in the earlier GRAS stage.
1423 */
1424 cmd->state.cache.flush_bits |=
1425 TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
1426 TU_CMD_FLAG_WAIT_FOR_IDLE;
1427 }
1428
1429 void
tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image *image)1430 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
1431 struct tu_cs *cs,
1432 struct tu_image *image)
1433 {
1434 const struct blit_ops *ops = &r2d_ops;
1435 VkClearValue clear = { .color = { .uint32[0] = 0xffffffff } };
1436
1437 /* LRZ fast-clear buffer is always allocated with 512 bytes size. */
1438 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
1439 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
1440 VK_SAMPLE_COUNT_1_BIT);
1441 ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &clear);
1442 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
1443 image->iova + image->lrz_fc_offset, 512,
1444 PIPE_FORMAT_R32_UINT);
1445 ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {128, 1});
1446 ops->run(cmd, cs);
1447 ops->teardown(cmd, cs);
1448 }
1449
1450 static void
tu_image_view_copy_blit(struct fdl6_view *iview, struct tu_image *image, enum pipe_format format, const VkImageSubresourceLayers *subres, uint32_t layer, bool z_scale)1451 tu_image_view_copy_blit(struct fdl6_view *iview,
1452 struct tu_image *image,
1453 enum pipe_format format,
1454 const VkImageSubresourceLayers *subres,
1455 uint32_t layer,
1456 bool z_scale)
1457 {
1458 VkImageAspectFlags aspect_mask = subres->aspectMask;
1459
1460 /* always use the AS_R8G8B8A8 format for these */
1461 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
1462 format == PIPE_FORMAT_Z24X8_UNORM) {
1463 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1464 }
1465
1466 const struct fdl_layout *layout =
1467 &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
1468
1469 fdl6_view_init(iview, &layout, &(struct fdl_view_args) {
1470 .iova = image->iova,
1471 .base_array_layer = subres->baseArrayLayer + layer,
1472 .layer_count = 1,
1473 .base_miplevel = subres->mipLevel,
1474 .level_count = 1,
1475 .format = tu_format_for_aspect(format, aspect_mask),
1476 .swiz = {
1477 PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
1478 },
1479 .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
1480 }, false);
1481 }
1482
1483 static void
tu_image_view_copy(struct fdl6_view *iview, struct tu_image *image, enum pipe_format format, const VkImageSubresourceLayers *subres, uint32_t layer)1484 tu_image_view_copy(struct fdl6_view *iview,
1485 struct tu_image *image,
1486 enum pipe_format format,
1487 const VkImageSubresourceLayers *subres,
1488 uint32_t layer)
1489 {
1490 tu_image_view_copy_blit(iview, image, format, subres, layer, false);
1491 }
1492
1493 static void
tu_image_view_blit(struct fdl6_view *iview, struct tu_image *image, const VkImageSubresourceLayers *subres, uint32_t layer)1494 tu_image_view_blit(struct fdl6_view *iview,
1495 struct tu_image *image,
1496 const VkImageSubresourceLayers *subres,
1497 uint32_t layer)
1498 {
1499 enum pipe_format format =
1500 tu6_plane_format(image->vk.format, tu6_plane_index(image->vk.format,
1501 subres->aspectMask));
1502 tu_image_view_copy_blit(iview, image, format, subres, layer, false);
1503 }
1504
1505 static void
tu6_blit_image(struct tu_cmd_buffer *cmd, struct tu_image *src_image, struct tu_image *dst_image, const VkImageBlit2 *info, VkFilter filter)1506 tu6_blit_image(struct tu_cmd_buffer *cmd,
1507 struct tu_image *src_image,
1508 struct tu_image *dst_image,
1509 const VkImageBlit2 *info,
1510 VkFilter filter)
1511 {
1512 const struct blit_ops *ops = &r2d_ops;
1513 struct tu_cs *cs = &cmd->cs;
1514 bool z_scale = false;
1515 uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
1516
1517 /* 2D blit can't do rotation mirroring from just coordinates */
1518 static const enum a6xx_rotation rotate[2][2] = {
1519 {ROTATE_0, ROTATE_HFLIP},
1520 {ROTATE_VFLIP, ROTATE_180},
1521 };
1522
1523 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1524 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1525 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1526 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1527
1528 int32_t src0_z = info->srcOffsets[0].z;
1529 int32_t src1_z = info->srcOffsets[1].z;
1530
1531 if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
1532 info->dstOffsets[1].z - info->dstOffsets[0].z) ||
1533 info->srcOffsets[1].z < info->srcOffsets[0].z) {
1534 z_scale = true;
1535 }
1536
1537 if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
1538 layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
1539 src0_z = info->srcOffsets[1].z;
1540 src1_z = info->srcOffsets[0].z;
1541 }
1542
1543 if (info->dstSubresource.layerCount > 1) {
1544 assert(layers <= 1);
1545 layers = info->dstSubresource.layerCount;
1546 }
1547
1548 /* BC1_RGB_* formats need to have their last components overriden with 1
1549 * when sampling, which is normally handled with the texture descriptor
1550 * swizzle. The 2d path can't handle that, so use the 3d path.
1551 *
1552 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1553 * the 2d path.
1554 */
1555
1556 unsigned blit_param = rotate[mirror_y][mirror_x];
1557 if (dst_image->layout[0].nr_samples > 1 ||
1558 src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1559 src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1560 filter == VK_FILTER_CUBIC_EXT ||
1561 z_scale) {
1562 ops = &r3d_ops;
1563 blit_param = z_scale;
1564 }
1565
1566 /* use the right format in setup() for D32_S8
1567 * TODO: this probably should use a helper
1568 */
1569 enum pipe_format src_format =
1570 tu6_plane_format(src_image->vk.format,
1571 tu6_plane_index(src_image->vk.format,
1572 info->srcSubresource.aspectMask));
1573 enum pipe_format dst_format =
1574 tu6_plane_format(dst_image->vk.format,
1575 tu6_plane_index(src_image->vk.format,
1576 info->srcSubresource.aspectMask));
1577 trace_start_blit(&cmd->trace, cs);
1578
1579 ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
1580 blit_param, false, dst_image->layout[0].ubwc,
1581 dst_image->layout[0].nr_samples);
1582
1583 if (ops == &r3d_ops) {
1584 r3d_coords_raw(cs, (float[]) {
1585 info->dstOffsets[0].x, info->dstOffsets[0].y,
1586 info->srcOffsets[0].x, info->srcOffsets[0].y,
1587 info->dstOffsets[1].x, info->dstOffsets[1].y,
1588 info->srcOffsets[1].x, info->srcOffsets[1].y
1589 });
1590 } else {
1591 tu_cs_emit_regs(cs,
1592 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1593 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1594 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1595 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1596 tu_cs_emit_regs(cs,
1597 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1598 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1599 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1600 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1601 }
1602
1603 struct fdl6_view dst, src;
1604 tu_image_view_blit(&dst, dst_image, &info->dstSubresource,
1605 MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
1606
1607 if (z_scale) {
1608 tu_image_view_copy_blit(&src, src_image, src_format,
1609 &info->srcSubresource, 0, true);
1610 ops->src(cmd, cs, &src, 0, filter, dst_format);
1611 } else {
1612 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1613 }
1614
1615 for (uint32_t i = 0; i < layers; i++) {
1616 if (z_scale) {
1617 float t = ((float) i + 0.5f) / (float) layers;
1618 r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z);
1619 } else {
1620 ops->src(cmd, cs, &src, i, filter, dst_format);
1621 }
1622 ops->dst(cs, &dst, i, src_format);
1623 ops->run(cmd, cs);
1624 }
1625
1626 ops->teardown(cmd, cs);
1627
1628 trace_end_blit(&cmd->trace, cs,
1629 ops == &r3d_ops,
1630 src_image->vk.format,
1631 dst_image->vk.format,
1632 layers);
1633 }
1634
1635 VKAPI_ATTR void VKAPI_CALL
tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, const VkBlitImageInfo2* pBlitImageInfo)1636 tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
1637 const VkBlitImageInfo2* pBlitImageInfo)
1638
1639 {
1640 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1641 TU_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
1642 TU_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
1643
1644 for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
1645 /* can't blit both depth and stencil at once with D32_S8
1646 * TODO: more advanced 3D blit path to support it instead?
1647 */
1648 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1649 dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1650 VkImageBlit2 region = pBlitImageInfo->pRegions[i];
1651 u_foreach_bit(b, region.dstSubresource.aspectMask) {
1652 region.srcSubresource.aspectMask = BIT(b);
1653 region.dstSubresource.aspectMask = BIT(b);
1654 tu6_blit_image(cmd, src_image, dst_image, ®ion, pBlitImageInfo->filter);
1655 }
1656 continue;
1657 }
1658 tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
1659 pBlitImageInfo->filter);
1660 }
1661
1662 if (dst_image->lrz_height) {
1663 tu_disable_lrz(cmd, &cmd->cs, dst_image);
1664 }
1665 }
1666
1667 static void
copy_compressed(VkFormat format, VkOffset3D *offset, VkExtent3D *extent, uint32_t *width, uint32_t *height)1668 copy_compressed(VkFormat format,
1669 VkOffset3D *offset,
1670 VkExtent3D *extent,
1671 uint32_t *width,
1672 uint32_t *height)
1673 {
1674 if (!vk_format_is_compressed(format))
1675 return;
1676
1677 uint32_t block_width = vk_format_get_blockwidth(format);
1678 uint32_t block_height = vk_format_get_blockheight(format);
1679
1680 offset->x /= block_width;
1681 offset->y /= block_height;
1682
1683 if (extent) {
1684 extent->width = DIV_ROUND_UP(extent->width, block_width);
1685 extent->height = DIV_ROUND_UP(extent->height, block_height);
1686 }
1687 if (width)
1688 *width = DIV_ROUND_UP(*width, block_width);
1689 if (height)
1690 *height = DIV_ROUND_UP(*height, block_height);
1691 }
1692
1693 static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, struct tu_buffer *src_buffer, struct tu_image *dst_image, const VkBufferImageCopy2 *info)1694 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1695 struct tu_buffer *src_buffer,
1696 struct tu_image *dst_image,
1697 const VkBufferImageCopy2 *info)
1698 {
1699 struct tu_cs *cs = &cmd->cs;
1700 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1701 enum pipe_format src_format =
1702 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
1703 enum pipe_format dst_format =
1704 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
1705 const struct blit_ops *ops = &r2d_ops;
1706
1707 /* special case for buffer to stencil */
1708 if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
1709 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1710 src_format = PIPE_FORMAT_S8_UINT;
1711 }
1712
1713 /* note: could use "R8_UNORM" when no UBWC */
1714 if (src_format == PIPE_FORMAT_Y8_UNORM)
1715 ops = &r3d_ops;
1716
1717 VkOffset3D offset = info->imageOffset;
1718 VkExtent3D extent = info->imageExtent;
1719 uint32_t src_width = info->bufferRowLength ?: extent.width;
1720 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1721
1722 copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
1723
1724 uint32_t pitch = src_width * util_format_get_blocksize(src_format);
1725 uint32_t layer_size = src_height * pitch;
1726
1727 ops->setup(cmd, cs, src_format, dst_format,
1728 info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc,
1729 dst_image->layout[0].nr_samples);
1730
1731 struct fdl6_view dst;
1732 tu_image_view_copy(&dst, dst_image, dst_format, &info->imageSubresource, offset.z);
1733
1734 for (uint32_t i = 0; i < layers; i++) {
1735 ops->dst(cs, &dst, i, src_format);
1736
1737 uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
1738 if ((src_va & 63) || (pitch & 63)) {
1739 for (uint32_t y = 0; y < extent.height; y++) {
1740 uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
1741 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1742 x + extent.width, 1, dst_format);
1743 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1744 &(VkExtent2D) {extent.width, 1});
1745 ops->run(cmd, cs);
1746 src_va += pitch;
1747 }
1748 } else {
1749 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height, dst_format);
1750 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1751 ops->run(cmd, cs);
1752 }
1753 }
1754
1755 ops->teardown(cmd, cs);
1756 }
1757
1758 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)1759 tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
1760 const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
1761 {
1762 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1763 TU_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
1764 TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
1765
1766 for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
1767 tu_copy_buffer_to_image(cmd, src_buffer, dst_image,
1768 pCopyBufferToImageInfo->pRegions + i);
1769
1770 if (dst_image->lrz_height) {
1771 tu_disable_lrz(cmd, &cmd->cs, dst_image);
1772 }
1773 }
1774
1775 static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, struct tu_image *src_image, struct tu_buffer *dst_buffer, const VkBufferImageCopy2 *info)1776 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1777 struct tu_image *src_image,
1778 struct tu_buffer *dst_buffer,
1779 const VkBufferImageCopy2 *info)
1780 {
1781 struct tu_cs *cs = &cmd->cs;
1782 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1783 enum pipe_format dst_format =
1784 copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
1785 enum pipe_format src_format =
1786 copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
1787 const struct blit_ops *ops = &r2d_ops;
1788
1789 if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
1790 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1791 dst_format = PIPE_FORMAT_S8_UINT;
1792 }
1793
1794 /* note: could use "R8_UNORM" when no UBWC */
1795 if (dst_format == PIPE_FORMAT_Y8_UNORM)
1796 ops = &r3d_ops;
1797
1798 VkOffset3D offset = info->imageOffset;
1799 VkExtent3D extent = info->imageExtent;
1800 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1801 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1802
1803 copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
1804
1805 uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
1806 uint32_t layer_size = pitch * dst_height;
1807
1808 ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1809 VK_SAMPLE_COUNT_1_BIT);
1810
1811 struct fdl6_view src;
1812 tu_image_view_copy(&src, src_image, src_format, &info->imageSubresource, offset.z);
1813
1814 for (uint32_t i = 0; i < layers; i++) {
1815 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
1816
1817 uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
1818 if ((dst_va & 63) || (pitch & 63)) {
1819 for (uint32_t y = 0; y < extent.height; y++) {
1820 uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
1821 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
1822 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1823 &(VkExtent2D) {extent.width, 1});
1824 ops->run(cmd, cs);
1825 dst_va += pitch;
1826 }
1827 } else {
1828 ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
1829 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1830 ops->run(cmd, cs);
1831 }
1832 }
1833
1834 ops->teardown(cmd, cs);
1835 }
1836
1837 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)1838 tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
1839 const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)
1840 {
1841 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1842 TU_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
1843 TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
1844
1845 for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
1846 tu_copy_image_to_buffer(cmd, src_image, dst_buffer,
1847 pCopyImageToBufferInfo->pRegions + i);
1848 }
1849
1850 /* Tiled formats don't support swapping, which means that we can't support
1851 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1852 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1853 * Currently we fake support for tiled swapped formats and use the unswapped
1854 * format instead, but this means that reinterpreting copies to and from
1855 * swapped formats can't be performed correctly unless we can swizzle the
1856 * components by reinterpreting the other image as the "correct" swapped
1857 * format, i.e. only when the other image is linear.
1858 */
1859
1860 static bool
is_swapped_format(enum pipe_format format)1861 is_swapped_format(enum pipe_format format)
1862 {
1863 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1864 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1865 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1866 }
1867
1868 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1869 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1870 * versa). This should mirror the logic in fdl6_layout.
1871 */
1872 static bool
image_is_r8g8(struct tu_image *image)1873 image_is_r8g8(struct tu_image *image)
1874 {
1875 return image->layout[0].cpp == 2 &&
1876 vk_format_get_nr_components(image->vk.format) == 2;
1877 }
1878
1879 static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd, struct tu_image *src_image, struct tu_image *dst_image, const VkImageCopy2 *info)1880 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1881 struct tu_image *src_image,
1882 struct tu_image *dst_image,
1883 const VkImageCopy2 *info)
1884 {
1885 const struct blit_ops *ops = &r2d_ops;
1886 struct tu_cs *cs = &cmd->cs;
1887
1888 if (dst_image->layout[0].nr_samples > 1)
1889 ops = &r3d_ops;
1890
1891 enum pipe_format format = PIPE_FORMAT_NONE;
1892 VkOffset3D src_offset = info->srcOffset;
1893 VkOffset3D dst_offset = info->dstOffset;
1894 VkExtent3D extent = info->extent;
1895 uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount);
1896
1897 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1898 * Images":
1899 *
1900 * When copying between compressed and uncompressed formats the extent
1901 * members represent the texel dimensions of the source image and not
1902 * the destination. When copying from a compressed image to an
1903 * uncompressed image the image texel dimensions written to the
1904 * uncompressed image will be source extent divided by the compressed
1905 * texel block dimensions. When copying from an uncompressed image to a
1906 * compressed image the image texel dimensions written to the compressed
1907 * image will be the source extent multiplied by the compressed texel
1908 * block dimensions.
1909 *
1910 * This means we only have to adjust the extent if the source image is
1911 * compressed.
1912 */
1913 copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
1914 copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
1915
1916 enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
1917 enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
1918
1919 /* note: could use "R8_UNORM" when no UBWC */
1920 if (dst_format == PIPE_FORMAT_Y8_UNORM ||
1921 src_format == PIPE_FORMAT_Y8_UNORM)
1922 ops = &r3d_ops;
1923
1924 bool use_staging_blit = false;
1925
1926 if (src_format == dst_format) {
1927 /* Images that share a format can always be copied directly because it's
1928 * the same as a blit.
1929 */
1930 format = src_format;
1931 } else if (!src_image->layout[0].tile_mode) {
1932 /* If an image is linear, we can always safely reinterpret it with the
1933 * other image's format and then do a regular blit.
1934 */
1935 format = dst_format;
1936 } else if (!dst_image->layout[0].tile_mode) {
1937 format = src_format;
1938 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1939 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1940 * due to the different tile layout.
1941 */
1942 use_staging_blit = true;
1943 } else if (is_swapped_format(src_format) ||
1944 is_swapped_format(dst_format)) {
1945 /* If either format has a non-identity swap, then we can't copy
1946 * to/from it.
1947 */
1948 use_staging_blit = true;
1949 } else if (!src_image->layout[0].ubwc) {
1950 format = dst_format;
1951 } else if (!dst_image->layout[0].ubwc) {
1952 format = src_format;
1953 } else {
1954 /* Both formats use UBWC and so neither can be reinterpreted.
1955 * TODO: We could do an in-place decompression of the dst instead.
1956 */
1957 perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
1958 use_staging_blit = true;
1959 }
1960
1961 struct fdl6_view dst, src;
1962
1963 if (use_staging_blit) {
1964 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
1965 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
1966
1967 struct fdl_layout staging_layout = { 0 };
1968 VkOffset3D staging_offset = { 0 };
1969
1970 staging_layout.tile_mode = TILE6_LINEAR;
1971 staging_layout.ubwc = false;
1972
1973 fdl6_layout(&staging_layout,
1974 src_format,
1975 src_image->layout[0].nr_samples,
1976 extent.width,
1977 extent.height,
1978 extent.depth,
1979 1,
1980 info->srcSubresource.layerCount,
1981 extent.depth > 1,
1982 NULL);
1983
1984 struct tu_bo *staging_bo;
1985 VkResult result = tu_get_scratch_bo(cmd->device,
1986 staging_layout.size,
1987 &staging_bo);
1988 if (result != VK_SUCCESS) {
1989 cmd->record_result = result;
1990 return;
1991 }
1992
1993 struct fdl6_view staging;
1994 const struct fdl_layout *staging_layout_ptr = &staging_layout;
1995 fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) {
1996 .iova = staging_bo->iova,
1997 .base_array_layer = 0,
1998 .layer_count = 1,
1999 .base_miplevel = 0,
2000 .level_count = info->srcSubresource.layerCount,
2001 .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2002 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2003 .type = FDL_VIEW_TYPE_2D,
2004 }, false);
2005
2006 ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2007 dst_image->layout[0].nr_samples);
2008 coords(ops, cs, &staging_offset, &src_offset, &extent);
2009
2010 for (uint32_t i = 0; i < layers_to_copy; i++) {
2011 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2012 ops->dst(cs, &staging, i, src_format);
2013 ops->run(cmd, cs);
2014 }
2015
2016 /* When executed by the user there has to be a pipeline barrier here,
2017 * but since we're doing it manually we'll have to flush ourselves.
2018 */
2019 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2020 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2021 tu_cs_emit_wfi(cs);
2022
2023 fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) {
2024 .iova = staging_bo->iova,
2025 .base_array_layer = 0,
2026 .layer_count = 1,
2027 .base_miplevel = 0,
2028 .level_count = info->srcSubresource.layerCount,
2029 .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2030 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2031 .type = FDL_VIEW_TYPE_2D,
2032 }, false);
2033
2034 ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2035 0, false, dst_image->layout[0].ubwc,
2036 dst_image->layout[0].nr_samples);
2037 coords(ops, cs, &dst_offset, &staging_offset, &extent);
2038
2039 for (uint32_t i = 0; i < layers_to_copy; i++) {
2040 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2041 ops->dst(cs, &dst, i, dst_format);
2042 ops->run(cmd, cs);
2043 }
2044 } else {
2045 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
2046 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z);
2047
2048 ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
2049 0, false, dst_image->layout[0].ubwc,
2050 dst_image->layout[0].nr_samples);
2051 coords(ops, cs, &dst_offset, &src_offset, &extent);
2052
2053 for (uint32_t i = 0; i < layers_to_copy; i++) {
2054 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
2055 ops->dst(cs, &dst, i, format);
2056 ops->run(cmd, cs);
2057 }
2058 }
2059
2060 ops->teardown(cmd, cs);
2061 }
2062
2063 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, const VkCopyImageInfo2* pCopyImageInfo)2064 tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
2065 const VkCopyImageInfo2* pCopyImageInfo)
2066 {
2067 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2068 TU_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
2069 TU_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
2070
2071 for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
2072 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2073 VkImageCopy2 info = pCopyImageInfo->pRegions[i];
2074 u_foreach_bit(b, info.dstSubresource.aspectMask) {
2075 info.srcSubresource.aspectMask = BIT(b);
2076 info.dstSubresource.aspectMask = BIT(b);
2077 tu_copy_image_to_image(cmd, src_image, dst_image, &info);
2078 }
2079 continue;
2080 }
2081
2082 tu_copy_image_to_image(cmd, src_image, dst_image,
2083 pCopyImageInfo->pRegions + i);
2084 }
2085
2086 if (dst_image->lrz_height) {
2087 tu_disable_lrz(cmd, &cmd->cs, dst_image);
2088 }
2089 }
2090
2091 static void
copy_buffer(struct tu_cmd_buffer *cmd, uint64_t dst_va, uint64_t src_va, uint64_t size, uint32_t block_size)2092 copy_buffer(struct tu_cmd_buffer *cmd,
2093 uint64_t dst_va,
2094 uint64_t src_va,
2095 uint64_t size,
2096 uint32_t block_size)
2097 {
2098 const struct blit_ops *ops = &r2d_ops;
2099 struct tu_cs *cs = &cmd->cs;
2100 enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
2101 uint64_t blocks = size / block_size;
2102
2103 ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2104 VK_SAMPLE_COUNT_1_BIT);
2105
2106 while (blocks) {
2107 uint32_t src_x = (src_va & 63) / block_size;
2108 uint32_t dst_x = (dst_va & 63) / block_size;
2109 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
2110
2111 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
2112 ops->dst_buffer( cs, format, dst_va & ~63, 0, format);
2113 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
2114 ops->run(cmd, cs);
2115
2116 src_va += width * block_size;
2117 dst_va += width * block_size;
2118 blocks -= width;
2119 }
2120
2121 ops->teardown(cmd, cs);
2122 }
2123
2124 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCopyBufferInfo)2125 tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
2126 const VkCopyBufferInfo2 *pCopyBufferInfo)
2127 {
2128 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2129 TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
2130 TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
2131
2132 for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
2133 const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
2134 copy_buffer(cmd,
2135 dst_buffer->iova + region->dstOffset,
2136 src_buffer->iova + region->srcOffset,
2137 region->size, 1);
2138 }
2139 }
2140
2141 VKAPI_ATTR void VKAPI_CALL
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize dataSize, const void *pData)2142 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
2143 VkBuffer dstBuffer,
2144 VkDeviceSize dstOffset,
2145 VkDeviceSize dataSize,
2146 const void *pData)
2147 {
2148 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2149 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2150
2151 struct tu_cs_memory tmp;
2152 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
2153 if (result != VK_SUCCESS) {
2154 cmd->record_result = result;
2155 return;
2156 }
2157
2158 memcpy(tmp.map, pData, dataSize);
2159 copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4);
2160 }
2161
2162 VKAPI_ATTR void VKAPI_CALL
tu_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize fillSize, uint32_t data)2163 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
2164 VkBuffer dstBuffer,
2165 VkDeviceSize dstOffset,
2166 VkDeviceSize fillSize,
2167 uint32_t data)
2168 {
2169 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2170 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2171 const struct blit_ops *ops = &r2d_ops;
2172 struct tu_cs *cs = &cmd->cs;
2173
2174 if (fillSize == VK_WHOLE_SIZE)
2175 fillSize = buffer->size - dstOffset;
2176
2177 uint64_t dst_va = buffer->iova + dstOffset;
2178 uint32_t blocks = fillSize / 4;
2179
2180 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2181 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2182 VK_SAMPLE_COUNT_1_BIT);
2183 ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
2184
2185 while (blocks) {
2186 uint32_t dst_x = (dst_va & 63) / 4;
2187 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
2188
2189 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
2190 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
2191 ops->run(cmd, cs);
2192
2193 dst_va += width * 4;
2194 blocks -= width;
2195 }
2196
2197 ops->teardown(cmd, cs);
2198 }
2199
2200 VKAPI_ATTR void VKAPI_CALL
tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, const VkResolveImageInfo2* pResolveImageInfo)2201 tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
2202 const VkResolveImageInfo2* pResolveImageInfo)
2203 {
2204 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2205 TU_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
2206 TU_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
2207 const struct blit_ops *ops = &r2d_ops;
2208 struct tu_cs *cs = &cmd->cs;
2209
2210 enum pipe_format src_format =
2211 tu_vk_format_to_pipe_format(src_image->vk.format);
2212 enum pipe_format dst_format =
2213 tu_vk_format_to_pipe_format(dst_image->vk.format);
2214 ops->setup(cmd, cs, src_format, dst_format,
2215 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
2216 VK_SAMPLE_COUNT_1_BIT);
2217
2218 for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
2219 const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
2220 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
2221
2222 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
2223 /* TODO: aspect masks possible ? */
2224
2225 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
2226
2227 struct fdl6_view dst, src;
2228 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
2229 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
2230
2231 for (uint32_t i = 0; i < layers; i++) {
2232 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2233 ops->dst(cs, &dst, i, src_format);
2234 ops->run(cmd, cs);
2235 }
2236 }
2237
2238 ops->teardown(cmd, cs);
2239 }
2240
2241 #define for_each_layer(layer, layer_mask, layers) \
2242 for (uint32_t layer = 0; \
2243 layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
2244 layer++) \
2245 if (!layer_mask || (layer_mask & BIT(layer)))
2246
2247 static void
resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_src_format, VkFormat vk_dst_format, const struct tu_image_view *src, const struct tu_image_view *dst, uint32_t layer_mask, uint32_t layers, const VkRect2D *rect, bool src_separate_ds, bool dst_separate_ds)2248 resolve_sysmem(struct tu_cmd_buffer *cmd,
2249 struct tu_cs *cs,
2250 VkFormat vk_src_format,
2251 VkFormat vk_dst_format,
2252 const struct tu_image_view *src,
2253 const struct tu_image_view *dst,
2254 uint32_t layer_mask,
2255 uint32_t layers,
2256 const VkRect2D *rect,
2257 bool src_separate_ds,
2258 bool dst_separate_ds)
2259 {
2260 const struct blit_ops *ops = &r2d_ops;
2261
2262 trace_start_sysmem_resolve(&cmd->trace, cs);
2263
2264 enum pipe_format src_format = tu_vk_format_to_pipe_format(vk_src_format);
2265 enum pipe_format dst_format = tu_vk_format_to_pipe_format(vk_dst_format);
2266
2267 ops->setup(cmd, cs, src_format, dst_format,
2268 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
2269 VK_SAMPLE_COUNT_1_BIT);
2270 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
2271
2272 for_each_layer(i, layer_mask, layers) {
2273 if (src_separate_ds) {
2274 if (vk_src_format == VK_FORMAT_D32_SFLOAT) {
2275 r2d_src_depth(cmd, cs, src, i, VK_FILTER_NEAREST);
2276 } else {
2277 r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
2278 }
2279 } else {
2280 ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
2281 }
2282
2283 if (dst_separate_ds) {
2284 if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
2285 ops->dst_depth(cs, dst, i);
2286 } else {
2287 ops->dst_stencil(cs, dst, i);
2288 }
2289 } else {
2290 ops->dst(cs, &dst->view, i, src_format);
2291 }
2292
2293 ops->run(cmd, cs);
2294 }
2295
2296 ops->teardown(cmd, cs);
2297
2298 trace_end_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
2299 }
2300
2301 void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *src, const struct tu_image_view *dst, uint32_t layer_mask, uint32_t layers, const VkRect2D *rect)2302 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
2303 struct tu_cs *cs,
2304 const struct tu_image_view *src,
2305 const struct tu_image_view *dst,
2306 uint32_t layer_mask,
2307 uint32_t layers,
2308 const VkRect2D *rect)
2309 {
2310 assert(src->image->vk.format == dst->image->vk.format ||
2311 (vk_format_is_depth_or_stencil(src->image->vk.format) &&
2312 vk_format_is_depth_or_stencil(dst->image->vk.format)));
2313
2314 bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
2315 bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
2316
2317 if (dst_separate_ds) {
2318 resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
2319 src, dst, layer_mask, layers, rect,
2320 src_separate_ds, dst_separate_ds);
2321 resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
2322 src, dst, layer_mask, layers, rect,
2323 src_separate_ds, dst_separate_ds);
2324 } else {
2325 resolve_sysmem(cmd, cs, src->image->vk.format, dst->image->vk.format,
2326 src, dst, layer_mask, layers, rect,
2327 src_separate_ds, dst_separate_ds);
2328 }
2329 }
2330
2331 static void
clear_image(struct tu_cmd_buffer *cmd, struct tu_image *image, const VkClearValue *clear_value, const VkImageSubresourceRange *range, VkImageAspectFlags aspect_mask)2332 clear_image(struct tu_cmd_buffer *cmd,
2333 struct tu_image *image,
2334 const VkClearValue *clear_value,
2335 const VkImageSubresourceRange *range,
2336 VkImageAspectFlags aspect_mask)
2337 {
2338 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
2339 uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
2340 struct tu_cs *cs = &cmd->cs;
2341 enum pipe_format format;
2342 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
2343 format = PIPE_FORMAT_R32_UINT;
2344 } else {
2345 format = tu6_plane_format(image->vk.format,
2346 tu6_plane_index(image->vk.format,
2347 aspect_mask));
2348 }
2349
2350 if (image->layout[0].depth0 > 1) {
2351 assert(layer_count == 1);
2352 assert(range->baseArrayLayer == 0);
2353 }
2354
2355 const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;
2356
2357 ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
2358 image->layout[0].nr_samples);
2359 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
2360 ops->clear_value(cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
2361 else
2362 ops->clear_value(cs, format, clear_value);
2363
2364 for (unsigned j = 0; j < level_count; j++) {
2365 if (image->layout[0].depth0 > 1)
2366 layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
2367
2368 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
2369 u_minify(image->layout[0].width0, range->baseMipLevel + j),
2370 u_minify(image->layout[0].height0, range->baseMipLevel + j)
2371 });
2372
2373 struct fdl6_view dst;
2374 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
2375 .aspectMask = aspect_mask,
2376 .mipLevel = range->baseMipLevel + j,
2377 .baseArrayLayer = range->baseArrayLayer,
2378 .layerCount = 1,
2379 }, 0, false);
2380
2381 for (uint32_t i = 0; i < layer_count; i++) {
2382 ops->dst(cs, &dst, i, format);
2383 ops->run(cmd, cs);
2384 }
2385 }
2386
2387 ops->teardown(cmd, cs);
2388 }
2389
2390 VKAPI_ATTR void VKAPI_CALL
tu_CmdClearColorImage(VkCommandBuffer commandBuffer, VkImage image_h, VkImageLayout imageLayout, const VkClearColorValue *pColor, uint32_t rangeCount, const VkImageSubresourceRange *pRanges)2391 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
2392 VkImage image_h,
2393 VkImageLayout imageLayout,
2394 const VkClearColorValue *pColor,
2395 uint32_t rangeCount,
2396 const VkImageSubresourceRange *pRanges)
2397 {
2398 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2399 TU_FROM_HANDLE(tu_image, image, image_h);
2400
2401 for (unsigned i = 0; i < rangeCount; i++)
2402 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
2403 }
2404
2405 VKAPI_ATTR void VKAPI_CALL
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, VkImage image_h, VkImageLayout imageLayout, const VkClearDepthStencilValue *pDepthStencil, uint32_t rangeCount, const VkImageSubresourceRange *pRanges)2406 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
2407 VkImage image_h,
2408 VkImageLayout imageLayout,
2409 const VkClearDepthStencilValue *pDepthStencil,
2410 uint32_t rangeCount,
2411 const VkImageSubresourceRange *pRanges)
2412 {
2413 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2414 TU_FROM_HANDLE(tu_image, image, image_h);
2415
2416 for (unsigned i = 0; i < rangeCount; i++) {
2417 const VkImageSubresourceRange *range = &pRanges[i];
2418
2419 if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2420 /* can't clear both depth and stencil at once, split up the aspect mask */
2421 u_foreach_bit(b, range->aspectMask)
2422 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
2423 continue;
2424 }
2425
2426 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
2427 }
2428
2429 tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges);
2430 }
2431
2432 static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, uint32_t attachment_count, const VkClearAttachment *attachments, uint32_t rect_count, const VkClearRect *rects)2433 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2434 uint32_t attachment_count,
2435 const VkClearAttachment *attachments,
2436 uint32_t rect_count,
2437 const VkClearRect *rects)
2438 {
2439 /* the shader path here is special, it avoids changing MRT/etc state */
2440 const struct tu_subpass *subpass = cmd->state.subpass;
2441 const uint32_t mrt_count = subpass->color_count;
2442 struct tu_cs *cs = &cmd->draw_cs;
2443 uint32_t clear_value[MAX_RTS][4];
2444 float z_clear_val = 0.0f;
2445 uint8_t s_clear_val = 0;
2446 uint32_t clear_rts = 0, clear_components = 0;
2447 bool z_clear = false;
2448 bool s_clear = false;
2449
2450 trace_start_sysmem_clear_all(&cmd->trace, cs);
2451
2452 for (uint32_t i = 0; i < attachment_count; i++) {
2453 uint32_t a;
2454 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2455 uint32_t c = attachments[i].colorAttachment;
2456 a = subpass->color_attachments[c].attachment;
2457 if (a == VK_ATTACHMENT_UNUSED)
2458 continue;
2459
2460 clear_rts |= 1 << c;
2461 clear_components |= 0xf << (c * 4);
2462 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2463 } else {
2464 a = subpass->depth_stencil_attachment.attachment;
2465 if (a == VK_ATTACHMENT_UNUSED)
2466 continue;
2467
2468 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2469 z_clear = true;
2470 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2471 }
2472
2473 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2474 s_clear = true;
2475 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2476 }
2477 }
2478 }
2479
2480 /* We may not know the multisample count if there are no attachments, so
2481 * just bail early to avoid corner cases later.
2482 */
2483 if (clear_rts == 0 && !z_clear && !s_clear)
2484 return;
2485
2486 /* disable all draw states so they don't interfere
2487 * TODO: use and re-use draw states
2488 * we have to disable draw states individually to preserve
2489 * input attachment states, because a secondary command buffer
2490 * won't be able to restore them
2491 */
2492 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
2493 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
2494 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
2495 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
2496 continue;
2497 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
2498 CP_SET_DRAW_STATE__0_DISABLE);
2499 tu_cs_emit_qw(cs, 0);
2500 }
2501 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2502
2503 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2504 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2505 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2506 0xfc000000);
2507 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2508
2509 r3d_common(cmd, cs, false, clear_rts, false, cmd->state.subpass->samples);
2510
2511 /* Disable sample counting in order to not affect occlusion query. */
2512 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
2513
2514 if (cmd->state.prim_generated_query_running_before_rp) {
2515 tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
2516 }
2517
2518 tu_cs_emit_regs(cs,
2519 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2520 tu_cs_emit_regs(cs,
2521 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2522
2523 tu_cs_emit_regs(cs,
2524 A6XX_RB_FS_OUTPUT_CNTL0(),
2525 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2526
2527 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2528 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2529 for (uint32_t i = 0; i < mrt_count; i++) {
2530 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2531 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2532 }
2533
2534 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
2535 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
2536
2537 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2538 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2539 .z_test_enable = z_clear,
2540 .z_write_enable = z_clear,
2541 .zfunc = FUNC_ALWAYS));
2542 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2543 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2544 .stencil_enable = s_clear,
2545 .func = FUNC_ALWAYS,
2546 .zpass = STENCIL_REPLACE));
2547 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2548 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2549 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2550
2551 unsigned num_rts = util_bitcount(clear_rts);
2552 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2553 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2554 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2555 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2556 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2557 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2558 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2559 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2560 u_foreach_bit(b, clear_rts)
2561 tu_cs_emit_array(cs, clear_value[b], 4);
2562
2563 for (uint32_t i = 0; i < rect_count; i++) {
2564 /* This should be true because of this valid usage for
2565 * vkCmdClearAttachments:
2566 *
2567 * "If the render pass instance this is recorded in uses multiview,
2568 * then baseArrayLayer must be zero and layerCount must be one"
2569 */
2570 assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
2571
2572 /* a630 doesn't support multiview masks, which means that we can't use
2573 * the normal multiview path without potentially recompiling a shader
2574 * on-demand or using a more complicated variant that takes the mask as
2575 * a const. Just use the layered path instead, since it shouldn't be
2576 * much worse.
2577 */
2578 for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
2579 r3d_coords_raw(cs, (float[]) {
2580 rects[i].rect.offset.x, rects[i].rect.offset.y,
2581 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2582 rects[i].rect.offset.x + rects[i].rect.extent.width,
2583 rects[i].rect.offset.y + rects[i].rect.extent.height,
2584 z_clear_val, 1.0f,
2585 });
2586 r3d_run_vis(cmd, cs);
2587 }
2588 }
2589
2590 /* Re-enable sample counting. */
2591 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
2592
2593 if (cmd->state.prim_generated_query_running_before_rp) {
2594 tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
2595 }
2596
2597 trace_end_sysmem_clear_all(&cmd->trace,
2598 cs, mrt_count, rect_count);
2599 }
2600
2601 static void
pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])2602 pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
2603 {
2604 switch (format) {
2605 case PIPE_FORMAT_Z24X8_UNORM:
2606 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
2607 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2608 val->depthStencil.stencil << 24;
2609 return;
2610 case PIPE_FORMAT_Z16_UNORM:
2611 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2612 return;
2613 case PIPE_FORMAT_Z32_FLOAT:
2614 clear_value[0] = fui(val->depthStencil.depth);
2615 return;
2616 case PIPE_FORMAT_S8_UINT:
2617 clear_value[0] = val->depthStencil.stencil;
2618 return;
2619 default:
2620 break;
2621 }
2622
2623 float tmp[4];
2624 memcpy(tmp, val->color.float32, 4 * sizeof(float));
2625 if (util_format_is_srgb(format)) {
2626 for (int i = 0; i < 3; i++)
2627 tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
2628 }
2629
2630 #define PACK_F(type) util_format_##type##_pack_rgba_float \
2631 ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
2632 switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
2633 case 4:
2634 PACK_F(r4g4b4a4_unorm);
2635 break;
2636 case 5:
2637 if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
2638 PACK_F(r5g6b5_unorm);
2639 else
2640 PACK_F(r5g5b5a1_unorm);
2641 break;
2642 case 8:
2643 if (util_format_is_snorm(format))
2644 PACK_F(r8g8b8a8_snorm);
2645 else if (util_format_is_unorm(format))
2646 PACK_F(r8g8b8a8_unorm);
2647 else
2648 pack_int8(clear_value, val->color.uint32);
2649 break;
2650 case 10:
2651 if (util_format_is_pure_integer(format))
2652 pack_int10_2(clear_value, val->color.uint32);
2653 else
2654 PACK_F(r10g10b10a2_unorm);
2655 break;
2656 case 11:
2657 clear_value[0] = float3_to_r11g11b10f(val->color.float32);
2658 break;
2659 case 16:
2660 if (util_format_is_snorm(format))
2661 PACK_F(r16g16b16a16_snorm);
2662 else if (util_format_is_unorm(format))
2663 PACK_F(r16g16b16a16_unorm);
2664 else if (util_format_is_float(format))
2665 PACK_F(r16g16b16a16_float);
2666 else
2667 pack_int16(clear_value, val->color.uint32);
2668 break;
2669 case 32:
2670 memcpy(clear_value, val->color.float32, 4 * sizeof(float));
2671 break;
2672 default:
2673 unreachable("unexpected channel size");
2674 }
2675 #undef PACK_F
2676 }
2677
2678 static void
clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, uint8_t clear_mask, uint32_t gmem_offset, const VkClearValue *value)2679 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2680 struct tu_cs *cs,
2681 enum pipe_format format,
2682 uint8_t clear_mask,
2683 uint32_t gmem_offset,
2684 const VkClearValue *value)
2685 {
2686 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2687 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
2688
2689 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
2690
2691 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2692 tu_cs_emit(cs, gmem_offset);
2693
2694 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2695 tu_cs_emit(cs, 0);
2696
2697 uint32_t clear_vals[4] = {};
2698 pack_gmem_clear_value(value, format, clear_vals);
2699
2700 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2701 tu_cs_emit_array(cs, clear_vals, 4);
2702
2703 tu6_emit_event_write(cmd, cs, BLIT);
2704 }
2705
2706 static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t attachment, VkImageAspectFlags mask, const VkClearValue *value)2707 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2708 struct tu_cs *cs,
2709 uint32_t attachment,
2710 VkImageAspectFlags mask,
2711 const VkClearValue *value)
2712 {
2713 const struct tu_render_pass_attachment *att =
2714 &cmd->state.pass->attachments[attachment];
2715
2716 trace_start_gmem_clear(&cmd->trace, cs);
2717
2718 enum pipe_format format = tu_vk_format_to_pipe_format(att->format);
2719 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2720 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2721 clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, tu_attachment_gmem_offset(cmd, att), value);
2722 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2723 clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, tu_attachment_gmem_offset_stencil(cmd, att), value);
2724 return;
2725 }
2726
2727 clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask),
2728 tu_attachment_gmem_offset(cmd, att), value);
2729
2730 trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
2731 }
2732
2733 static void
tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, uint32_t attachment_count, const VkClearAttachment *attachments, uint32_t rect_count, const VkClearRect *rects)2734 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2735 uint32_t attachment_count,
2736 const VkClearAttachment *attachments,
2737 uint32_t rect_count,
2738 const VkClearRect *rects)
2739 {
2740 const struct tu_subpass *subpass = cmd->state.subpass;
2741 struct tu_cs *cs = &cmd->draw_cs;
2742
2743 if (rect_count > 1)
2744 perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
2745
2746 for (unsigned i = 0; i < rect_count; i++) {
2747 unsigned x1 = rects[i].rect.offset.x;
2748 unsigned y1 = rects[i].rect.offset.y;
2749 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2750 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2751
2752 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2753 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2754 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2755
2756 for (unsigned j = 0; j < attachment_count; j++) {
2757 uint32_t a;
2758 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2759 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2760 else
2761 a = subpass->depth_stencil_attachment.attachment;
2762
2763 if (a == VK_ATTACHMENT_UNUSED)
2764 continue;
2765
2766 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2767 &attachments[j].clearValue);
2768 }
2769 }
2770 }
2771
2772 VKAPI_ATTR void VKAPI_CALL
tu_CmdClearAttachments(VkCommandBuffer commandBuffer, uint32_t attachmentCount, const VkClearAttachment *pAttachments, uint32_t rectCount, const VkClearRect *pRects)2773 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2774 uint32_t attachmentCount,
2775 const VkClearAttachment *pAttachments,
2776 uint32_t rectCount,
2777 const VkClearRect *pRects)
2778 {
2779 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2780 struct tu_cs *cs = &cmd->draw_cs;
2781
2782 /* sysmem path behaves like a draw, note we don't have a way of using different
2783 * flushes for sysmem/gmem, so this needs to be outside of the cond_exec
2784 */
2785 tu_emit_cache_flush_renderpass(cmd, cs);
2786
2787 for (uint32_t j = 0; j < attachmentCount; j++) {
2788 if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
2789 continue;
2790
2791 tu_lrz_disable_during_renderpass(cmd);
2792 }
2793
2794 /* vkCmdClearAttachments is supposed to respect the predicate if active. The
2795 * easiest way to do this is to always use the 3d path, which always works
2796 * even with GMEM because it's just a simple draw using the existing
2797 * attachment state.
2798 *
2799 * Similarly, we also use the 3D path when in a secondary command buffer that
2800 * doesn't know the GMEM layout that will be chosen by the primary.
2801 */
2802 if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
2803 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2804 return;
2805 }
2806
2807 /* If we could skip tile load/stores based on any draws intersecting them at
2808 * binning time, then emit the clear as a 3D draw so that it contributes to
2809 * that visibility.
2810 */
2811 const struct tu_subpass *subpass = cmd->state.subpass;
2812 for (uint32_t i = 0; i < attachmentCount; i++) {
2813 uint32_t a;
2814 if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2815 uint32_t c = pAttachments[i].colorAttachment;
2816 a = subpass->color_attachments[c].attachment;
2817 } else {
2818 a = subpass->depth_stencil_attachment.attachment;
2819 }
2820 if (a != VK_ATTACHMENT_UNUSED) {
2821 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
2822 if (att->cond_load_allowed || att->cond_store_allowed) {
2823 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2824 return;
2825 }
2826 }
2827 }
2828
2829 /* Otherwise, emit 2D blits for gmem rendering. */
2830 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2831 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2832 tu_cond_exec_end(cs);
2833
2834 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2835 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2836 tu_cond_exec_end(cs);
2837 }
2838
2839 static void
clear_sysmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat vk_format, VkImageAspectFlags clear_mask, const VkClearValue *value, uint32_t a, bool separate_ds)2840 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2841 struct tu_cs *cs,
2842 VkFormat vk_format,
2843 VkImageAspectFlags clear_mask,
2844 const VkClearValue *value,
2845 uint32_t a,
2846 bool separate_ds)
2847 {
2848 enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
2849 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2850 const struct tu_image_view *iview = cmd->state.attachments[a];
2851 const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
2852 const struct blit_ops *ops = &r2d_ops;
2853 if (cmd->state.pass->attachments[a].samples > 1)
2854 ops = &r3d_ops;
2855
2856 trace_start_sysmem_clear(&cmd->trace, cs);
2857
2858 ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
2859 cmd->state.pass->attachments[a].samples);
2860 ops->coords(cs, &cmd->state.render_area.offset, NULL,
2861 &cmd->state.render_area.extent);
2862 ops->clear_value(cs, format, value);
2863
2864 for_each_layer(i, clear_views, fb->layers) {
2865 if (separate_ds) {
2866 if (vk_format == VK_FORMAT_D32_SFLOAT) {
2867 ops->dst_depth(cs, iview, i);
2868 } else {
2869 ops->dst_stencil(cs, iview, i);
2870 }
2871 } else {
2872 ops->dst(cs, &iview->view, i, format);
2873 }
2874 ops->run(cmd, cs);
2875 }
2876
2877 ops->teardown(cmd, cs);
2878
2879 trace_end_sysmem_clear(&cmd->trace, cs,
2880 vk_format, ops == &r3d_ops,
2881 cmd->state.pass->attachments[a].samples);
2882 }
2883
2884 void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, const VkClearValue *value)2885 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2886 struct tu_cs *cs,
2887 uint32_t a,
2888 const VkClearValue *value)
2889 {
2890 const struct tu_render_pass_attachment *attachment =
2891 &cmd->state.pass->attachments[a];
2892
2893 if (!attachment->clear_mask)
2894 return;
2895
2896 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2897 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2898 clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
2899 value, a, true);
2900 }
2901 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2902 clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
2903 value, a, true);
2904 }
2905 } else {
2906 clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
2907 value, a, false);
2908 }
2909
2910 /* The spec doesn't explicitly say, but presumably the initial renderpass
2911 * clear is considered part of the renderpass, and therefore barriers
2912 * aren't required inside the subpass/renderpass. Therefore we need to
2913 * flush CCU color into CCU depth here, just like with
2914 * vkCmdClearAttachments(). Note that because this only happens at the
2915 * beginning of a renderpass, and renderpass writes are considered
2916 * "incoherent", we shouldn't have to worry about syncing depth into color
2917 * beforehand as depth should already be flushed.
2918 */
2919 if (vk_format_is_depth_or_stencil(attachment->format)) {
2920 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2921 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
2922 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2923 } else {
2924 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2925 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2926 }
2927
2928 if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug)
2929 tu_cs_emit_wfi(cs);
2930 }
2931
2932 void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, const VkClearValue *value)2933 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2934 struct tu_cs *cs,
2935 uint32_t a,
2936 const VkClearValue *value)
2937 {
2938 const struct tu_render_pass_attachment *attachment =
2939 &cmd->state.pass->attachments[a];
2940
2941 if (!attachment->clear_mask)
2942 return;
2943
2944 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2945
2946 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask, value);
2947 }
2948
2949 static void
tu_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, const struct tu_render_pass_attachment *attachment, bool resolve, bool separate_stencil)2950 tu_emit_blit(struct tu_cmd_buffer *cmd,
2951 struct tu_cs *cs,
2952 const struct tu_image_view *iview,
2953 const struct tu_render_pass_attachment *attachment,
2954 bool resolve,
2955 bool separate_stencil)
2956 {
2957 tu_cs_emit_regs(cs,
2958 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2959
2960 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2961 .unk0 = !resolve,
2962 .gmem = !resolve,
2963 .sample_0 = vk_format_is_int(attachment->format) ||
2964 vk_format_is_depth_or_stencil(attachment->format)));
2965
2966 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2967 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2968 if (!separate_stencil) {
2969 tu_cs_emit(cs, tu_image_view_depth(iview, RB_BLIT_DST_INFO));
2970 tu_cs_emit_qw(cs, iview->depth_base_addr);
2971 tu_cs_emit(cs, iview->depth_PITCH);
2972
2973 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
2974 tu_cs_image_flag_ref(cs, &iview->view, 0);
2975 } else {
2976 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
2977 tu_cs_emit_qw(cs, iview->stencil_base_addr);
2978 tu_cs_emit(cs, iview->stencil_PITCH);
2979 }
2980 } else {
2981 tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO);
2982 tu_cs_image_ref_2d(cs, &iview->view, 0, false);
2983
2984 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
2985 tu_cs_image_flag_ref(cs, &iview->view, 0);
2986 }
2987
2988 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
2989 tu_cs_emit_regs(cs,
2990 A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment)));
2991 } else {
2992 tu_cs_emit_regs(cs,
2993 A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment)));
2994 }
2995
2996 tu6_emit_event_write(cmd, cs, BLIT);
2997 }
2998
2999 static bool
blit_can_resolve(VkFormat format)3000 blit_can_resolve(VkFormat format)
3001 {
3002 const struct util_format_description *desc = vk_format_description(format);
3003
3004 /* blit event can only do resolve for simple cases:
3005 * averaging samples as unsigned integers or choosing only one sample
3006 */
3007 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
3008 return false;
3009
3010 /* can't do formats with larger channel sizes
3011 * note: this includes all float formats
3012 * note2: single channel integer formats seem OK
3013 */
3014 if (desc->channel[0].size > 10)
3015 return false;
3016
3017 switch (format) {
3018 /* for unknown reasons blit event can't msaa resolve these formats when tiled
3019 * likely related to these formats having different layout from other cpp=2 formats
3020 */
3021 case VK_FORMAT_R8G8_UNORM:
3022 case VK_FORMAT_R8G8_UINT:
3023 case VK_FORMAT_R8G8_SINT:
3024 /* TODO: this one should be able to work? */
3025 case VK_FORMAT_D24_UNORM_S8_UINT:
3026 return false;
3027 default:
3028 break;
3029 }
3030
3031 return true;
3032 }
3033
3034 static void
tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool load)3035 tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
3036 struct tu_cs *cs, bool load)
3037 {
3038 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
3039
3040 if (!unlikely(cmd->device->physical_device->instance->debug_flags &
3041 TU_DEBUG_LOG_SKIP_GMEM_OPS))
3042 return;
3043
3044 uint64_t result_iova;
3045 if (load)
3046 result_iova = global_iova(cmd, dbg_gmem_taken_loads);
3047 else
3048 result_iova = global_iova(cmd, dbg_gmem_taken_stores);
3049
3050 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
3051 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
3052 tu_cs_emit_qw(cs, result_iova);
3053 tu_cs_emit_qw(cs, result_iova);
3054 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
3055 }
3056
3057 static void
tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool load)3058 tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
3059 struct tu_cs *cs, bool load)
3060 {
3061 tu_cond_exec_end(cs);
3062
3063 if (!unlikely(cmd->device->physical_device->instance->debug_flags &
3064 TU_DEBUG_LOG_SKIP_GMEM_OPS))
3065 return;
3066
3067 uint64_t result_iova;
3068 if (load)
3069 result_iova = global_iova(cmd, dbg_gmem_total_loads);
3070 else
3071 result_iova = global_iova(cmd, dbg_gmem_total_stores);
3072
3073 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
3074 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
3075 tu_cs_emit_qw(cs, result_iova);
3076 tu_cs_emit_qw(cs, result_iova);
3077 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
3078 }
3079
3080 void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, bool cond_exec_allowed, bool force_load)3081 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
3082 struct tu_cs *cs,
3083 uint32_t a,
3084 bool cond_exec_allowed,
3085 bool force_load)
3086 {
3087 const struct tu_image_view *iview = cmd->state.attachments[a];
3088 const struct tu_render_pass_attachment *attachment =
3089 &cmd->state.pass->attachments[a];
3090
3091 bool load_common = attachment->load || force_load;
3092 bool load_stencil =
3093 attachment->load_stencil ||
3094 (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
3095
3096 if (!load_common && !load_stencil)
3097 return;
3098
3099 trace_start_gmem_load(&cmd->trace, cs);
3100
3101 /* If attachment will be cleared by vkCmdClearAttachments - it is likely
3102 * that it would be partially cleared, and since it is done by 2d blit
3103 * it doesn't produce geometry, so we have to unconditionally load.
3104 *
3105 * To simplify conditions treat partially cleared separate DS as fully
3106 * cleared and don't emit cond_exec.
3107 */
3108 bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
3109 if (cond_exec)
3110 tu_begin_load_store_cond_exec(cmd, cs, true);
3111
3112 if (load_common)
3113 tu_emit_blit(cmd, cs, iview, attachment, false, false);
3114
3115 if (load_stencil)
3116 tu_emit_blit(cmd, cs, iview, attachment, false, true);
3117
3118 if (cond_exec)
3119 tu_end_load_store_cond_exec(cmd, cs, true);
3120
3121 trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
3122 }
3123
3124 static void
store_cp_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, uint32_t samples, bool separate_stencil, enum pipe_format src_format, enum pipe_format dst_format, uint32_t gmem_offset, uint32_t cpp)3125 store_cp_blit(struct tu_cmd_buffer *cmd,
3126 struct tu_cs *cs,
3127 const struct tu_image_view *iview,
3128 uint32_t samples,
3129 bool separate_stencil,
3130 enum pipe_format src_format,
3131 enum pipe_format dst_format,
3132 uint32_t gmem_offset,
3133 uint32_t cpp)
3134 {
3135 r2d_setup_common(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
3136 iview->view.ubwc_enabled, true);
3137
3138 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3139 if (!separate_stencil) {
3140 r2d_dst_depth(cs, iview, 0);
3141 } else {
3142 r2d_dst_stencil(cs, iview, 0);
3143 }
3144 } else {
3145 r2d_dst(cs, &iview->view, 0, src_format);
3146 }
3147
3148 enum a6xx_format fmt = tu6_format_texture(src_format, TILE6_2).fmt;
3149 fixup_src_format(&src_format, dst_format, &fmt);
3150
3151 tu_cs_emit_regs(cs,
3152 A6XX_SP_PS_2D_SRC_INFO(
3153 .color_format = fmt,
3154 .color_swap = WZYX,
3155 .tile_mode = TILE6_2,
3156 .srgb = util_format_is_srgb(src_format),
3157 .samples = tu_msaa_samples(samples),
3158 .samples_average = !util_format_is_pure_integer(dst_format) &&
3159 !util_format_is_depth_or_stencil(dst_format),
3160 .unk20 = 1,
3161 .unk22 = 1),
3162 /* note: src size does not matter when not scaling */
3163 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
3164 A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),
3165 A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.tiling->tile0.width * cpp));
3166
3167 /* sync GMEM writes with CACHE. */
3168 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
3169
3170 /* Wait for CACHE_INVALIDATE to land */
3171 tu_cs_emit_wfi(cs);
3172
3173 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
3174 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
3175
3176 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
3177 * sysmem, and we generally assume that GMEM renderpasses leave their
3178 * results in sysmem, so we need to flush manually here.
3179 */
3180 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
3181 }
3182
3183 static void
store_3d_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, uint32_t dst_samples, bool separate_stencil, enum pipe_format src_format, enum pipe_format dst_format, const VkRect2D *render_area, uint32_t gmem_offset, uint32_t cpp)3184 store_3d_blit(struct tu_cmd_buffer *cmd,
3185 struct tu_cs *cs,
3186 const struct tu_image_view *iview,
3187 uint32_t dst_samples,
3188 bool separate_stencil,
3189 enum pipe_format src_format,
3190 enum pipe_format dst_format,
3191 const VkRect2D *render_area,
3192 uint32_t gmem_offset,
3193 uint32_t cpp)
3194 {
3195 /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
3196 * aren't set until we know whether we're HW binning or not, and we want to
3197 * avoid a dependence on that here to be able to store attachments before
3198 * the end of the renderpass in the future. Use the scratch space to
3199 * save/restore them dynamically.
3200 */
3201 tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
3202 tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
3203 CP_REG_TO_SCRATCH_0_SCRATCH(0) |
3204 CP_REG_TO_SCRATCH_0_CNT(1 - 1));
3205
3206 r3d_setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
3207 iview->view.ubwc_enabled, dst_samples);
3208
3209 r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
3210
3211 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3212 if (!separate_stencil) {
3213 r3d_dst_depth(cs, iview, 0);
3214 } else {
3215 r3d_dst_stencil(cs, iview, 0);
3216 }
3217 } else {
3218 r3d_dst(cs, &iview->view, 0, src_format);
3219 }
3220
3221 r3d_src_gmem(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);
3222
3223 /* sync GMEM writes with CACHE. */
3224 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
3225
3226 /* Wait for CACHE_INVALIDATE to land */
3227 tu_cs_emit_wfi(cs);
3228
3229 r3d_run(cmd, cs);
3230
3231 r3d_teardown(cmd, cs);
3232
3233 /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
3234 * sysmem, and we generally assume that GMEM renderpasses leave their
3235 * results in sysmem, so we need to flush manually here. The 3d blit path
3236 * writes to depth images as a color RT, so there's no need to flush depth.
3237 */
3238 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
3239
3240 /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
3241 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
3242 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
3243 CP_SCRATCH_TO_REG_0_SCRATCH(0) |
3244 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
3245
3246 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
3247 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
3248 CP_SCRATCH_TO_REG_0_SCRATCH(0) |
3249 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
3250 }
3251
3252 static bool
tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)3253 tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
3254 {
3255 struct tu_physical_device *phys_dev = cmd->device->physical_device;
3256 const struct tu_image_view *iview = cmd->state.attachments[a];
3257 const VkRect2D *render_area = &cmd->state.render_area;
3258
3259 /* Unaligned store is incredibly rare in CTS, we have to force it to test. */
3260 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE))
3261 return true;
3262
3263 uint32_t x1 = render_area->offset.x;
3264 uint32_t y1 = render_area->offset.y;
3265 uint32_t x2 = x1 + render_area->extent.width;
3266 uint32_t y2 = y1 + render_area->extent.height;
3267 /* x2/y2 can be unaligned if equal to the size of the image, since it will
3268 * write into padding space. The one exception is linear levels which don't
3269 * have the required y padding in the layout (except for the last level)
3270 */
3271 bool need_y2_align =
3272 y2 != iview->view.height || iview->view.need_y2_align;
3273
3274 return (x1 % phys_dev->info->gmem_align_w ||
3275 (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
3276 y1 % phys_dev->info->gmem_align_h ||
3277 (y2 % phys_dev->info->gmem_align_h && need_y2_align));
3278 }
3279
3280 /* Choose the GMEM layout (use the CCU space or not) based on whether the
3281 * current attachments will need. This has to happen at vkBeginRenderPass()
3282 * time because tu_attachment_store_unaligned() looks at the image views, which
3283 * are only available at that point. This should match the logic for the
3284 * !unaligned case in tu_store_gmem_attachment().
3285 */
3286 void
tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)3287 tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
3288 {
3289 cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
3290
3291 for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
3292 if (!cmd->state.attachments[i])
3293 continue;
3294
3295 struct tu_render_pass_attachment *att =
3296 &cmd->state.pass->attachments[i];
3297 if ((att->store || att->store_stencil) &&
3298 tu_attachment_store_unaligned(cmd, i))
3299 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
3300 if (att->will_be_resolved && !blit_can_resolve(att->format))
3301 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
3302 }
3303
3304 cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
3305 }
3306
3307 void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, uint32_t gmem_a, bool cond_exec_allowed)3308 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
3309 struct tu_cs *cs,
3310 uint32_t a,
3311 uint32_t gmem_a,
3312 bool cond_exec_allowed)
3313 {
3314 const VkRect2D *render_area = &cmd->state.render_area;
3315 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
3316 const struct tu_image_view *iview = cmd->state.attachments[a];
3317 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
3318
3319 if (!dst->store && !dst->store_stencil)
3320 return;
3321
3322 /* Unconditional store should happen only if attachment was cleared,
3323 * which could have happened either by load_op or via vkCmdClearAttachments.
3324 */
3325 bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
3326 if (cond_exec) {
3327 tu_begin_load_store_cond_exec(cmd, cs, false);
3328 }
3329
3330 bool unaligned = tu_attachment_store_unaligned(cmd, a);
3331
3332 /* D32_SFLOAT_S8_UINT is quite special format: it has two planes,
3333 * one for depth and other for stencil. When resolving a MSAA
3334 * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
3335 */
3336 bool resolve_d32s8_s8 =
3337 src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
3338 dst->format == VK_FORMAT_S8_UINT;
3339
3340 /* The fast path doesn't support picking out the last component of a D24S8
3341 * texture reinterpreted as RGBA8_UNORM.
3342 */
3343 bool resolve_d24s8_s8 =
3344 src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
3345 dst->format == VK_FORMAT_S8_UINT;
3346
3347 bool store_common = dst->store && !resolve_d32s8_s8;
3348 bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;
3349
3350 trace_start_gmem_store(&cmd->trace, cs);
3351
3352 /* use fast path when render area is aligned, except for unsupported resolve cases */
3353 if (!unaligned && !resolve_d24s8_s8 &&
3354 (a == gmem_a || blit_can_resolve(dst->format))) {
3355 if (store_common)
3356 tu_emit_blit(cmd, cs, iview, src, true, false);
3357 if (store_separate_stencil)
3358 tu_emit_blit(cmd, cs, iview, src, true, true);
3359
3360 if (cond_exec) {
3361 tu_end_load_store_cond_exec(cmd, cs, false);
3362 }
3363
3364 trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
3365 return;
3366 }
3367
3368 assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);
3369
3370 enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format);
3371 if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
3372 src_format = PIPE_FORMAT_Z32_FLOAT;
3373
3374 enum pipe_format dst_format = tu_vk_format_to_pipe_format(dst->format);
3375 if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
3376 dst_format = PIPE_FORMAT_Z32_FLOAT;
3377
3378 if (dst->samples > 1) {
3379 /* If we hit this path, we have to disable draw states after every tile
3380 * instead of once at the end of the renderpass, so that they aren't
3381 * executed when calling CP_DRAW.
3382 *
3383 * TODO: store a flag somewhere so we don't do this more than once and
3384 * don't do it after the renderpass when this happens.
3385 */
3386 if (store_common || store_separate_stencil)
3387 tu_disable_draw_states(cmd, cs);
3388
3389 if (store_common) {
3390 store_3d_blit(cmd, cs, iview, dst->samples, false, src_format,
3391 dst_format, render_area, tu_attachment_gmem_offset(cmd, src), src->cpp);
3392 }
3393 if (store_separate_stencil) {
3394 store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
3395 PIPE_FORMAT_S8_UINT, render_area,
3396 tu_attachment_gmem_offset_stencil(cmd, src), src->samples);
3397 }
3398 } else {
3399 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
3400
3401 if (store_common) {
3402 store_cp_blit(cmd, cs, iview, src->samples, false, src_format,
3403 dst_format, tu_attachment_gmem_offset(cmd, src), src->cpp);
3404 }
3405 if (store_separate_stencil) {
3406 store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
3407 PIPE_FORMAT_S8_UINT, tu_attachment_gmem_offset_stencil(cmd, src), src->samples);
3408 }
3409 }
3410
3411 if (cond_exec) {
3412 tu_end_load_store_cond_exec(cmd, cs, false);
3413 }
3414
3415 trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
3416 }
3417