1/* 2 * Copyright 2019-2020 Valve Corporation 3 * SPDX-License-Identifier: MIT 4 * 5 * Authors: 6 * Jonathan Marek <jonathan@marek.ca> 7 */ 8 9#include "tu_clear_blit.h" 10 11#include "ir3/ir3_nir.h" 12 13#include "util/format_r11g11b10f.h" 14#include "util/format_rgb9e5.h" 15#include "util/format_srgb.h" 16#include "util/half_float.h" 17#include "compiler/nir/nir_builder.h" 18 19#include "tu_cmd_buffer.h" 20#include "tu_cs.h" 21#include "tu_formats.h" 22#include "tu_image.h" 23#include "tu_tracepoints.h" 24 25static uint32_t 26tu_pack_float32_for_unorm(float val, int bits) 27{ 28 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1)); 29} 30 31/* r2d_ = BLIT_OP_SCALE operations */ 32 33static enum a6xx_2d_ifmt 34format_to_ifmt(enum pipe_format format) 35{ 36 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT || 37 format == PIPE_FORMAT_Z24X8_UNORM) 38 return R2D_UNORM8; 39 40 /* get_component_bits doesn't work with depth/stencil formats: */ 41 if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT) 42 return R2D_FLOAT32; 43 if (format == PIPE_FORMAT_S8_UINT) 44 return R2D_INT8; 45 if (format == PIPE_FORMAT_A8_UNORM) 46 return R2D_UNORM8; 47 48 /* use the size of the red channel to find the corresponding "ifmt" */ 49 bool is_int = util_format_is_pure_integer(format); 50 switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) { 51 case 4: case 5: case 8: 52 return is_int ? R2D_INT8 : R2D_UNORM8; 53 case 10: case 11: 54 return is_int ? R2D_INT16 : R2D_FLOAT16; 55 case 16: 56 if (util_format_is_float(format)) 57 return R2D_FLOAT16; 58 return is_int ? R2D_INT16 : R2D_FLOAT32; 59 case 32: 60 return is_int ? R2D_INT32 : R2D_FLOAT32; 61 default: 62 unreachable("bad format"); 63 return 0; 64 } 65} 66 67static void 68r2d_coords(struct tu_cs *cs, 69 const VkOffset2D *dst, 70 const VkOffset2D *src, 71 const VkExtent2D *extent) 72{ 73 tu_cs_emit_regs(cs, 74 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y), 75 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1)); 76 77 if (!src) 78 return; 79 80 tu_cs_emit_regs(cs, 81 A6XX_GRAS_2D_SRC_TL_X(src->x), 82 A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1), 83 A6XX_GRAS_2D_SRC_TL_Y(src->y), 84 A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1)); 85} 86 87static void 88r2d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val) 89{ 90 uint32_t clear_value[4] = {}; 91 92 switch (format) { 93 case PIPE_FORMAT_Z24_UNORM_S8_UINT: 94 case PIPE_FORMAT_Z24X8_UNORM: 95 /* cleared as r8g8b8a8_unorm using special format */ 96 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24); 97 clear_value[1] = clear_value[0] >> 8; 98 clear_value[2] = clear_value[0] >> 16; 99 clear_value[3] = val->depthStencil.stencil; 100 break; 101 case PIPE_FORMAT_Z16_UNORM: 102 case PIPE_FORMAT_Z32_FLOAT: 103 /* R2D_FLOAT32 */ 104 clear_value[0] = fui(val->depthStencil.depth); 105 break; 106 case PIPE_FORMAT_S8_UINT: 107 clear_value[0] = val->depthStencil.stencil; 108 break; 109 case PIPE_FORMAT_R9G9B9E5_FLOAT: 110 /* cleared as UINT32 */ 111 clear_value[0] = float3_to_rgb9e5(val->color.float32); 112 break; 113 default: 114 assert(!util_format_is_depth_or_stencil(format)); 115 const struct util_format_description *desc = util_format_description(format); 116 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format); 117 118 assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN || 119 format == PIPE_FORMAT_R11G11B10_FLOAT); 120 121 for (unsigned i = 0; i < desc->nr_channels; i++) { 122 const struct util_format_channel_description *ch = &desc->channel[i]; 123 if (ifmt == R2D_UNORM8) { 124 float linear = val->color.float32[i]; 125 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3) 126 linear = util_format_linear_to_srgb_float(val->color.float32[i]); 127 128 if (ch->type == UTIL_FORMAT_TYPE_SIGNED) 129 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f); 130 else 131 clear_value[i] = tu_pack_float32_for_unorm(linear, 8); 132 } else if (ifmt == R2D_FLOAT16) { 133 clear_value[i] = _mesa_float_to_half(val->color.float32[i]); 134 } else { 135 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 || 136 ifmt == R2D_INT16 || ifmt == R2D_INT8); 137 clear_value[i] = val->color.uint32[i]; 138 } 139 } 140 break; 141 } 142 143 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); 144 tu_cs_emit_array(cs, clear_value, 4); 145} 146 147static void 148fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format, 149 enum a6xx_format *fmt) 150{ 151 /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which 152 * is normally R8_UINT for sampling/blitting purposes, to a unorm format. 153 * We also have to move stencil, which is normally in the .w channel, into 154 * the right channel. Reintepreting the S8 texture as A8_UNORM solves both 155 * problems, and avoids using a swap, which seems to sometimes not work 156 * with a D24S8 source, or a texture swizzle which is only supported with 157 * the 3d path. Sometimes this blit happens on already-constructed 158 * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup. 159 */ 160 if (*src_format == PIPE_FORMAT_S8_UINT && 161 (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT || 162 dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) { 163 *fmt = FMT6_A8_UNORM; 164 *src_format = PIPE_FORMAT_A8_UNORM; 165 } 166} 167 168static void 169fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format, 170 enum a6xx_format *fmt) 171{ 172 if (*dst_format == PIPE_FORMAT_S8_UINT && 173 (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT || 174 src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) { 175 *dst_format = PIPE_FORMAT_A8_UNORM; 176 *fmt = FMT6_A8_UNORM; 177 } 178} 179 180static void 181r2d_src(struct tu_cmd_buffer *cmd, 182 struct tu_cs *cs, 183 const struct fdl6_view *iview, 184 uint32_t layer, 185 VkFilter filter, 186 enum pipe_format dst_format) 187{ 188 uint32_t src_info = iview->SP_PS_2D_SRC_INFO; 189 if (filter != VK_FILTER_NEAREST) 190 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER; 191 192 enum a6xx_format fmt = (src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK); 193 enum pipe_format src_format = iview->format; 194 fixup_src_format(&src_format, dst_format, &fmt); 195 196 src_info = 197 (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) | 198 A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt); 199 200 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); 201 tu_cs_emit(cs, src_info); 202 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE); 203 tu_cs_image_ref_2d(cs, iview, layer, true); 204 205 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3); 206 tu_cs_image_flag_ref(cs, iview, layer); 207} 208 209static void 210r2d_src_depth(struct tu_cmd_buffer *cmd, 211 struct tu_cs *cs, 212 const struct tu_image_view *iview, 213 uint32_t layer, 214 VkFilter filter) 215{ 216 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); 217 tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO)); 218 tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE); 219 tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer); 220 /* SP_PS_2D_SRC_PITCH has shifted pitch field */ 221 tu_cs_emit(cs, iview->depth_PITCH << 9); 222 223 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3); 224 tu_cs_image_flag_ref(cs, &iview->view, layer); 225} 226 227static void 228r2d_src_stencil(struct tu_cmd_buffer *cmd, 229 struct tu_cs *cs, 230 const struct tu_image_view *iview, 231 uint32_t layer, 232 VkFilter filter) 233{ 234 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); 235 tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS); 236 tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE); 237 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); 238 /* SP_PS_2D_SRC_PITCH has shifted pitch field */ 239 tu_cs_emit(cs, iview->stencil_PITCH << 9); 240} 241 242static void 243r2d_src_buffer(struct tu_cmd_buffer *cmd, 244 struct tu_cs *cs, 245 enum pipe_format format, 246 uint64_t va, uint32_t pitch, 247 uint32_t width, uint32_t height, 248 enum pipe_format dst_format) 249{ 250 struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR); 251 enum a6xx_format color_format = fmt.fmt; 252 fixup_src_format(&format, dst_format, &color_format); 253 254 tu_cs_emit_regs(cs, 255 A6XX_SP_PS_2D_SRC_INFO( 256 .color_format = color_format, 257 .color_swap = fmt.swap, 258 .srgb = util_format_is_srgb(format), 259 .unk20 = 1, 260 .unk22 = 1), 261 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height), 262 A6XX_SP_PS_2D_SRC(.qword = va), 263 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch)); 264} 265 266static void 267r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, 268 enum pipe_format src_format) 269{ 270 uint32_t dst_info = iview->RB_2D_DST_INFO; 271 enum a6xx_format fmt = dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK; 272 enum pipe_format dst_format = iview->format; 273 fixup_dst_format(src_format, &dst_format, &fmt); 274 275 dst_info = 276 (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt; 277 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); 278 tu_cs_emit(cs, dst_info); 279 tu_cs_image_ref_2d(cs, iview, layer, false); 280 281 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3); 282 tu_cs_image_flag_ref(cs, iview, layer); 283} 284 285static void 286r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) 287{ 288 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); 289 tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO)); 290 tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer); 291 tu_cs_emit(cs, iview->depth_PITCH); 292 293 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3); 294 tu_cs_image_flag_ref(cs, &iview->view, layer); 295} 296 297static void 298r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) 299{ 300 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); 301 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS); 302 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); 303 tu_cs_emit(cs, iview->stencil_PITCH); 304} 305 306static void 307r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, 308 enum pipe_format src_format) 309{ 310 struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR); 311 enum a6xx_format color_fmt = fmt.fmt; 312 fixup_dst_format(src_format, &format, &color_fmt); 313 fmt.fmt = color_fmt; 314 315 tu_cs_emit_regs(cs, 316 A6XX_RB_2D_DST_INFO( 317 .color_format = fmt.fmt, 318 .color_swap = fmt.swap, 319 .srgb = util_format_is_srgb(format)), 320 A6XX_RB_2D_DST(.qword = va), 321 A6XX_RB_2D_DST_PITCH(pitch)); 322} 323 324static void 325r2d_setup_common(struct tu_cmd_buffer *cmd, 326 struct tu_cs *cs, 327 enum pipe_format src_format, 328 enum pipe_format dst_format, 329 VkImageAspectFlags aspect_mask, 330 unsigned blit_param, 331 bool clear, 332 bool ubwc, 333 bool scissor) 334{ 335 enum a6xx_format fmt = tu6_base_format(dst_format); 336 fixup_dst_format(src_format, &dst_format, &fmt); 337 enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format); 338 339 uint32_t unknown_8c01 = 0; 340 341 if ((dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT || 342 dst_format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) { 343 fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; 344 } 345 346 /* note: the only format with partial clearing is D24S8 */ 347 if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { 348 /* preserve stencil channel */ 349 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) 350 unknown_8c01 = 0x08000041; 351 /* preserve depth channels */ 352 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) 353 unknown_8c01 = 0x00084001; 354 } 355 356 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); 357 tu_cs_emit(cs, unknown_8c01); 358 359 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL( 360 .scissor = scissor, 361 .rotate = blit_param, 362 .solid_color = clear, 363 .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear, 364 .color_format = fmt, 365 .mask = 0xf, 366 .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt, 367 ).value; 368 369 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); 370 tu_cs_emit(cs, blit_cntl); 371 372 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); 373 tu_cs_emit(cs, blit_cntl); 374 375 if (fmt == FMT6_10_10_10_2_UNORM_DEST) 376 fmt = FMT6_16_16_16_16_FLOAT; 377 378 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT( 379 .sint = util_format_is_pure_sint(dst_format), 380 .uint = util_format_is_pure_uint(dst_format), 381 .color_format = fmt, 382 .srgb = util_format_is_srgb(dst_format), 383 .mask = 0xf)); 384} 385 386static void 387r2d_setup(struct tu_cmd_buffer *cmd, 388 struct tu_cs *cs, 389 enum pipe_format src_format, 390 enum pipe_format dst_format, 391 VkImageAspectFlags aspect_mask, 392 unsigned blit_param, 393 bool clear, 394 bool ubwc, 395 VkSampleCountFlagBits samples) 396{ 397 assert(samples == VK_SAMPLE_COUNT_1_BIT); 398 399 if (!cmd->state.pass) { 400 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); 401 } 402 403 r2d_setup_common(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false); 404} 405 406static void 407r2d_teardown(struct tu_cmd_buffer *cmd, 408 struct tu_cs *cs) 409{ 410 /* nothing to do here */ 411} 412 413static void 414r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 415{ 416 tu_cs_emit_pkt7(cs, CP_BLIT, 1); 417 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); 418} 419 420/* r3d_ = shader path operations */ 421 422static nir_ssa_def * 423load_const(nir_builder *b, unsigned base, unsigned components) 424{ 425 return nir_load_uniform(b, components, 32, nir_imm_int(b, 0), 426 .base = base); 427} 428 429static nir_shader * 430build_blit_vs_shader(void) 431{ 432 nir_builder _b = 433 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs"); 434 nir_builder *b = &_b; 435 436 nir_variable *out_pos = 437 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 438 "gl_Position"); 439 out_pos->data.location = VARYING_SLOT_POS; 440 441 nir_ssa_def *vert0_pos = load_const(b, 0, 2); 442 nir_ssa_def *vert1_pos = load_const(b, 4, 2); 443 nir_ssa_def *vertex = nir_load_vertex_id(b); 444 445 nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos); 446 pos = nir_vec4(b, nir_channel(b, pos, 0), 447 nir_channel(b, pos, 1), 448 nir_imm_float(b, 0.0), 449 nir_imm_float(b, 1.0)); 450 451 nir_store_var(b, out_pos, pos, 0xf); 452 453 nir_variable *out_coords = 454 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3), 455 "coords"); 456 out_coords->data.location = VARYING_SLOT_VAR0; 457 458 nir_ssa_def *vert0_coords = load_const(b, 2, 2); 459 nir_ssa_def *vert1_coords = load_const(b, 6, 2); 460 461 /* Only used with "z scale" blit path which uses a 3d texture */ 462 nir_ssa_def *z_coord = load_const(b, 8, 1); 463 464 nir_ssa_def *coords = nir_bcsel(b, nir_i2b1(b, vertex), vert1_coords, vert0_coords); 465 coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1), 466 z_coord); 467 468 nir_store_var(b, out_coords, coords, 0x7); 469 470 return b->shader; 471} 472 473static nir_shader * 474build_clear_vs_shader(void) 475{ 476 nir_builder _b = 477 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs"); 478 nir_builder *b = &_b; 479 480 nir_variable *out_pos = 481 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 482 "gl_Position"); 483 out_pos->data.location = VARYING_SLOT_POS; 484 485 nir_ssa_def *vert0_pos = load_const(b, 0, 2); 486 nir_ssa_def *vert1_pos = load_const(b, 4, 2); 487 /* c0.z is used to clear depth */ 488 nir_ssa_def *depth = load_const(b, 2, 1); 489 nir_ssa_def *vertex = nir_load_vertex_id(b); 490 491 nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos); 492 pos = nir_vec4(b, nir_channel(b, pos, 0), 493 nir_channel(b, pos, 1), 494 depth, nir_imm_float(b, 1.0)); 495 496 nir_store_var(b, out_pos, pos, 0xf); 497 498 nir_variable *out_layer = 499 nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(), 500 "gl_Layer"); 501 out_layer->data.location = VARYING_SLOT_LAYER; 502 nir_ssa_def *layer = load_const(b, 3, 1); 503 nir_store_var(b, out_layer, layer, 1); 504 505 return b->shader; 506} 507 508static nir_shader * 509build_blit_fs_shader(bool zscale) 510{ 511 nir_builder _b = 512 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, 513 zscale ? "zscale blit fs" : "blit fs"); 514 nir_builder *b = &_b; 515 516 nir_variable *out_color = 517 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 518 "color0"); 519 out_color->data.location = FRAG_RESULT_DATA0; 520 521 unsigned coord_components = zscale ? 3 : 2; 522 nir_variable *in_coords = 523 nir_variable_create(b->shader, nir_var_shader_in, 524 glsl_vec_type(coord_components), 525 "coords"); 526 in_coords->data.location = VARYING_SLOT_VAR0; 527 528 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); 529 /* Note: since we're just copying data, we rely on the HW ignoring the 530 * dest_type. 531 */ 532 tex->dest_type = nir_type_int32; 533 tex->is_array = false; 534 tex->is_shadow = false; 535 tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D; 536 537 tex->texture_index = 0; 538 tex->sampler_index = 0; 539 540 b->shader->info.num_textures = 1; 541 BITSET_SET(b->shader->info.textures_used, 0); 542 543 tex->src[0].src_type = nir_tex_src_coord; 544 tex->src[0].src = nir_src_for_ssa(nir_load_var(b, in_coords)); 545 tex->coord_components = coord_components; 546 547 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); 548 nir_builder_instr_insert(b, &tex->instr); 549 550 nir_store_var(b, out_color, &tex->dest.ssa, 0xf); 551 552 return b->shader; 553} 554 555/* We can only read multisample textures via txf_ms, so we need a separate 556 * variant for them. 557 */ 558static nir_shader * 559build_ms_copy_fs_shader(void) 560{ 561 nir_builder _b = 562 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, 563 "multisample copy fs"); 564 nir_builder *b = &_b; 565 566 nir_variable *out_color = 567 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 568 "color0"); 569 out_color->data.location = FRAG_RESULT_DATA0; 570 571 nir_variable *in_coords = 572 nir_variable_create(b->shader, nir_var_shader_in, 573 glsl_vec_type(2), 574 "coords"); 575 in_coords->data.location = VARYING_SLOT_VAR0; 576 577 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2); 578 579 tex->op = nir_texop_txf_ms; 580 581 /* Note: since we're just copying data, we rely on the HW ignoring the 582 * dest_type. 583 */ 584 tex->dest_type = nir_type_int32; 585 tex->is_array = false; 586 tex->is_shadow = false; 587 tex->sampler_dim = GLSL_SAMPLER_DIM_MS; 588 589 tex->texture_index = 0; 590 tex->sampler_index = 0; 591 592 b->shader->info.num_textures = 1; 593 BITSET_SET(b->shader->info.textures_used, 0); 594 BITSET_SET(b->shader->info.textures_used_by_txf, 0); 595 596 nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, in_coords)); 597 598 tex->src[0].src_type = nir_tex_src_coord; 599 tex->src[0].src = nir_src_for_ssa(coord); 600 tex->coord_components = 2; 601 602 tex->src[1].src_type = nir_tex_src_ms_index; 603 tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b)); 604 605 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); 606 nir_builder_instr_insert(b, &tex->instr); 607 608 nir_store_var(b, out_color, &tex->dest.ssa, 0xf); 609 610 return b->shader; 611} 612 613static nir_shader * 614build_clear_fs_shader(unsigned mrts) 615{ 616 nir_builder _b = 617 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, 618 "mrt%u clear fs", mrts); 619 nir_builder *b = &_b; 620 621 for (unsigned i = 0; i < mrts; i++) { 622 nir_variable *out_color = 623 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(), 624 "color"); 625 out_color->data.location = FRAG_RESULT_DATA0 + i; 626 627 nir_ssa_def *color = load_const(b, 4 * i, 4); 628 nir_store_var(b, out_color, color, 0xf); 629 } 630 631 return b->shader; 632} 633 634static void 635compile_shader(struct tu_device *dev, struct nir_shader *nir, 636 unsigned consts, unsigned *offset, enum global_shader idx) 637{ 638 nir->options = ir3_get_compiler_options(dev->compiler); 639 640 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage); 641 nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage); 642 643 ir3_finalize_nir(dev->compiler, nir); 644 645 struct ir3_shader *sh = 646 ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) { 647 .api_wavesize = IR3_SINGLE_OR_DOUBLE, 648 .real_wavesize = IR3_SINGLE_OR_DOUBLE, 649 .reserved_user_consts = align(consts, 4), 650 }, NULL); 651 652 struct ir3_shader_key key = {}; 653 bool created; 654 struct ir3_shader_variant *so = 655 ir3_shader_get_variant(sh, &key, false, false, &created); 656 657 struct tu6_global *global = dev->global_bo->map; 658 659 assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders)); 660 dev->global_shaders[idx] = sh; 661 dev->global_shader_variants[idx] = so; 662 memcpy(&global->shaders[*offset], so->bin, 663 sizeof(uint32_t) * so->info.sizedwords); 664 dev->global_shader_va[idx] = dev->global_bo->iova + 665 gb_offset(shaders[*offset]); 666 *offset += align(so->info.sizedwords, 32); 667} 668 669void 670tu_init_clear_blit_shaders(struct tu_device *dev) 671{ 672 unsigned offset = 0; 673 compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT); 674 compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR); 675 compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT); 676 compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE); 677 compile_shader(dev, build_ms_copy_fs_shader(), 0, &offset, GLOBAL_SH_FS_COPY_MS); 678 679 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) { 680 compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset, 681 GLOBAL_SH_FS_CLEAR0 + num_rts); 682 } 683} 684 685void 686tu_destroy_clear_blit_shaders(struct tu_device *dev) 687{ 688 for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) { 689 if (dev->global_shaders[i]) 690 ir3_shader_destroy(dev->global_shaders[i]); 691 } 692} 693 694static void 695r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, 696 uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples) 697{ 698 enum global_shader vs_id = 699 blit ? GLOBAL_SH_VS_BLIT : GLOBAL_SH_VS_CLEAR; 700 701 struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id]; 702 uint64_t vs_iova = cmd->device->global_shader_va[vs_id]; 703 704 enum global_shader fs_id = GLOBAL_SH_FS_BLIT; 705 706 if (z_scale) 707 fs_id = GLOBAL_SH_FS_BLIT_ZSCALE; 708 else if (samples != VK_SAMPLE_COUNT_1_BIT) 709 fs_id = GLOBAL_SH_FS_COPY_MS; 710 711 unsigned num_rts = util_bitcount(rts_mask); 712 if (!blit) 713 fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts; 714 715 struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id]; 716 uint64_t fs_iova = cmd->device->global_shader_va[fs_id]; 717 718 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( 719 .vs_state = true, 720 .hs_state = true, 721 .ds_state = true, 722 .gs_state = true, 723 .fs_state = true, 724 .cs_state = true, 725 .gfx_ibo = true, 726 .cs_ibo = true, 727 .gfx_shared_const = true, 728 .gfx_bindless = 0x1f, 729 .cs_bindless = 0x1f)); 730 731 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs); 732 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL); 733 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL); 734 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL); 735 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs); 736 737 struct tu_pvtmem_config pvtmem = {}; 738 tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova); 739 tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova); 740 741 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0()); 742 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0()); 743 744 if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) { 745 /* Copy what the blob does here. This will emit an extra 0x3f 746 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what 747 * this is working around yet. 748 */ 749 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3); 750 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE)); 751 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL); 752 tu_cs_emit(cs, 0); 753 } else { 754 tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL()); 755 } 756 tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL()); 757 758 tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs, 0); 759 760 /* REPL_MODE for varying with RECTLIST (2 vertices only) */ 761 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0)); 762 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0)); 763 764 tu6_emit_fs_inputs(cs, fs); 765 766 tu_cs_emit_regs(cs, 767 A6XX_GRAS_CL_CNTL( 768 .persp_division_disable = 1, 769 .vp_xform_disable = 1, 770 .vp_clip_code_ignore = 1, 771 .clip_disable = 1)); 772 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable? 773 774 tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL()); 775 tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107()); 776 777 tu_cs_emit_regs(cs, 778 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0), 779 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff)); 780 tu_cs_emit_regs(cs, 781 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0), 782 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff)); 783 784 tu_cs_emit_regs(cs, 785 A6XX_VFD_INDEX_OFFSET(), 786 A6XX_VFD_INSTANCE_START_OFFSET()); 787 788 if (rts_mask) { 789 unsigned rts_count = util_last_bit(rts_mask); 790 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count); 791 unsigned rt = 0; 792 for (unsigned i = 0; i < rts_count; i++) { 793 unsigned regid = 0; 794 if (rts_mask & (1u << i)) 795 regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++); 796 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid)); 797 } 798 } 799 800 cmd->state.line_mode = RECTANGULAR; 801 tu6_emit_msaa(cs, samples, cmd->state.line_mode); 802} 803 804static void 805r3d_coords_raw(struct tu_cs *cs, const float *coords) 806{ 807 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8); 808 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 809 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 810 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 811 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | 812 CP_LOAD_STATE6_0_NUM_UNIT(2)); 813 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 814 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 815 tu_cs_emit_array(cs, (const uint32_t *) coords, 8); 816} 817 818/* z coordinate for "z scale" blit path which uses a 3d texture */ 819static void 820r3d_coord_z(struct tu_cs *cs, float z) 821{ 822 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 4); 823 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) | 824 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 825 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 826 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | 827 CP_LOAD_STATE6_0_NUM_UNIT(1)); 828 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 829 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 830 tu_cs_emit(cs, fui(z)); 831 tu_cs_emit(cs, 0); 832 tu_cs_emit(cs, 0); 833 tu_cs_emit(cs, 0); 834} 835 836static void 837r3d_coords(struct tu_cs *cs, 838 const VkOffset2D *dst, 839 const VkOffset2D *src, 840 const VkExtent2D *extent) 841{ 842 int32_t src_x1 = src ? src->x : 0; 843 int32_t src_y1 = src ? src->y : 0; 844 r3d_coords_raw(cs, (float[]) { 845 dst->x, dst->y, 846 src_x1, src_y1, 847 dst->x + extent->width, dst->y + extent->height, 848 src_x1 + extent->width, src_y1 + extent->height, 849 }); 850} 851 852static void 853r3d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val) 854{ 855 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4); 856 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 857 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 858 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 859 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) | 860 CP_LOAD_STATE6_0_NUM_UNIT(1)); 861 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 862 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 863 switch (format) { 864 case PIPE_FORMAT_Z24X8_UNORM: 865 case PIPE_FORMAT_Z24_UNORM_S8_UINT: { 866 /* cleared as r8g8b8a8_unorm using special format */ 867 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24); 868 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f)); 869 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f)); 870 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f)); 871 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f)); 872 } break; 873 case PIPE_FORMAT_Z16_UNORM: 874 case PIPE_FORMAT_Z32_FLOAT: 875 tu_cs_emit(cs, fui(val->depthStencil.depth)); 876 tu_cs_emit(cs, 0); 877 tu_cs_emit(cs, 0); 878 tu_cs_emit(cs, 0); 879 break; 880 case PIPE_FORMAT_S8_UINT: 881 tu_cs_emit(cs, val->depthStencil.stencil & 0xff); 882 tu_cs_emit(cs, 0); 883 tu_cs_emit(cs, 0); 884 tu_cs_emit(cs, 0); 885 break; 886 default: 887 /* as color formats use clear value as-is */ 888 assert(!util_format_is_depth_or_stencil(format)); 889 tu_cs_emit_array(cs, val->color.uint32, 4); 890 break; 891 } 892} 893 894static void 895r3d_src_common(struct tu_cmd_buffer *cmd, 896 struct tu_cs *cs, 897 const uint32_t *tex_const, 898 uint32_t offset_base, 899 uint32_t offset_ubwc, 900 VkFilter filter) 901{ 902 struct tu_cs_memory texture = { }; 903 VkResult result = tu_cs_alloc(&cmd->sub_cs, 904 2, /* allocate space for a sampler too */ 905 A6XX_TEX_CONST_DWORDS, &texture); 906 if (result != VK_SUCCESS) { 907 cmd->record_result = result; 908 return; 909 } 910 911 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4); 912 913 /* patch addresses for layer offset */ 914 *(uint64_t*) (texture.map + 4) += offset_base; 915 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc; 916 texture.map[7] = ubwc_addr; 917 texture.map[8] = ubwc_addr >> 32; 918 919 texture.map[A6XX_TEX_CONST_DWORDS + 0] = 920 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) | 921 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) | 922 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) | 923 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) | 924 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) | 925 0x60000; /* XXX used by blob, doesn't seem necessary */ 926 texture.map[A6XX_TEX_CONST_DWORDS + 1] = 927 A6XX_TEX_SAMP_1_UNNORM_COORDS | 928 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR; 929 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0; 930 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0; 931 932 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3); 933 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 934 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | 935 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 936 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) | 937 CP_LOAD_STATE6_0_NUM_UNIT(1)); 938 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4); 939 940 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4)); 941 942 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3); 943 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 944 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 945 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 946 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) | 947 CP_LOAD_STATE6_0_NUM_UNIT(1)); 948 tu_cs_emit_qw(cs, texture.iova); 949 950 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova)); 951 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1)); 952} 953 954static void 955r3d_src(struct tu_cmd_buffer *cmd, 956 struct tu_cs *cs, 957 const struct fdl6_view *iview, 958 uint32_t layer, 959 VkFilter filter, 960 enum pipe_format dst_format) 961{ 962 uint32_t desc[A6XX_TEX_CONST_DWORDS]; 963 memcpy(desc, iview->descriptor, sizeof(desc)); 964 965 enum a6xx_format fmt = (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> 966 A6XX_TEX_CONST_0_FMT__SHIFT; 967 enum pipe_format src_format = iview->format; 968 fixup_src_format(&src_format, dst_format, &fmt); 969 desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) | 970 A6XX_TEX_CONST_0_FMT(fmt); 971 972 r3d_src_common(cmd, cs, desc, 973 iview->layer_size * layer, 974 iview->ubwc_layer_size * layer, 975 filter); 976} 977 978static void 979r3d_src_buffer(struct tu_cmd_buffer *cmd, 980 struct tu_cs *cs, 981 enum pipe_format format, 982 uint64_t va, uint32_t pitch, 983 uint32_t width, uint32_t height, 984 enum pipe_format dst_format) 985{ 986 uint32_t desc[A6XX_TEX_CONST_DWORDS]; 987 988 struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR); 989 enum a6xx_format color_format = fmt.fmt; 990 fixup_src_format(&format, dst_format, &color_format); 991 992 desc[0] = 993 COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) | 994 A6XX_TEX_CONST_0_FMT(color_format) | 995 A6XX_TEX_CONST_0_SWAP(fmt.swap) | 996 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) | 997 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) | 998 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) | 999 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W); 1000 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); 1001 desc[2] = 1002 A6XX_TEX_CONST_2_PITCH(pitch) | 1003 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D); 1004 desc[3] = 0; 1005 desc[4] = va; 1006 desc[5] = va >> 32; 1007 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++) 1008 desc[i] = 0; 1009 1010 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST); 1011} 1012 1013static void 1014r3d_src_gmem(struct tu_cmd_buffer *cmd, 1015 struct tu_cs *cs, 1016 const struct tu_image_view *iview, 1017 enum pipe_format format, 1018 enum pipe_format dst_format, 1019 uint32_t gmem_offset, 1020 uint32_t cpp) 1021{ 1022 uint32_t desc[A6XX_TEX_CONST_DWORDS]; 1023 memcpy(desc, iview->view.descriptor, sizeof(desc)); 1024 1025 enum a6xx_format fmt = tu6_format_texture(format, TILE6_LINEAR).fmt; 1026 fixup_src_format(&format, dst_format, &fmt); 1027 1028 /* patch the format so that depth/stencil get the right format and swizzle */ 1029 desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK | 1030 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK | 1031 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK); 1032 desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) | 1033 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) | 1034 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) | 1035 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) | 1036 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W); 1037 1038 /* patched for gmem */ 1039 desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); 1040 desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); 1041 desc[2] = 1042 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | 1043 A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp); 1044 desc[3] = 0; 1045 desc[4] = cmd->device->physical_device->gmem_base + gmem_offset; 1046 desc[5] = A6XX_TEX_CONST_5_DEPTH(1); 1047 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) 1048 desc[i] = 0; 1049 1050 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST); 1051} 1052 1053static void 1054r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, 1055 enum pipe_format src_format) 1056{ 1057 uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO; 1058 1059 enum a6xx_format fmt = mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK; 1060 enum pipe_format dst_format = iview->format; 1061 fixup_dst_format(src_format, &dst_format, &fmt); 1062 mrt_buf_info = 1063 (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) | 1064 A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt); 1065 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); 1066 tu_cs_emit(cs, mrt_buf_info); 1067 tu_cs_image_ref(cs, iview, layer); 1068 tu_cs_emit(cs, 0); 1069 1070 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3); 1071 tu_cs_image_flag_ref(cs, iview, layer); 1072 1073 /* Use color format from RB_MRT_BUF_INFO. This register is relevant for 1074 * FMT6_NV12_Y. 1075 */ 1076 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt)); 1077 1078 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled)); 1079} 1080 1081static void 1082r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) 1083{ 1084 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); 1085 tu_cs_emit(cs, tu_image_view_depth(iview, RB_MRT_BUF_INFO)); 1086 tu_cs_image_depth_ref(cs, iview, layer); 1087 tu_cs_emit(cs, 0); 1088 1089 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3); 1090 tu_cs_image_flag_ref(cs, &iview->view, layer); 1091 1092 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->view.ubwc_enabled)); 1093} 1094 1095static void 1096r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) 1097{ 1098 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); 1099 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO)); 1100 tu_cs_image_stencil_ref(cs, iview, layer); 1101 tu_cs_emit(cs, 0); 1102 1103 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); 1104} 1105 1106static void 1107r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, 1108 enum pipe_format src_format) 1109{ 1110 struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR); 1111 1112 enum a6xx_format color_fmt = fmt.fmt; 1113 fixup_dst_format(src_format, &format, &color_fmt); 1114 1115 tu_cs_emit_regs(cs, 1116 A6XX_RB_MRT_BUF_INFO(0, .color_format = color_fmt, .color_swap = fmt.swap), 1117 A6XX_RB_MRT_PITCH(0, pitch), 1118 A6XX_RB_MRT_ARRAY_PITCH(0, 0), 1119 A6XX_RB_MRT_BASE(0, .qword = va), 1120 A6XX_RB_MRT_BASE_GMEM(0, 0)); 1121 1122 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL()); 1123} 1124 1125static uint8_t 1126aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask) 1127{ 1128 uint8_t mask = 0xf; 1129 assert(aspect_mask); 1130 /* note: the only format with partial writing is D24S8, 1131 * clear/blit uses the _AS_R8G8B8A8 format to access it 1132 */ 1133 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { 1134 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) 1135 mask = 0x7; 1136 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) 1137 mask = 0x8; 1138 } 1139 return mask; 1140} 1141 1142static void 1143r3d_setup(struct tu_cmd_buffer *cmd, 1144 struct tu_cs *cs, 1145 enum pipe_format src_format, 1146 enum pipe_format dst_format, 1147 VkImageAspectFlags aspect_mask, 1148 unsigned blit_param, 1149 bool clear, 1150 bool ubwc, 1151 VkSampleCountFlagBits samples) 1152{ 1153 enum a6xx_format fmt = tu6_base_format(dst_format); 1154 fixup_dst_format(src_format, &dst_format, &fmt); 1155 1156 if ((dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT || 1157 dst_format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) { 1158 fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; 1159 } 1160 1161 if (!cmd->state.pass) { 1162 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); 1163 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff); 1164 } 1165 1166 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000)); 1167 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000)); 1168 1169 r3d_common(cmd, cs, !clear, 1, blit_param, samples); 1170 1171 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); 1172 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | 1173 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) | 1174 0xfc000000); 1175 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1)); 1176 1177 tu_cs_emit_regs(cs, 1178 A6XX_RB_FS_OUTPUT_CNTL0(), 1179 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1)); 1180 1181 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL()); 1182 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff)); 1183 1184 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL()); 1185 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL()); 1186 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL()); 1187 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL()); 1188 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK()); 1189 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK()); 1190 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF()); 1191 1192 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf)); 1193 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf)); 1194 1195 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0, 1196 .color_format = fmt, 1197 .color_sint = util_format_is_pure_sint(dst_format), 1198 .color_uint = util_format_is_pure_uint(dst_format))); 1199 1200 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, 1201 .component_enable = aspect_write_mask(dst_format, aspect_mask))); 1202 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format))); 1203 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format))); 1204 1205 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0)); 1206 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0)); 1207 1208 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL, 1209 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2)); 1210 1211 /* Disable sample counting in order to not affect occlusion query. */ 1212 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true)); 1213 1214 if (cmd->state.prim_generated_query_running_before_rp) { 1215 tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS); 1216 } 1217 1218 if (cmd->state.predication_active) { 1219 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1); 1220 tu_cs_emit(cs, 0); 1221 } 1222} 1223 1224static void 1225r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 1226{ 1227 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); 1228 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) | 1229 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) | 1230 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY)); 1231 tu_cs_emit(cs, 1); /* instance count */ 1232 tu_cs_emit(cs, 2); /* vertex count */ 1233} 1234 1235static void 1236r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 1237{ 1238 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); 1239 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) | 1240 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) | 1241 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY)); 1242 tu_cs_emit(cs, 1); /* instance count */ 1243 tu_cs_emit(cs, 2); /* vertex count */ 1244} 1245 1246static void 1247r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs) 1248{ 1249 if (cmd->state.predication_active) { 1250 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1); 1251 tu_cs_emit(cs, 1); 1252 } 1253 1254 /* Re-enable sample counting. */ 1255 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false)); 1256 1257 if (cmd->state.prim_generated_query_running_before_rp) { 1258 tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS); 1259 } 1260} 1261 1262/* blit ops - common interface for 2d/shader paths */ 1263 1264struct blit_ops { 1265 void (*coords)(struct tu_cs *cs, 1266 const VkOffset2D *dst, 1267 const VkOffset2D *src, 1268 const VkExtent2D *extent); 1269 void (*clear_value)(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val); 1270 void (*src)( 1271 struct tu_cmd_buffer *cmd, 1272 struct tu_cs *cs, 1273 const struct fdl6_view *iview, 1274 uint32_t layer, 1275 VkFilter filter, 1276 enum pipe_format dst_format); 1277 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, 1278 enum pipe_format format, 1279 uint64_t va, uint32_t pitch, 1280 uint32_t width, uint32_t height, 1281 enum pipe_format dst_format); 1282 void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, 1283 enum pipe_format src_format); 1284 void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); 1285 void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); 1286 void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch, 1287 enum pipe_format src_format); 1288 void (*setup)(struct tu_cmd_buffer *cmd, 1289 struct tu_cs *cs, 1290 enum pipe_format src_format, 1291 enum pipe_format dst_format, 1292 VkImageAspectFlags aspect_mask, 1293 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */ 1294 bool clear, 1295 bool ubwc, 1296 VkSampleCountFlagBits samples); 1297 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs); 1298 void (*teardown)(struct tu_cmd_buffer *cmd, 1299 struct tu_cs *cs); 1300}; 1301 1302static const struct blit_ops r2d_ops = { 1303 .coords = r2d_coords, 1304 .clear_value = r2d_clear_value, 1305 .src = r2d_src, 1306 .src_buffer = r2d_src_buffer, 1307 .dst = r2d_dst, 1308 .dst_depth = r2d_dst_depth, 1309 .dst_stencil = r2d_dst_stencil, 1310 .dst_buffer = r2d_dst_buffer, 1311 .setup = r2d_setup, 1312 .run = r2d_run, 1313 .teardown = r2d_teardown, 1314}; 1315 1316static const struct blit_ops r3d_ops = { 1317 .coords = r3d_coords, 1318 .clear_value = r3d_clear_value, 1319 .src = r3d_src, 1320 .src_buffer = r3d_src_buffer, 1321 .dst = r3d_dst, 1322 .dst_depth = r3d_dst_depth, 1323 .dst_stencil = r3d_dst_stencil, 1324 .dst_buffer = r3d_dst_buffer, 1325 .setup = r3d_setup, 1326 .run = r3d_run, 1327 .teardown = r3d_teardown, 1328}; 1329 1330/* passthrough set coords from 3D extents */ 1331static void 1332coords(const struct blit_ops *ops, 1333 struct tu_cs *cs, 1334 const VkOffset3D *dst, 1335 const VkOffset3D *src, 1336 const VkExtent3D *extent) 1337{ 1338 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent); 1339} 1340 1341/* Decides the VK format to treat our data as for a memcpy-style blit. We have 1342 * to be a bit careful because we have to pick a format with matching UBWC 1343 * compression behavior, so no just returning R8_UINT/R16_UINT/R32_UINT for 1344 * everything. 1345 */ 1346static enum pipe_format 1347copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask) 1348{ 1349 if (vk_format_is_compressed(vk_format)) { 1350 switch (vk_format_get_blocksize(vk_format)) { 1351 case 1: return PIPE_FORMAT_R8_UINT; 1352 case 2: return PIPE_FORMAT_R16_UINT; 1353 case 4: return PIPE_FORMAT_R32_UINT; 1354 case 8: return PIPE_FORMAT_R32G32_UINT; 1355 case 16:return PIPE_FORMAT_R32G32B32A32_UINT; 1356 default: 1357 unreachable("unhandled format size"); 1358 } 1359 } 1360 1361 enum pipe_format format = tu_vk_format_to_pipe_format(vk_format); 1362 1363 /* For SNORM formats, copy them as the equivalent UNORM format. If we treat 1364 * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81 1365 * (also -1.0), when we're supposed to be memcpying the bits. See 1366 * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion. 1367 */ 1368 format = util_format_snorm_to_unorm(format); 1369 1370 switch (format) { 1371 case PIPE_FORMAT_R9G9B9E5_FLOAT: 1372 return PIPE_FORMAT_R32_UINT; 1373 1374 case PIPE_FORMAT_G8_B8R8_420_UNORM: 1375 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT) 1376 return PIPE_FORMAT_R8G8_UNORM; 1377 else 1378 return PIPE_FORMAT_Y8_UNORM; 1379 case PIPE_FORMAT_G8_B8_R8_420_UNORM: 1380 return PIPE_FORMAT_R8_UNORM; 1381 1382 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: 1383 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) 1384 return PIPE_FORMAT_S8_UINT; 1385 assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT); 1386 return PIPE_FORMAT_Z32_FLOAT; 1387 1388 default: 1389 return format; 1390 } 1391} 1392 1393void 1394tu6_clear_lrz(struct tu_cmd_buffer *cmd, 1395 struct tu_cs *cs, 1396 struct tu_image *image, 1397 const VkClearValue *value) 1398{ 1399 const struct blit_ops *ops = &r2d_ops; 1400 1401 /* It is assumed that LRZ cache is invalidated at this point for 1402 * the writes here to become visible to LRZ. 1403 * 1404 * LRZ writes are going through UCHE cache, flush UCHE before changing 1405 * LRZ via CCU. Don't need to invalidate CCU since we are presumably 1406 * writing whole cache lines we assume to be 64 bytes. 1407 */ 1408 tu6_emit_event_write(cmd, &cmd->cs, CACHE_FLUSH_TS); 1409 1410 ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM, 1411 VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false, 1412 VK_SAMPLE_COUNT_1_BIT); 1413 ops->clear_value(cs, PIPE_FORMAT_Z16_UNORM, value); 1414 ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM, 1415 image->iova + image->lrz_offset, 1416 image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM); 1417 ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height}); 1418 ops->run(cmd, cs); 1419 ops->teardown(cmd, cs); 1420 1421 /* Clearing writes via CCU color in the PS stage, and LRZ is read via 1422 * UCHE in the earlier GRAS stage. 1423 */ 1424 cmd->state.cache.flush_bits |= 1425 TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE | 1426 TU_CMD_FLAG_WAIT_FOR_IDLE; 1427} 1428 1429void 1430tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, 1431 struct tu_cs *cs, 1432 struct tu_image *image) 1433{ 1434 const struct blit_ops *ops = &r2d_ops; 1435 VkClearValue clear = { .color = { .uint32[0] = 0xffffffff } }; 1436 1437 /* LRZ fast-clear buffer is always allocated with 512 bytes size. */ 1438 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT, 1439 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false, 1440 VK_SAMPLE_COUNT_1_BIT); 1441 ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &clear); 1442 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, 1443 image->iova + image->lrz_fc_offset, 512, 1444 PIPE_FORMAT_R32_UINT); 1445 ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {128, 1}); 1446 ops->run(cmd, cs); 1447 ops->teardown(cmd, cs); 1448} 1449 1450static void 1451tu_image_view_copy_blit(struct fdl6_view *iview, 1452 struct tu_image *image, 1453 enum pipe_format format, 1454 const VkImageSubresourceLayers *subres, 1455 uint32_t layer, 1456 bool z_scale) 1457{ 1458 VkImageAspectFlags aspect_mask = subres->aspectMask; 1459 1460 /* always use the AS_R8G8B8A8 format for these */ 1461 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT || 1462 format == PIPE_FORMAT_Z24X8_UNORM) { 1463 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT; 1464 } 1465 1466 const struct fdl_layout *layout = 1467 &image->layout[tu6_plane_index(image->vk.format, aspect_mask)]; 1468 1469 fdl6_view_init(iview, &layout, &(struct fdl_view_args) { 1470 .iova = image->iova, 1471 .base_array_layer = subres->baseArrayLayer + layer, 1472 .layer_count = 1, 1473 .base_miplevel = subres->mipLevel, 1474 .level_count = 1, 1475 .format = tu_format_for_aspect(format, aspect_mask), 1476 .swiz = { 1477 PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W 1478 }, 1479 .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D, 1480 }, false); 1481} 1482 1483static void 1484tu_image_view_copy(struct fdl6_view *iview, 1485 struct tu_image *image, 1486 enum pipe_format format, 1487 const VkImageSubresourceLayers *subres, 1488 uint32_t layer) 1489{ 1490 tu_image_view_copy_blit(iview, image, format, subres, layer, false); 1491} 1492 1493static void 1494tu_image_view_blit(struct fdl6_view *iview, 1495 struct tu_image *image, 1496 const VkImageSubresourceLayers *subres, 1497 uint32_t layer) 1498{ 1499 enum pipe_format format = 1500 tu6_plane_format(image->vk.format, tu6_plane_index(image->vk.format, 1501 subres->aspectMask)); 1502 tu_image_view_copy_blit(iview, image, format, subres, layer, false); 1503} 1504 1505static void 1506tu6_blit_image(struct tu_cmd_buffer *cmd, 1507 struct tu_image *src_image, 1508 struct tu_image *dst_image, 1509 const VkImageBlit2 *info, 1510 VkFilter filter) 1511{ 1512 const struct blit_ops *ops = &r2d_ops; 1513 struct tu_cs *cs = &cmd->cs; 1514 bool z_scale = false; 1515 uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z; 1516 1517 /* 2D blit can't do rotation mirroring from just coordinates */ 1518 static const enum a6xx_rotation rotate[2][2] = { 1519 {ROTATE_0, ROTATE_HFLIP}, 1520 {ROTATE_VFLIP, ROTATE_180}, 1521 }; 1522 1523 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) != 1524 (info->dstOffsets[1].x < info->dstOffsets[0].x); 1525 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) != 1526 (info->dstOffsets[1].y < info->dstOffsets[0].y); 1527 1528 int32_t src0_z = info->srcOffsets[0].z; 1529 int32_t src1_z = info->srcOffsets[1].z; 1530 1531 if ((info->srcOffsets[1].z - info->srcOffsets[0].z != 1532 info->dstOffsets[1].z - info->dstOffsets[0].z) || 1533 info->srcOffsets[1].z < info->srcOffsets[0].z) { 1534 z_scale = true; 1535 } 1536 1537 if (info->dstOffsets[1].z < info->dstOffsets[0].z) { 1538 layers = info->dstOffsets[0].z - info->dstOffsets[1].z; 1539 src0_z = info->srcOffsets[1].z; 1540 src1_z = info->srcOffsets[0].z; 1541 } 1542 1543 if (info->dstSubresource.layerCount > 1) { 1544 assert(layers <= 1); 1545 layers = info->dstSubresource.layerCount; 1546 } 1547 1548 /* BC1_RGB_* formats need to have their last components overriden with 1 1549 * when sampling, which is normally handled with the texture descriptor 1550 * swizzle. The 2d path can't handle that, so use the 3d path. 1551 * 1552 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with 1553 * the 2d path. 1554 */ 1555 1556 unsigned blit_param = rotate[mirror_y][mirror_x]; 1557 if (dst_image->layout[0].nr_samples > 1 || 1558 src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK || 1559 src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK || 1560 filter == VK_FILTER_CUBIC_EXT || 1561 z_scale) { 1562 ops = &r3d_ops; 1563 blit_param = z_scale; 1564 } 1565 1566 /* use the right format in setup() for D32_S8 1567 * TODO: this probably should use a helper 1568 */ 1569 enum pipe_format src_format = 1570 tu6_plane_format(src_image->vk.format, 1571 tu6_plane_index(src_image->vk.format, 1572 info->srcSubresource.aspectMask)); 1573 enum pipe_format dst_format = 1574 tu6_plane_format(dst_image->vk.format, 1575 tu6_plane_index(src_image->vk.format, 1576 info->srcSubresource.aspectMask)); 1577 trace_start_blit(&cmd->trace, cs); 1578 1579 ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask, 1580 blit_param, false, dst_image->layout[0].ubwc, 1581 dst_image->layout[0].nr_samples); 1582 1583 if (ops == &r3d_ops) { 1584 r3d_coords_raw(cs, (float[]) { 1585 info->dstOffsets[0].x, info->dstOffsets[0].y, 1586 info->srcOffsets[0].x, info->srcOffsets[0].y, 1587 info->dstOffsets[1].x, info->dstOffsets[1].y, 1588 info->srcOffsets[1].x, info->srcOffsets[1].y 1589 }); 1590 } else { 1591 tu_cs_emit_regs(cs, 1592 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x), 1593 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)), 1594 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1, 1595 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1)); 1596 tu_cs_emit_regs(cs, 1597 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)), 1598 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1), 1599 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)), 1600 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1)); 1601 } 1602 1603 struct fdl6_view dst, src; 1604 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, 1605 MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z)); 1606 1607 if (z_scale) { 1608 tu_image_view_copy_blit(&src, src_image, src_format, 1609 &info->srcSubresource, 0, true); 1610 ops->src(cmd, cs, &src, 0, filter, dst_format); 1611 } else { 1612 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z); 1613 } 1614 1615 for (uint32_t i = 0; i < layers; i++) { 1616 if (z_scale) { 1617 float t = ((float) i + 0.5f) / (float) layers; 1618 r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z); 1619 } else { 1620 ops->src(cmd, cs, &src, i, filter, dst_format); 1621 } 1622 ops->dst(cs, &dst, i, src_format); 1623 ops->run(cmd, cs); 1624 } 1625 1626 ops->teardown(cmd, cs); 1627 1628 trace_end_blit(&cmd->trace, cs, 1629 ops == &r3d_ops, 1630 src_image->vk.format, 1631 dst_image->vk.format, 1632 layers); 1633} 1634 1635VKAPI_ATTR void VKAPI_CALL 1636tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, 1637 const VkBlitImageInfo2* pBlitImageInfo) 1638 1639{ 1640 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 1641 TU_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage); 1642 TU_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage); 1643 1644 for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) { 1645 /* can't blit both depth and stencil at once with D32_S8 1646 * TODO: more advanced 3D blit path to support it instead? 1647 */ 1648 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT || 1649 dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 1650 VkImageBlit2 region = pBlitImageInfo->pRegions[i]; 1651 u_foreach_bit(b, region.dstSubresource.aspectMask) { 1652 region.srcSubresource.aspectMask = BIT(b); 1653 region.dstSubresource.aspectMask = BIT(b); 1654 tu6_blit_image(cmd, src_image, dst_image, ®ion, pBlitImageInfo->filter); 1655 } 1656 continue; 1657 } 1658 tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i, 1659 pBlitImageInfo->filter); 1660 } 1661 1662 if (dst_image->lrz_height) { 1663 tu_disable_lrz(cmd, &cmd->cs, dst_image); 1664 } 1665} 1666 1667static void 1668copy_compressed(VkFormat format, 1669 VkOffset3D *offset, 1670 VkExtent3D *extent, 1671 uint32_t *width, 1672 uint32_t *height) 1673{ 1674 if (!vk_format_is_compressed(format)) 1675 return; 1676 1677 uint32_t block_width = vk_format_get_blockwidth(format); 1678 uint32_t block_height = vk_format_get_blockheight(format); 1679 1680 offset->x /= block_width; 1681 offset->y /= block_height; 1682 1683 if (extent) { 1684 extent->width = DIV_ROUND_UP(extent->width, block_width); 1685 extent->height = DIV_ROUND_UP(extent->height, block_height); 1686 } 1687 if (width) 1688 *width = DIV_ROUND_UP(*width, block_width); 1689 if (height) 1690 *height = DIV_ROUND_UP(*height, block_height); 1691} 1692 1693static void 1694tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, 1695 struct tu_buffer *src_buffer, 1696 struct tu_image *dst_image, 1697 const VkBufferImageCopy2 *info) 1698{ 1699 struct tu_cs *cs = &cmd->cs; 1700 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); 1701 enum pipe_format src_format = 1702 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask); 1703 enum pipe_format dst_format = 1704 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask); 1705 const struct blit_ops *ops = &r2d_ops; 1706 1707 /* special case for buffer to stencil */ 1708 if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT && 1709 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { 1710 src_format = PIPE_FORMAT_S8_UINT; 1711 } 1712 1713 /* note: could use "R8_UNORM" when no UBWC */ 1714 if (src_format == PIPE_FORMAT_Y8_UNORM) 1715 ops = &r3d_ops; 1716 1717 VkOffset3D offset = info->imageOffset; 1718 VkExtent3D extent = info->imageExtent; 1719 uint32_t src_width = info->bufferRowLength ?: extent.width; 1720 uint32_t src_height = info->bufferImageHeight ?: extent.height; 1721 1722 copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height); 1723 1724 uint32_t pitch = src_width * util_format_get_blocksize(src_format); 1725 uint32_t layer_size = src_height * pitch; 1726 1727 ops->setup(cmd, cs, src_format, dst_format, 1728 info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc, 1729 dst_image->layout[0].nr_samples); 1730 1731 struct fdl6_view dst; 1732 tu_image_view_copy(&dst, dst_image, dst_format, &info->imageSubresource, offset.z); 1733 1734 for (uint32_t i = 0; i < layers; i++) { 1735 ops->dst(cs, &dst, i, src_format); 1736 1737 uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i; 1738 if ((src_va & 63) || (pitch & 63)) { 1739 for (uint32_t y = 0; y < extent.height; y++) { 1740 uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format); 1741 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch, 1742 x + extent.width, 1, dst_format); 1743 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x}, 1744 &(VkExtent2D) {extent.width, 1}); 1745 ops->run(cmd, cs); 1746 src_va += pitch; 1747 } 1748 } else { 1749 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height, dst_format); 1750 coords(ops, cs, &offset, &(VkOffset3D){}, &extent); 1751 ops->run(cmd, cs); 1752 } 1753 } 1754 1755 ops->teardown(cmd, cs); 1756} 1757 1758VKAPI_ATTR void VKAPI_CALL 1759tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, 1760 const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) 1761{ 1762 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 1763 TU_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage); 1764 TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer); 1765 1766 for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i) 1767 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, 1768 pCopyBufferToImageInfo->pRegions + i); 1769 1770 if (dst_image->lrz_height) { 1771 tu_disable_lrz(cmd, &cmd->cs, dst_image); 1772 } 1773} 1774 1775static void 1776tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, 1777 struct tu_image *src_image, 1778 struct tu_buffer *dst_buffer, 1779 const VkBufferImageCopy2 *info) 1780{ 1781 struct tu_cs *cs = &cmd->cs; 1782 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); 1783 enum pipe_format dst_format = 1784 copy_format(src_image->vk.format, info->imageSubresource.aspectMask); 1785 enum pipe_format src_format = 1786 copy_format(src_image->vk.format, info->imageSubresource.aspectMask); 1787 const struct blit_ops *ops = &r2d_ops; 1788 1789 if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT && 1790 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { 1791 dst_format = PIPE_FORMAT_S8_UINT; 1792 } 1793 1794 /* note: could use "R8_UNORM" when no UBWC */ 1795 if (dst_format == PIPE_FORMAT_Y8_UNORM) 1796 ops = &r3d_ops; 1797 1798 VkOffset3D offset = info->imageOffset; 1799 VkExtent3D extent = info->imageExtent; 1800 uint32_t dst_width = info->bufferRowLength ?: extent.width; 1801 uint32_t dst_height = info->bufferImageHeight ?: extent.height; 1802 1803 copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height); 1804 1805 uint32_t pitch = dst_width * util_format_get_blocksize(dst_format); 1806 uint32_t layer_size = pitch * dst_height; 1807 1808 ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false, 1809 VK_SAMPLE_COUNT_1_BIT); 1810 1811 struct fdl6_view src; 1812 tu_image_view_copy(&src, src_image, src_format, &info->imageSubresource, offset.z); 1813 1814 for (uint32_t i = 0; i < layers; i++) { 1815 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format); 1816 1817 uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i; 1818 if ((dst_va & 63) || (pitch & 63)) { 1819 for (uint32_t y = 0; y < extent.height; y++) { 1820 uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format); 1821 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format); 1822 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y}, 1823 &(VkExtent2D) {extent.width, 1}); 1824 ops->run(cmd, cs); 1825 dst_va += pitch; 1826 } 1827 } else { 1828 ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format); 1829 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent); 1830 ops->run(cmd, cs); 1831 } 1832 } 1833 1834 ops->teardown(cmd, cs); 1835} 1836 1837VKAPI_ATTR void VKAPI_CALL 1838tu_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, 1839 const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo) 1840{ 1841 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 1842 TU_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage); 1843 TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer); 1844 1845 for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i) 1846 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, 1847 pCopyImageToBufferInfo->pRegions + i); 1848} 1849 1850/* Tiled formats don't support swapping, which means that we can't support 1851 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some 1852 * formats like B5G5R5A1 have a separate linear-only format when sampling. 1853 * Currently we fake support for tiled swapped formats and use the unswapped 1854 * format instead, but this means that reinterpreting copies to and from 1855 * swapped formats can't be performed correctly unless we can swizzle the 1856 * components by reinterpreting the other image as the "correct" swapped 1857 * format, i.e. only when the other image is linear. 1858 */ 1859 1860static bool 1861is_swapped_format(enum pipe_format format) 1862{ 1863 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR); 1864 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3); 1865 return linear.fmt != tiled.fmt || linear.swap != tiled.swap; 1866} 1867 1868/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and 1869 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice 1870 * versa). This should mirror the logic in fdl6_layout. 1871 */ 1872static bool 1873image_is_r8g8(struct tu_image *image) 1874{ 1875 return image->layout[0].cpp == 2 && 1876 vk_format_get_nr_components(image->vk.format) == 2; 1877} 1878 1879static void 1880tu_copy_image_to_image(struct tu_cmd_buffer *cmd, 1881 struct tu_image *src_image, 1882 struct tu_image *dst_image, 1883 const VkImageCopy2 *info) 1884{ 1885 const struct blit_ops *ops = &r2d_ops; 1886 struct tu_cs *cs = &cmd->cs; 1887 1888 if (dst_image->layout[0].nr_samples > 1) 1889 ops = &r3d_ops; 1890 1891 enum pipe_format format = PIPE_FORMAT_NONE; 1892 VkOffset3D src_offset = info->srcOffset; 1893 VkOffset3D dst_offset = info->dstOffset; 1894 VkExtent3D extent = info->extent; 1895 uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount); 1896 1897 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between 1898 * Images": 1899 * 1900 * When copying between compressed and uncompressed formats the extent 1901 * members represent the texel dimensions of the source image and not 1902 * the destination. When copying from a compressed image to an 1903 * uncompressed image the image texel dimensions written to the 1904 * uncompressed image will be source extent divided by the compressed 1905 * texel block dimensions. When copying from an uncompressed image to a 1906 * compressed image the image texel dimensions written to the compressed 1907 * image will be the source extent multiplied by the compressed texel 1908 * block dimensions. 1909 * 1910 * This means we only have to adjust the extent if the source image is 1911 * compressed. 1912 */ 1913 copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL); 1914 copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL); 1915 1916 enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask); 1917 enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask); 1918 1919 /* note: could use "R8_UNORM" when no UBWC */ 1920 if (dst_format == PIPE_FORMAT_Y8_UNORM || 1921 src_format == PIPE_FORMAT_Y8_UNORM) 1922 ops = &r3d_ops; 1923 1924 bool use_staging_blit = false; 1925 1926 if (src_format == dst_format) { 1927 /* Images that share a format can always be copied directly because it's 1928 * the same as a blit. 1929 */ 1930 format = src_format; 1931 } else if (!src_image->layout[0].tile_mode) { 1932 /* If an image is linear, we can always safely reinterpret it with the 1933 * other image's format and then do a regular blit. 1934 */ 1935 format = dst_format; 1936 } else if (!dst_image->layout[0].tile_mode) { 1937 format = src_format; 1938 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) { 1939 /* We can't currently copy r8g8 images to/from other cpp=2 images, 1940 * due to the different tile layout. 1941 */ 1942 use_staging_blit = true; 1943 } else if (is_swapped_format(src_format) || 1944 is_swapped_format(dst_format)) { 1945 /* If either format has a non-identity swap, then we can't copy 1946 * to/from it. 1947 */ 1948 use_staging_blit = true; 1949 } else if (!src_image->layout[0].ubwc) { 1950 format = dst_format; 1951 } else if (!dst_image->layout[0].ubwc) { 1952 format = src_format; 1953 } else { 1954 /* Both formats use UBWC and so neither can be reinterpreted. 1955 * TODO: We could do an in-place decompression of the dst instead. 1956 */ 1957 perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits"); 1958 use_staging_blit = true; 1959 } 1960 1961 struct fdl6_view dst, src; 1962 1963 if (use_staging_blit) { 1964 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z); 1965 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z); 1966 1967 struct fdl_layout staging_layout = { 0 }; 1968 VkOffset3D staging_offset = { 0 }; 1969 1970 staging_layout.tile_mode = TILE6_LINEAR; 1971 staging_layout.ubwc = false; 1972 1973 fdl6_layout(&staging_layout, 1974 src_format, 1975 src_image->layout[0].nr_samples, 1976 extent.width, 1977 extent.height, 1978 extent.depth, 1979 1, 1980 info->srcSubresource.layerCount, 1981 extent.depth > 1, 1982 NULL); 1983 1984 struct tu_bo *staging_bo; 1985 VkResult result = tu_get_scratch_bo(cmd->device, 1986 staging_layout.size, 1987 &staging_bo); 1988 if (result != VK_SUCCESS) { 1989 cmd->record_result = result; 1990 return; 1991 } 1992 1993 struct fdl6_view staging; 1994 const struct fdl_layout *staging_layout_ptr = &staging_layout; 1995 fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) { 1996 .iova = staging_bo->iova, 1997 .base_array_layer = 0, 1998 .layer_count = 1, 1999 .base_miplevel = 0, 2000 .level_count = info->srcSubresource.layerCount, 2001 .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT), 2002 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W }, 2003 .type = FDL_VIEW_TYPE_2D, 2004 }, false); 2005 2006 ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false, 2007 dst_image->layout[0].nr_samples); 2008 coords(ops, cs, &staging_offset, &src_offset, &extent); 2009 2010 for (uint32_t i = 0; i < layers_to_copy; i++) { 2011 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format); 2012 ops->dst(cs, &staging, i, src_format); 2013 ops->run(cmd, cs); 2014 } 2015 2016 /* When executed by the user there has to be a pipeline barrier here, 2017 * but since we're doing it manually we'll have to flush ourselves. 2018 */ 2019 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); 2020 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); 2021 tu_cs_emit_wfi(cs); 2022 2023 fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) { 2024 .iova = staging_bo->iova, 2025 .base_array_layer = 0, 2026 .layer_count = 1, 2027 .base_miplevel = 0, 2028 .level_count = info->srcSubresource.layerCount, 2029 .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT), 2030 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W }, 2031 .type = FDL_VIEW_TYPE_2D, 2032 }, false); 2033 2034 ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask, 2035 0, false, dst_image->layout[0].ubwc, 2036 dst_image->layout[0].nr_samples); 2037 coords(ops, cs, &dst_offset, &staging_offset, &extent); 2038 2039 for (uint32_t i = 0; i < layers_to_copy; i++) { 2040 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format); 2041 ops->dst(cs, &dst, i, dst_format); 2042 ops->run(cmd, cs); 2043 } 2044 } else { 2045 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z); 2046 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z); 2047 2048 ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask, 2049 0, false, dst_image->layout[0].ubwc, 2050 dst_image->layout[0].nr_samples); 2051 coords(ops, cs, &dst_offset, &src_offset, &extent); 2052 2053 for (uint32_t i = 0; i < layers_to_copy; i++) { 2054 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format); 2055 ops->dst(cs, &dst, i, format); 2056 ops->run(cmd, cs); 2057 } 2058 } 2059 2060 ops->teardown(cmd, cs); 2061} 2062 2063VKAPI_ATTR void VKAPI_CALL 2064tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, 2065 const VkCopyImageInfo2* pCopyImageInfo) 2066{ 2067 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2068 TU_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage); 2069 TU_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage); 2070 2071 for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) { 2072 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 2073 VkImageCopy2 info = pCopyImageInfo->pRegions[i]; 2074 u_foreach_bit(b, info.dstSubresource.aspectMask) { 2075 info.srcSubresource.aspectMask = BIT(b); 2076 info.dstSubresource.aspectMask = BIT(b); 2077 tu_copy_image_to_image(cmd, src_image, dst_image, &info); 2078 } 2079 continue; 2080 } 2081 2082 tu_copy_image_to_image(cmd, src_image, dst_image, 2083 pCopyImageInfo->pRegions + i); 2084 } 2085 2086 if (dst_image->lrz_height) { 2087 tu_disable_lrz(cmd, &cmd->cs, dst_image); 2088 } 2089} 2090 2091static void 2092copy_buffer(struct tu_cmd_buffer *cmd, 2093 uint64_t dst_va, 2094 uint64_t src_va, 2095 uint64_t size, 2096 uint32_t block_size) 2097{ 2098 const struct blit_ops *ops = &r2d_ops; 2099 struct tu_cs *cs = &cmd->cs; 2100 enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM; 2101 uint64_t blocks = size / block_size; 2102 2103 ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false, 2104 VK_SAMPLE_COUNT_1_BIT); 2105 2106 while (blocks) { 2107 uint32_t src_x = (src_va & 63) / block_size; 2108 uint32_t dst_x = (dst_va & 63) / block_size; 2109 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x); 2110 2111 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format); 2112 ops->dst_buffer( cs, format, dst_va & ~63, 0, format); 2113 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1}); 2114 ops->run(cmd, cs); 2115 2116 src_va += width * block_size; 2117 dst_va += width * block_size; 2118 blocks -= width; 2119 } 2120 2121 ops->teardown(cmd, cs); 2122} 2123 2124VKAPI_ATTR void VKAPI_CALL 2125tu_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, 2126 const VkCopyBufferInfo2 *pCopyBufferInfo) 2127{ 2128 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2129 TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer); 2130 TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer); 2131 2132 for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { 2133 const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i]; 2134 copy_buffer(cmd, 2135 dst_buffer->iova + region->dstOffset, 2136 src_buffer->iova + region->srcOffset, 2137 region->size, 1); 2138 } 2139} 2140 2141VKAPI_ATTR void VKAPI_CALL 2142tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer, 2143 VkBuffer dstBuffer, 2144 VkDeviceSize dstOffset, 2145 VkDeviceSize dataSize, 2146 const void *pData) 2147{ 2148 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2149 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); 2150 2151 struct tu_cs_memory tmp; 2152 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp); 2153 if (result != VK_SUCCESS) { 2154 cmd->record_result = result; 2155 return; 2156 } 2157 2158 memcpy(tmp.map, pData, dataSize); 2159 copy_buffer(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4); 2160} 2161 2162VKAPI_ATTR void VKAPI_CALL 2163tu_CmdFillBuffer(VkCommandBuffer commandBuffer, 2164 VkBuffer dstBuffer, 2165 VkDeviceSize dstOffset, 2166 VkDeviceSize fillSize, 2167 uint32_t data) 2168{ 2169 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2170 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); 2171 const struct blit_ops *ops = &r2d_ops; 2172 struct tu_cs *cs = &cmd->cs; 2173 2174 if (fillSize == VK_WHOLE_SIZE) 2175 fillSize = buffer->size - dstOffset; 2176 2177 uint64_t dst_va = buffer->iova + dstOffset; 2178 uint32_t blocks = fillSize / 4; 2179 2180 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT, 2181 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false, 2182 VK_SAMPLE_COUNT_1_BIT); 2183 ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}}); 2184 2185 while (blocks) { 2186 uint32_t dst_x = (dst_va & 63) / 4; 2187 uint32_t width = MIN2(blocks, 0x4000 - dst_x); 2188 2189 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT); 2190 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1}); 2191 ops->run(cmd, cs); 2192 2193 dst_va += width * 4; 2194 blocks -= width; 2195 } 2196 2197 ops->teardown(cmd, cs); 2198} 2199 2200VKAPI_ATTR void VKAPI_CALL 2201tu_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, 2202 const VkResolveImageInfo2* pResolveImageInfo) 2203{ 2204 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2205 TU_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage); 2206 TU_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage); 2207 const struct blit_ops *ops = &r2d_ops; 2208 struct tu_cs *cs = &cmd->cs; 2209 2210 enum pipe_format src_format = 2211 tu_vk_format_to_pipe_format(src_image->vk.format); 2212 enum pipe_format dst_format = 2213 tu_vk_format_to_pipe_format(dst_image->vk.format); 2214 ops->setup(cmd, cs, src_format, dst_format, 2215 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc, 2216 VK_SAMPLE_COUNT_1_BIT); 2217 2218 for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) { 2219 const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i]; 2220 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount); 2221 2222 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount); 2223 /* TODO: aspect masks possible ? */ 2224 2225 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent); 2226 2227 struct fdl6_view dst, src; 2228 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z); 2229 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z); 2230 2231 for (uint32_t i = 0; i < layers; i++) { 2232 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format); 2233 ops->dst(cs, &dst, i, src_format); 2234 ops->run(cmd, cs); 2235 } 2236 } 2237 2238 ops->teardown(cmd, cs); 2239} 2240 2241#define for_each_layer(layer, layer_mask, layers) \ 2242 for (uint32_t layer = 0; \ 2243 layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \ 2244 layer++) \ 2245 if (!layer_mask || (layer_mask & BIT(layer))) 2246 2247static void 2248resolve_sysmem(struct tu_cmd_buffer *cmd, 2249 struct tu_cs *cs, 2250 VkFormat vk_src_format, 2251 VkFormat vk_dst_format, 2252 const struct tu_image_view *src, 2253 const struct tu_image_view *dst, 2254 uint32_t layer_mask, 2255 uint32_t layers, 2256 const VkRect2D *rect, 2257 bool src_separate_ds, 2258 bool dst_separate_ds) 2259{ 2260 const struct blit_ops *ops = &r2d_ops; 2261 2262 trace_start_sysmem_resolve(&cmd->trace, cs); 2263 2264 enum pipe_format src_format = tu_vk_format_to_pipe_format(vk_src_format); 2265 enum pipe_format dst_format = tu_vk_format_to_pipe_format(vk_dst_format); 2266 2267 ops->setup(cmd, cs, src_format, dst_format, 2268 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled, 2269 VK_SAMPLE_COUNT_1_BIT); 2270 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent); 2271 2272 for_each_layer(i, layer_mask, layers) { 2273 if (src_separate_ds) { 2274 if (vk_src_format == VK_FORMAT_D32_SFLOAT) { 2275 r2d_src_depth(cmd, cs, src, i, VK_FILTER_NEAREST); 2276 } else { 2277 r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST); 2278 } 2279 } else { 2280 ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format); 2281 } 2282 2283 if (dst_separate_ds) { 2284 if (vk_dst_format == VK_FORMAT_D32_SFLOAT) { 2285 ops->dst_depth(cs, dst, i); 2286 } else { 2287 ops->dst_stencil(cs, dst, i); 2288 } 2289 } else { 2290 ops->dst(cs, &dst->view, i, src_format); 2291 } 2292 2293 ops->run(cmd, cs); 2294 } 2295 2296 ops->teardown(cmd, cs); 2297 2298 trace_end_sysmem_resolve(&cmd->trace, cs, vk_dst_format); 2299} 2300 2301void 2302tu_resolve_sysmem(struct tu_cmd_buffer *cmd, 2303 struct tu_cs *cs, 2304 const struct tu_image_view *src, 2305 const struct tu_image_view *dst, 2306 uint32_t layer_mask, 2307 uint32_t layers, 2308 const VkRect2D *rect) 2309{ 2310 assert(src->image->vk.format == dst->image->vk.format || 2311 (vk_format_is_depth_or_stencil(src->image->vk.format) && 2312 vk_format_is_depth_or_stencil(dst->image->vk.format))); 2313 2314 bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT; 2315 bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT; 2316 2317 if (dst_separate_ds) { 2318 resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT, 2319 src, dst, layer_mask, layers, rect, 2320 src_separate_ds, dst_separate_ds); 2321 resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT, 2322 src, dst, layer_mask, layers, rect, 2323 src_separate_ds, dst_separate_ds); 2324 } else { 2325 resolve_sysmem(cmd, cs, src->image->vk.format, dst->image->vk.format, 2326 src, dst, layer_mask, layers, rect, 2327 src_separate_ds, dst_separate_ds); 2328 } 2329} 2330 2331static void 2332clear_image(struct tu_cmd_buffer *cmd, 2333 struct tu_image *image, 2334 const VkClearValue *clear_value, 2335 const VkImageSubresourceRange *range, 2336 VkImageAspectFlags aspect_mask) 2337{ 2338 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range); 2339 uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range); 2340 struct tu_cs *cs = &cmd->cs; 2341 enum pipe_format format; 2342 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) { 2343 format = PIPE_FORMAT_R32_UINT; 2344 } else { 2345 format = tu6_plane_format(image->vk.format, 2346 tu6_plane_index(image->vk.format, 2347 aspect_mask)); 2348 } 2349 2350 if (image->layout[0].depth0 > 1) { 2351 assert(layer_count == 1); 2352 assert(range->baseArrayLayer == 0); 2353 } 2354 2355 const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops; 2356 2357 ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc, 2358 image->layout[0].nr_samples); 2359 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) 2360 ops->clear_value(cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value); 2361 else 2362 ops->clear_value(cs, format, clear_value); 2363 2364 for (unsigned j = 0; j < level_count; j++) { 2365 if (image->layout[0].depth0 > 1) 2366 layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j); 2367 2368 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) { 2369 u_minify(image->layout[0].width0, range->baseMipLevel + j), 2370 u_minify(image->layout[0].height0, range->baseMipLevel + j) 2371 }); 2372 2373 struct fdl6_view dst; 2374 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) { 2375 .aspectMask = aspect_mask, 2376 .mipLevel = range->baseMipLevel + j, 2377 .baseArrayLayer = range->baseArrayLayer, 2378 .layerCount = 1, 2379 }, 0, false); 2380 2381 for (uint32_t i = 0; i < layer_count; i++) { 2382 ops->dst(cs, &dst, i, format); 2383 ops->run(cmd, cs); 2384 } 2385 } 2386 2387 ops->teardown(cmd, cs); 2388} 2389 2390VKAPI_ATTR void VKAPI_CALL 2391tu_CmdClearColorImage(VkCommandBuffer commandBuffer, 2392 VkImage image_h, 2393 VkImageLayout imageLayout, 2394 const VkClearColorValue *pColor, 2395 uint32_t rangeCount, 2396 const VkImageSubresourceRange *pRanges) 2397{ 2398 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2399 TU_FROM_HANDLE(tu_image, image, image_h); 2400 2401 for (unsigned i = 0; i < rangeCount; i++) 2402 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT); 2403} 2404 2405VKAPI_ATTR void VKAPI_CALL 2406tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, 2407 VkImage image_h, 2408 VkImageLayout imageLayout, 2409 const VkClearDepthStencilValue *pDepthStencil, 2410 uint32_t rangeCount, 2411 const VkImageSubresourceRange *pRanges) 2412{ 2413 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2414 TU_FROM_HANDLE(tu_image, image, image_h); 2415 2416 for (unsigned i = 0; i < rangeCount; i++) { 2417 const VkImageSubresourceRange *range = &pRanges[i]; 2418 2419 if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 2420 /* can't clear both depth and stencil at once, split up the aspect mask */ 2421 u_foreach_bit(b, range->aspectMask) 2422 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b)); 2423 continue; 2424 } 2425 2426 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask); 2427 } 2428 2429 tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges); 2430} 2431 2432static void 2433tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, 2434 uint32_t attachment_count, 2435 const VkClearAttachment *attachments, 2436 uint32_t rect_count, 2437 const VkClearRect *rects) 2438{ 2439 /* the shader path here is special, it avoids changing MRT/etc state */ 2440 const struct tu_subpass *subpass = cmd->state.subpass; 2441 const uint32_t mrt_count = subpass->color_count; 2442 struct tu_cs *cs = &cmd->draw_cs; 2443 uint32_t clear_value[MAX_RTS][4]; 2444 float z_clear_val = 0.0f; 2445 uint8_t s_clear_val = 0; 2446 uint32_t clear_rts = 0, clear_components = 0; 2447 bool z_clear = false; 2448 bool s_clear = false; 2449 2450 trace_start_sysmem_clear_all(&cmd->trace, cs); 2451 2452 for (uint32_t i = 0; i < attachment_count; i++) { 2453 uint32_t a; 2454 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { 2455 uint32_t c = attachments[i].colorAttachment; 2456 a = subpass->color_attachments[c].attachment; 2457 if (a == VK_ATTACHMENT_UNUSED) 2458 continue; 2459 2460 clear_rts |= 1 << c; 2461 clear_components |= 0xf << (c * 4); 2462 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t)); 2463 } else { 2464 a = subpass->depth_stencil_attachment.attachment; 2465 if (a == VK_ATTACHMENT_UNUSED) 2466 continue; 2467 2468 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { 2469 z_clear = true; 2470 z_clear_val = attachments[i].clearValue.depthStencil.depth; 2471 } 2472 2473 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { 2474 s_clear = true; 2475 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff; 2476 } 2477 } 2478 } 2479 2480 /* We may not know the multisample count if there are no attachments, so 2481 * just bail early to avoid corner cases later. 2482 */ 2483 if (clear_rts == 0 && !z_clear && !s_clear) 2484 return; 2485 2486 /* disable all draw states so they don't interfere 2487 * TODO: use and re-use draw states 2488 * we have to disable draw states individually to preserve 2489 * input attachment states, because a secondary command buffer 2490 * won't be able to restore them 2491 */ 2492 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2)); 2493 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) { 2494 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM || 2495 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM) 2496 continue; 2497 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) | 2498 CP_SET_DRAW_STATE__0_DISABLE); 2499 tu_cs_emit_qw(cs, 0); 2500 } 2501 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE; 2502 2503 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); 2504 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) | 2505 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) | 2506 0xfc000000); 2507 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count)); 2508 2509 r3d_common(cmd, cs, false, clear_rts, false, cmd->state.subpass->samples); 2510 2511 /* Disable sample counting in order to not affect occlusion query. */ 2512 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true)); 2513 2514 if (cmd->state.prim_generated_query_running_before_rp) { 2515 tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS); 2516 } 2517 2518 tu_cs_emit_regs(cs, 2519 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components)); 2520 tu_cs_emit_regs(cs, 2521 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components)); 2522 2523 tu_cs_emit_regs(cs, 2524 A6XX_RB_FS_OUTPUT_CNTL0(), 2525 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count)); 2526 2527 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL()); 2528 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff)); 2529 for (uint32_t i = 0; i < mrt_count; i++) { 2530 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i, 2531 .component_enable = COND(clear_rts & (1 << i), 0xf))); 2532 } 2533 2534 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0)); 2535 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0)); 2536 2537 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL()); 2538 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL( 2539 .z_test_enable = z_clear, 2540 .z_write_enable = z_clear, 2541 .zfunc = FUNC_ALWAYS)); 2542 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL()); 2543 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL( 2544 .stencil_enable = s_clear, 2545 .func = FUNC_ALWAYS, 2546 .zpass = STENCIL_REPLACE)); 2547 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff)); 2548 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff)); 2549 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val)); 2550 2551 unsigned num_rts = util_bitcount(clear_rts); 2552 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts); 2553 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 2554 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 2555 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 2556 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) | 2557 CP_LOAD_STATE6_0_NUM_UNIT(num_rts)); 2558 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 2559 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 2560 u_foreach_bit(b, clear_rts) 2561 tu_cs_emit_array(cs, clear_value[b], 4); 2562 2563 for (uint32_t i = 0; i < rect_count; i++) { 2564 /* This should be true because of this valid usage for 2565 * vkCmdClearAttachments: 2566 * 2567 * "If the render pass instance this is recorded in uses multiview, 2568 * then baseArrayLayer must be zero and layerCount must be one" 2569 */ 2570 assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0); 2571 2572 /* a630 doesn't support multiview masks, which means that we can't use 2573 * the normal multiview path without potentially recompiling a shader 2574 * on-demand or using a more complicated variant that takes the mask as 2575 * a const. Just use the layered path instead, since it shouldn't be 2576 * much worse. 2577 */ 2578 for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) { 2579 r3d_coords_raw(cs, (float[]) { 2580 rects[i].rect.offset.x, rects[i].rect.offset.y, 2581 z_clear_val, uif(rects[i].baseArrayLayer + layer), 2582 rects[i].rect.offset.x + rects[i].rect.extent.width, 2583 rects[i].rect.offset.y + rects[i].rect.extent.height, 2584 z_clear_val, 1.0f, 2585 }); 2586 r3d_run_vis(cmd, cs); 2587 } 2588 } 2589 2590 /* Re-enable sample counting. */ 2591 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false)); 2592 2593 if (cmd->state.prim_generated_query_running_before_rp) { 2594 tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS); 2595 } 2596 2597 trace_end_sysmem_clear_all(&cmd->trace, 2598 cs, mrt_count, rect_count); 2599} 2600 2601static void 2602pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4]) 2603{ 2604 switch (format) { 2605 case PIPE_FORMAT_Z24X8_UNORM: 2606 case PIPE_FORMAT_Z24_UNORM_S8_UINT: 2607 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) | 2608 val->depthStencil.stencil << 24; 2609 return; 2610 case PIPE_FORMAT_Z16_UNORM: 2611 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16); 2612 return; 2613 case PIPE_FORMAT_Z32_FLOAT: 2614 clear_value[0] = fui(val->depthStencil.depth); 2615 return; 2616 case PIPE_FORMAT_S8_UINT: 2617 clear_value[0] = val->depthStencil.stencil; 2618 return; 2619 default: 2620 break; 2621 } 2622 2623 float tmp[4]; 2624 memcpy(tmp, val->color.float32, 4 * sizeof(float)); 2625 if (util_format_is_srgb(format)) { 2626 for (int i = 0; i < 3; i++) 2627 tmp[i] = util_format_linear_to_srgb_float(tmp[i]); 2628 } 2629 2630#define PACK_F(type) util_format_##type##_pack_rgba_float \ 2631 ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1) 2632 switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) { 2633 case 4: 2634 PACK_F(r4g4b4a4_unorm); 2635 break; 2636 case 5: 2637 if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6) 2638 PACK_F(r5g6b5_unorm); 2639 else 2640 PACK_F(r5g5b5a1_unorm); 2641 break; 2642 case 8: 2643 if (util_format_is_snorm(format)) 2644 PACK_F(r8g8b8a8_snorm); 2645 else if (util_format_is_unorm(format)) 2646 PACK_F(r8g8b8a8_unorm); 2647 else 2648 pack_int8(clear_value, val->color.uint32); 2649 break; 2650 case 10: 2651 if (util_format_is_pure_integer(format)) 2652 pack_int10_2(clear_value, val->color.uint32); 2653 else 2654 PACK_F(r10g10b10a2_unorm); 2655 break; 2656 case 11: 2657 clear_value[0] = float3_to_r11g11b10f(val->color.float32); 2658 break; 2659 case 16: 2660 if (util_format_is_snorm(format)) 2661 PACK_F(r16g16b16a16_snorm); 2662 else if (util_format_is_unorm(format)) 2663 PACK_F(r16g16b16a16_unorm); 2664 else if (util_format_is_float(format)) 2665 PACK_F(r16g16b16a16_float); 2666 else 2667 pack_int16(clear_value, val->color.uint32); 2668 break; 2669 case 32: 2670 memcpy(clear_value, val->color.float32, 4 * sizeof(float)); 2671 break; 2672 default: 2673 unreachable("unexpected channel size"); 2674 } 2675#undef PACK_F 2676} 2677 2678static void 2679clear_gmem_attachment(struct tu_cmd_buffer *cmd, 2680 struct tu_cs *cs, 2681 enum pipe_format format, 2682 uint8_t clear_mask, 2683 uint32_t gmem_offset, 2684 const VkClearValue *value) 2685{ 2686 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); 2687 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format))); 2688 2689 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask)); 2690 2691 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); 2692 tu_cs_emit(cs, gmem_offset); 2693 2694 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); 2695 tu_cs_emit(cs, 0); 2696 2697 uint32_t clear_vals[4] = {}; 2698 pack_gmem_clear_value(value, format, clear_vals); 2699 2700 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); 2701 tu_cs_emit_array(cs, clear_vals, 4); 2702 2703 tu6_emit_event_write(cmd, cs, BLIT); 2704} 2705 2706static void 2707tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, 2708 struct tu_cs *cs, 2709 uint32_t attachment, 2710 VkImageAspectFlags mask, 2711 const VkClearValue *value) 2712{ 2713 const struct tu_render_pass_attachment *att = 2714 &cmd->state.pass->attachments[attachment]; 2715 2716 trace_start_gmem_clear(&cmd->trace, cs); 2717 2718 enum pipe_format format = tu_vk_format_to_pipe_format(att->format); 2719 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 2720 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) 2721 clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, tu_attachment_gmem_offset(cmd, att), value); 2722 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) 2723 clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, tu_attachment_gmem_offset_stencil(cmd, att), value); 2724 return; 2725 } 2726 2727 clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), 2728 tu_attachment_gmem_offset(cmd, att), value); 2729 2730 trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples); 2731} 2732 2733static void 2734tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, 2735 uint32_t attachment_count, 2736 const VkClearAttachment *attachments, 2737 uint32_t rect_count, 2738 const VkClearRect *rects) 2739{ 2740 const struct tu_subpass *subpass = cmd->state.subpass; 2741 struct tu_cs *cs = &cmd->draw_cs; 2742 2743 if (rect_count > 1) 2744 perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream"); 2745 2746 for (unsigned i = 0; i < rect_count; i++) { 2747 unsigned x1 = rects[i].rect.offset.x; 2748 unsigned y1 = rects[i].rect.offset.y; 2749 unsigned x2 = x1 + rects[i].rect.extent.width - 1; 2750 unsigned y2 = y1 + rects[i].rect.extent.height - 1; 2751 2752 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2); 2753 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1)); 2754 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2)); 2755 2756 for (unsigned j = 0; j < attachment_count; j++) { 2757 uint32_t a; 2758 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) 2759 a = subpass->color_attachments[attachments[j].colorAttachment].attachment; 2760 else 2761 a = subpass->depth_stencil_attachment.attachment; 2762 2763 if (a == VK_ATTACHMENT_UNUSED) 2764 continue; 2765 2766 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask, 2767 &attachments[j].clearValue); 2768 } 2769 } 2770} 2771 2772VKAPI_ATTR void VKAPI_CALL 2773tu_CmdClearAttachments(VkCommandBuffer commandBuffer, 2774 uint32_t attachmentCount, 2775 const VkClearAttachment *pAttachments, 2776 uint32_t rectCount, 2777 const VkClearRect *pRects) 2778{ 2779 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); 2780 struct tu_cs *cs = &cmd->draw_cs; 2781 2782 /* sysmem path behaves like a draw, note we don't have a way of using different 2783 * flushes for sysmem/gmem, so this needs to be outside of the cond_exec 2784 */ 2785 tu_emit_cache_flush_renderpass(cmd, cs); 2786 2787 for (uint32_t j = 0; j < attachmentCount; j++) { 2788 if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0) 2789 continue; 2790 2791 tu_lrz_disable_during_renderpass(cmd); 2792 } 2793 2794 /* vkCmdClearAttachments is supposed to respect the predicate if active. The 2795 * easiest way to do this is to always use the 3d path, which always works 2796 * even with GMEM because it's just a simple draw using the existing 2797 * attachment state. 2798 * 2799 * Similarly, we also use the 3D path when in a secondary command buffer that 2800 * doesn't know the GMEM layout that will be chosen by the primary. 2801 */ 2802 if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) { 2803 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); 2804 return; 2805 } 2806 2807 /* If we could skip tile load/stores based on any draws intersecting them at 2808 * binning time, then emit the clear as a 3D draw so that it contributes to 2809 * that visibility. 2810 */ 2811 const struct tu_subpass *subpass = cmd->state.subpass; 2812 for (uint32_t i = 0; i < attachmentCount; i++) { 2813 uint32_t a; 2814 if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { 2815 uint32_t c = pAttachments[i].colorAttachment; 2816 a = subpass->color_attachments[c].attachment; 2817 } else { 2818 a = subpass->depth_stencil_attachment.attachment; 2819 } 2820 if (a != VK_ATTACHMENT_UNUSED) { 2821 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; 2822 if (att->cond_load_allowed || att->cond_store_allowed) { 2823 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); 2824 return; 2825 } 2826 } 2827 } 2828 2829 /* Otherwise, emit 2D blits for gmem rendering. */ 2830 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); 2831 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); 2832 tu_cond_exec_end(cs); 2833 2834 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); 2835 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); 2836 tu_cond_exec_end(cs); 2837} 2838 2839static void 2840clear_sysmem_attachment(struct tu_cmd_buffer *cmd, 2841 struct tu_cs *cs, 2842 VkFormat vk_format, 2843 VkImageAspectFlags clear_mask, 2844 const VkClearValue *value, 2845 uint32_t a, 2846 bool separate_ds) 2847{ 2848 enum pipe_format format = tu_vk_format_to_pipe_format(vk_format); 2849 const struct tu_framebuffer *fb = cmd->state.framebuffer; 2850 const struct tu_image_view *iview = cmd->state.attachments[a]; 2851 const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views; 2852 const struct blit_ops *ops = &r2d_ops; 2853 if (cmd->state.pass->attachments[a].samples > 1) 2854 ops = &r3d_ops; 2855 2856 trace_start_sysmem_clear(&cmd->trace, cs); 2857 2858 ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled, 2859 cmd->state.pass->attachments[a].samples); 2860 ops->coords(cs, &cmd->state.render_area.offset, NULL, 2861 &cmd->state.render_area.extent); 2862 ops->clear_value(cs, format, value); 2863 2864 for_each_layer(i, clear_views, fb->layers) { 2865 if (separate_ds) { 2866 if (vk_format == VK_FORMAT_D32_SFLOAT) { 2867 ops->dst_depth(cs, iview, i); 2868 } else { 2869 ops->dst_stencil(cs, iview, i); 2870 } 2871 } else { 2872 ops->dst(cs, &iview->view, i, format); 2873 } 2874 ops->run(cmd, cs); 2875 } 2876 2877 ops->teardown(cmd, cs); 2878 2879 trace_end_sysmem_clear(&cmd->trace, cs, 2880 vk_format, ops == &r3d_ops, 2881 cmd->state.pass->attachments[a].samples); 2882} 2883 2884void 2885tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, 2886 struct tu_cs *cs, 2887 uint32_t a, 2888 const VkClearValue *value) 2889{ 2890 const struct tu_render_pass_attachment *attachment = 2891 &cmd->state.pass->attachments[a]; 2892 2893 if (!attachment->clear_mask) 2894 return; 2895 2896 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 2897 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) { 2898 clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT, 2899 value, a, true); 2900 } 2901 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) { 2902 clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 2903 value, a, true); 2904 } 2905 } else { 2906 clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask, 2907 value, a, false); 2908 } 2909 2910 /* The spec doesn't explicitly say, but presumably the initial renderpass 2911 * clear is considered part of the renderpass, and therefore barriers 2912 * aren't required inside the subpass/renderpass. Therefore we need to 2913 * flush CCU color into CCU depth here, just like with 2914 * vkCmdClearAttachments(). Note that because this only happens at the 2915 * beginning of a renderpass, and renderpass writes are considered 2916 * "incoherent", we shouldn't have to worry about syncing depth into color 2917 * beforehand as depth should already be flushed. 2918 */ 2919 if (vk_format_is_depth_or_stencil(attachment->format)) { 2920 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); 2921 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); 2922 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH); 2923 } else { 2924 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); 2925 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); 2926 } 2927 2928 if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug) 2929 tu_cs_emit_wfi(cs); 2930} 2931 2932void 2933tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, 2934 struct tu_cs *cs, 2935 uint32_t a, 2936 const VkClearValue *value) 2937{ 2938 const struct tu_render_pass_attachment *attachment = 2939 &cmd->state.pass->attachments[a]; 2940 2941 if (!attachment->clear_mask) 2942 return; 2943 2944 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples))); 2945 2946 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask, value); 2947} 2948 2949static void 2950tu_emit_blit(struct tu_cmd_buffer *cmd, 2951 struct tu_cs *cs, 2952 const struct tu_image_view *iview, 2953 const struct tu_render_pass_attachment *attachment, 2954 bool resolve, 2955 bool separate_stencil) 2956{ 2957 tu_cs_emit_regs(cs, 2958 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples))); 2959 2960 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO( 2961 .unk0 = !resolve, 2962 .gmem = !resolve, 2963 .sample_0 = vk_format_is_int(attachment->format) || 2964 vk_format_is_depth_or_stencil(attachment->format))); 2965 2966 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4); 2967 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 2968 if (!separate_stencil) { 2969 tu_cs_emit(cs, tu_image_view_depth(iview, RB_BLIT_DST_INFO)); 2970 tu_cs_emit_qw(cs, iview->depth_base_addr); 2971 tu_cs_emit(cs, iview->depth_PITCH); 2972 2973 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3); 2974 tu_cs_image_flag_ref(cs, &iview->view, 0); 2975 } else { 2976 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS); 2977 tu_cs_emit_qw(cs, iview->stencil_base_addr); 2978 tu_cs_emit(cs, iview->stencil_PITCH); 2979 } 2980 } else { 2981 tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO); 2982 tu_cs_image_ref_2d(cs, &iview->view, 0, false); 2983 2984 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3); 2985 tu_cs_image_flag_ref(cs, &iview->view, 0); 2986 } 2987 2988 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) { 2989 tu_cs_emit_regs(cs, 2990 A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment))); 2991 } else { 2992 tu_cs_emit_regs(cs, 2993 A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment))); 2994 } 2995 2996 tu6_emit_event_write(cmd, cs, BLIT); 2997} 2998 2999static bool 3000blit_can_resolve(VkFormat format) 3001{ 3002 const struct util_format_description *desc = vk_format_description(format); 3003 3004 /* blit event can only do resolve for simple cases: 3005 * averaging samples as unsigned integers or choosing only one sample 3006 */ 3007 if (vk_format_is_snorm(format) || vk_format_is_srgb(format)) 3008 return false; 3009 3010 /* can't do formats with larger channel sizes 3011 * note: this includes all float formats 3012 * note2: single channel integer formats seem OK 3013 */ 3014 if (desc->channel[0].size > 10) 3015 return false; 3016 3017 switch (format) { 3018 /* for unknown reasons blit event can't msaa resolve these formats when tiled 3019 * likely related to these formats having different layout from other cpp=2 formats 3020 */ 3021 case VK_FORMAT_R8G8_UNORM: 3022 case VK_FORMAT_R8G8_UINT: 3023 case VK_FORMAT_R8G8_SINT: 3024 /* TODO: this one should be able to work? */ 3025 case VK_FORMAT_D24_UNORM_S8_UINT: 3026 return false; 3027 default: 3028 break; 3029 } 3030 3031 return true; 3032} 3033 3034static void 3035tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd, 3036 struct tu_cs *cs, bool load) 3037{ 3038 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); 3039 3040 if (!unlikely(cmd->device->physical_device->instance->debug_flags & 3041 TU_DEBUG_LOG_SKIP_GMEM_OPS)) 3042 return; 3043 3044 uint64_t result_iova; 3045 if (load) 3046 result_iova = global_iova(cmd, dbg_gmem_taken_loads); 3047 else 3048 result_iova = global_iova(cmd, dbg_gmem_taken_stores); 3049 3050 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7); 3051 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B); 3052 tu_cs_emit_qw(cs, result_iova); 3053 tu_cs_emit_qw(cs, result_iova); 3054 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one)); 3055} 3056 3057static void 3058tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd, 3059 struct tu_cs *cs, bool load) 3060{ 3061 tu_cond_exec_end(cs); 3062 3063 if (!unlikely(cmd->device->physical_device->instance->debug_flags & 3064 TU_DEBUG_LOG_SKIP_GMEM_OPS)) 3065 return; 3066 3067 uint64_t result_iova; 3068 if (load) 3069 result_iova = global_iova(cmd, dbg_gmem_total_loads); 3070 else 3071 result_iova = global_iova(cmd, dbg_gmem_total_stores); 3072 3073 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7); 3074 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B); 3075 tu_cs_emit_qw(cs, result_iova); 3076 tu_cs_emit_qw(cs, result_iova); 3077 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one)); 3078} 3079 3080void 3081tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, 3082 struct tu_cs *cs, 3083 uint32_t a, 3084 bool cond_exec_allowed, 3085 bool force_load) 3086{ 3087 const struct tu_image_view *iview = cmd->state.attachments[a]; 3088 const struct tu_render_pass_attachment *attachment = 3089 &cmd->state.pass->attachments[a]; 3090 3091 bool load_common = attachment->load || force_load; 3092 bool load_stencil = 3093 attachment->load_stencil || 3094 (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load); 3095 3096 if (!load_common && !load_stencil) 3097 return; 3098 3099 trace_start_gmem_load(&cmd->trace, cs); 3100 3101 /* If attachment will be cleared by vkCmdClearAttachments - it is likely 3102 * that it would be partially cleared, and since it is done by 2d blit 3103 * it doesn't produce geometry, so we have to unconditionally load. 3104 * 3105 * To simplify conditions treat partially cleared separate DS as fully 3106 * cleared and don't emit cond_exec. 3107 */ 3108 bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed; 3109 if (cond_exec) 3110 tu_begin_load_store_cond_exec(cmd, cs, true); 3111 3112 if (load_common) 3113 tu_emit_blit(cmd, cs, iview, attachment, false, false); 3114 3115 if (load_stencil) 3116 tu_emit_blit(cmd, cs, iview, attachment, false, true); 3117 3118 if (cond_exec) 3119 tu_end_load_store_cond_exec(cmd, cs, true); 3120 3121 trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load); 3122} 3123 3124static void 3125store_cp_blit(struct tu_cmd_buffer *cmd, 3126 struct tu_cs *cs, 3127 const struct tu_image_view *iview, 3128 uint32_t samples, 3129 bool separate_stencil, 3130 enum pipe_format src_format, 3131 enum pipe_format dst_format, 3132 uint32_t gmem_offset, 3133 uint32_t cpp) 3134{ 3135 r2d_setup_common(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, 3136 iview->view.ubwc_enabled, true); 3137 3138 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 3139 if (!separate_stencil) { 3140 r2d_dst_depth(cs, iview, 0); 3141 } else { 3142 r2d_dst_stencil(cs, iview, 0); 3143 } 3144 } else { 3145 r2d_dst(cs, &iview->view, 0, src_format); 3146 } 3147 3148 enum a6xx_format fmt = tu6_format_texture(src_format, TILE6_2).fmt; 3149 fixup_src_format(&src_format, dst_format, &fmt); 3150 3151 tu_cs_emit_regs(cs, 3152 A6XX_SP_PS_2D_SRC_INFO( 3153 .color_format = fmt, 3154 .color_swap = WZYX, 3155 .tile_mode = TILE6_2, 3156 .srgb = util_format_is_srgb(src_format), 3157 .samples = tu_msaa_samples(samples), 3158 .samples_average = !util_format_is_pure_integer(dst_format) && 3159 !util_format_is_depth_or_stencil(dst_format), 3160 .unk20 = 1, 3161 .unk22 = 1), 3162 /* note: src size does not matter when not scaling */ 3163 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff), 3164 A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset), 3165 A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.tiling->tile0.width * cpp)); 3166 3167 /* sync GMEM writes with CACHE. */ 3168 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); 3169 3170 /* Wait for CACHE_INVALIDATE to land */ 3171 tu_cs_emit_wfi(cs); 3172 3173 tu_cs_emit_pkt7(cs, CP_BLIT, 1); 3174 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE)); 3175 3176 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to 3177 * sysmem, and we generally assume that GMEM renderpasses leave their 3178 * results in sysmem, so we need to flush manually here. 3179 */ 3180 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); 3181} 3182 3183static void 3184store_3d_blit(struct tu_cmd_buffer *cmd, 3185 struct tu_cs *cs, 3186 const struct tu_image_view *iview, 3187 uint32_t dst_samples, 3188 bool separate_stencil, 3189 enum pipe_format src_format, 3190 enum pipe_format dst_format, 3191 const VkRect2D *render_area, 3192 uint32_t gmem_offset, 3193 uint32_t cpp) 3194{ 3195 /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they 3196 * aren't set until we know whether we're HW binning or not, and we want to 3197 * avoid a dependence on that here to be able to store attachments before 3198 * the end of the renderpass in the future. Use the scratch space to 3199 * save/restore them dynamically. 3200 */ 3201 tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1); 3202 tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) | 3203 CP_REG_TO_SCRATCH_0_SCRATCH(0) | 3204 CP_REG_TO_SCRATCH_0_CNT(1 - 1)); 3205 3206 r3d_setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, 3207 iview->view.ubwc_enabled, dst_samples); 3208 3209 r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); 3210 3211 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { 3212 if (!separate_stencil) { 3213 r3d_dst_depth(cs, iview, 0); 3214 } else { 3215 r3d_dst_stencil(cs, iview, 0); 3216 } 3217 } else { 3218 r3d_dst(cs, &iview->view, 0, src_format); 3219 } 3220 3221 r3d_src_gmem(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp); 3222 3223 /* sync GMEM writes with CACHE. */ 3224 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); 3225 3226 /* Wait for CACHE_INVALIDATE to land */ 3227 tu_cs_emit_wfi(cs); 3228 3229 r3d_run(cmd, cs); 3230 3231 r3d_teardown(cmd, cs); 3232 3233 /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to 3234 * sysmem, and we generally assume that GMEM renderpasses leave their 3235 * results in sysmem, so we need to flush manually here. The 3d blit path 3236 * writes to depth images as a color RT, so there's no need to flush depth. 3237 */ 3238 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); 3239 3240 /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */ 3241 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1); 3242 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) | 3243 CP_SCRATCH_TO_REG_0_SCRATCH(0) | 3244 CP_SCRATCH_TO_REG_0_CNT(1 - 1)); 3245 3246 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1); 3247 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) | 3248 CP_SCRATCH_TO_REG_0_SCRATCH(0) | 3249 CP_SCRATCH_TO_REG_0_CNT(1 - 1)); 3250} 3251 3252static bool 3253tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a) 3254{ 3255 struct tu_physical_device *phys_dev = cmd->device->physical_device; 3256 const struct tu_image_view *iview = cmd->state.attachments[a]; 3257 const VkRect2D *render_area = &cmd->state.render_area; 3258 3259 /* Unaligned store is incredibly rare in CTS, we have to force it to test. */ 3260 if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE)) 3261 return true; 3262 3263 uint32_t x1 = render_area->offset.x; 3264 uint32_t y1 = render_area->offset.y; 3265 uint32_t x2 = x1 + render_area->extent.width; 3266 uint32_t y2 = y1 + render_area->extent.height; 3267 /* x2/y2 can be unaligned if equal to the size of the image, since it will 3268 * write into padding space. The one exception is linear levels which don't 3269 * have the required y padding in the layout (except for the last level) 3270 */ 3271 bool need_y2_align = 3272 y2 != iview->view.height || iview->view.need_y2_align; 3273 3274 return (x1 % phys_dev->info->gmem_align_w || 3275 (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) || 3276 y1 % phys_dev->info->gmem_align_h || 3277 (y2 % phys_dev->info->gmem_align_h && need_y2_align)); 3278} 3279 3280/* Choose the GMEM layout (use the CCU space or not) based on whether the 3281 * current attachments will need. This has to happen at vkBeginRenderPass() 3282 * time because tu_attachment_store_unaligned() looks at the image views, which 3283 * are only available at that point. This should match the logic for the 3284 * !unaligned case in tu_store_gmem_attachment(). 3285 */ 3286void 3287tu_choose_gmem_layout(struct tu_cmd_buffer *cmd) 3288{ 3289 cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL; 3290 3291 for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) { 3292 if (!cmd->state.attachments[i]) 3293 continue; 3294 3295 struct tu_render_pass_attachment *att = 3296 &cmd->state.pass->attachments[i]; 3297 if ((att->store || att->store_stencil) && 3298 tu_attachment_store_unaligned(cmd, i)) 3299 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU; 3300 if (att->will_be_resolved && !blit_can_resolve(att->format)) 3301 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU; 3302 } 3303 3304 cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; 3305} 3306 3307void 3308tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, 3309 struct tu_cs *cs, 3310 uint32_t a, 3311 uint32_t gmem_a, 3312 bool cond_exec_allowed) 3313{ 3314 const VkRect2D *render_area = &cmd->state.render_area; 3315 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a]; 3316 const struct tu_image_view *iview = cmd->state.attachments[a]; 3317 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a]; 3318 3319 if (!dst->store && !dst->store_stencil) 3320 return; 3321 3322 /* Unconditional store should happen only if attachment was cleared, 3323 * which could have happened either by load_op or via vkCmdClearAttachments. 3324 */ 3325 bool cond_exec = cond_exec_allowed && src->cond_store_allowed; 3326 if (cond_exec) { 3327 tu_begin_load_store_cond_exec(cmd, cs, false); 3328 } 3329 3330 bool unaligned = tu_attachment_store_unaligned(cmd, a); 3331 3332 /* D32_SFLOAT_S8_UINT is quite special format: it has two planes, 3333 * one for depth and other for stencil. When resolving a MSAA 3334 * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account. 3335 */ 3336 bool resolve_d32s8_s8 = 3337 src->format == VK_FORMAT_D32_SFLOAT_S8_UINT && 3338 dst->format == VK_FORMAT_S8_UINT; 3339 3340 /* The fast path doesn't support picking out the last component of a D24S8 3341 * texture reinterpreted as RGBA8_UNORM. 3342 */ 3343 bool resolve_d24s8_s8 = 3344 src->format == VK_FORMAT_D24_UNORM_S8_UINT && 3345 dst->format == VK_FORMAT_S8_UINT; 3346 3347 bool store_common = dst->store && !resolve_d32s8_s8; 3348 bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8; 3349 3350 trace_start_gmem_store(&cmd->trace, cs); 3351 3352 /* use fast path when render area is aligned, except for unsupported resolve cases */ 3353 if (!unaligned && !resolve_d24s8_s8 && 3354 (a == gmem_a || blit_can_resolve(dst->format))) { 3355 if (store_common) 3356 tu_emit_blit(cmd, cs, iview, src, true, false); 3357 if (store_separate_stencil) 3358 tu_emit_blit(cmd, cs, iview, src, true, true); 3359 3360 if (cond_exec) { 3361 tu_end_load_store_cond_exec(cmd, cs, false); 3362 } 3363 3364 trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false); 3365 return; 3366 } 3367 3368 assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU); 3369 3370 enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format); 3371 if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) 3372 src_format = PIPE_FORMAT_Z32_FLOAT; 3373 3374 enum pipe_format dst_format = tu_vk_format_to_pipe_format(dst->format); 3375 if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) 3376 dst_format = PIPE_FORMAT_Z32_FLOAT; 3377 3378 if (dst->samples > 1) { 3379 /* If we hit this path, we have to disable draw states after every tile 3380 * instead of once at the end of the renderpass, so that they aren't 3381 * executed when calling CP_DRAW. 3382 * 3383 * TODO: store a flag somewhere so we don't do this more than once and 3384 * don't do it after the renderpass when this happens. 3385 */ 3386 if (store_common || store_separate_stencil) 3387 tu_disable_draw_states(cmd, cs); 3388 3389 if (store_common) { 3390 store_3d_blit(cmd, cs, iview, dst->samples, false, src_format, 3391 dst_format, render_area, tu_attachment_gmem_offset(cmd, src), src->cpp); 3392 } 3393 if (store_separate_stencil) { 3394 store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT, 3395 PIPE_FORMAT_S8_UINT, render_area, 3396 tu_attachment_gmem_offset_stencil(cmd, src), src->samples); 3397 } 3398 } else { 3399 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); 3400 3401 if (store_common) { 3402 store_cp_blit(cmd, cs, iview, src->samples, false, src_format, 3403 dst_format, tu_attachment_gmem_offset(cmd, src), src->cpp); 3404 } 3405 if (store_separate_stencil) { 3406 store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT, 3407 PIPE_FORMAT_S8_UINT, tu_attachment_gmem_offset_stencil(cmd, src), src->samples); 3408 } 3409 } 3410 3411 if (cond_exec) { 3412 tu_end_load_store_cond_exec(cmd, cs, false); 3413 } 3414 3415 trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned); 3416} 3417