/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_meta.h"
#include "radv_private.h"
#include "radv_radeon_winsys.h"
#include "radv_shader.h"
#include "sid.h"
#include "vk_format.h"
#include "vk_util.h"
#include "vk_enum_defines.h"
#include "vk_common_entrypoints.h"

#include "ac_debug.h"
#include "ac_shader_args.h"

#include "util/fast_idiv_by_const.h"

enum {
   RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
   RADV_PREFETCH_VS = (1 << 1),
   RADV_PREFETCH_TCS = (1 << 2),
   RADV_PREFETCH_TES = (1 << 3),
   RADV_PREFETCH_GS = (1 << 4),
   RADV_PREFETCH_PS = (1 << 5),
   RADV_PREFETCH_MS = (1 << 6),
   RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
                            RADV_PREFETCH_GS | RADV_PREFETCH_PS | RADV_PREFETCH_MS)
};

static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image, VkImageLayout src_layout,
                                         bool src_render_loop, VkImageLayout dst_layout,
                                         bool dst_render_loop, uint32_t src_family_index,
                                         uint32_t dst_family_index,
                                         const VkImageSubresourceRange *range,
                                         struct radv_sample_locations_state *sample_locs);

static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);

const struct radv_dynamic_state default_dynamic_state = {
   .viewport =
      {
         .count = 0,
      },
   .scissor =
      {
         .count = 0,
      },
   .line_width = 1.0f,
   .depth_bias =
      {
         .bias = 0.0f,
         .clamp = 0.0f,
         .slope = 0.0f,
      },
   .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
   .depth_bounds =
      {
         .min = 0.0f,
         .max = 1.0f,
      },
   .stencil_compare_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_write_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_reference =
      {
         .front = 0u,
         .back = 0u,
      },
   .line_stipple =
      {
         .factor = 0u,
         .pattern = 0u,
      },
   .cull_mode = 0u,
   .front_face = 0u,
   .primitive_topology = 0u,
   .fragment_shading_rate =
      {
         .size = {1u, 1u},
         .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
                          VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
      },
   .depth_bias_enable = 0u,
   .primitive_restart_enable = 0u,
   .rasterizer_discard_enable = 0u,
   .logic_op = 0u,
   .color_write_enable = 0xffffffffu,
};

static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
{
   struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint64_t copy_mask = src->mask;
   uint64_t dest_mask = 0;

   dest->discard_rectangle.count = src->discard_rectangle.count;
   dest->sample_location.count = src->sample_location.count;

   if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
      if (dest->viewport.count != src->viewport.count) {
         dest->viewport.count = src->viewport.count;
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }

      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
         typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SCISSOR) {
      if (dest->scissor.count != src->scissor.count) {
         dest->scissor.count = src->scissor.count;
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }

      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
      if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
         dest->depth_bias = src->depth_bias;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
      if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
         typed_memcpy(dest->blend_constants, src->blend_constants, 4);
         dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
      if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
         dest->depth_bounds = src->depth_bounds;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
      if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
                 src->discard_rectangle.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
                      src->discard_rectangle.count);
         dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
      if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
          dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
          dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
          memcmp(&dest->sample_location.locations, &src->sample_location.locations,
                 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
         dest->sample_location.per_pixel = src->sample_location.per_pixel;
         dest->sample_location.grid_size = src->sample_location.grid_size;
         typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
                      src->sample_location.count);
         dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
      if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
         dest->line_stipple = src->line_stipple;
         dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
      if (dest->cull_mode != src->cull_mode) {
         dest->cull_mode = src->cull_mode;
         dest_mask |= RADV_DYNAMIC_CULL_MODE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
      if (dest->front_face != src->front_face) {
         dest->front_face = src->front_face;
         dest_mask |= RADV_DYNAMIC_FRONT_FACE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
      if (dest->primitive_topology != src->primitive_topology) {
         dest->primitive_topology = src->primitive_topology;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
      if (dest->depth_test_enable != src->depth_test_enable) {
         dest->depth_test_enable = src->depth_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
      if (dest->depth_write_enable != src->depth_write_enable) {
         dest->depth_write_enable = src->depth_write_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
      if (dest->depth_compare_op != src->depth_compare_op) {
         dest->depth_compare_op = src->depth_compare_op;
         dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
      if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
         dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
      if (dest->stencil_test_enable != src->stencil_test_enable) {
         dest->stencil_test_enable = src->stencil_test_enable;
         dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
      if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
         dest->stencil_op = src->stencil_op;
         dest_mask |= RADV_DYNAMIC_STENCIL_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
      if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
                 sizeof(src->fragment_shading_rate))) {
         dest->fragment_shading_rate = src->fragment_shading_rate;
         dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
      if (dest->depth_bias_enable != src->depth_bias_enable) {
         dest->depth_bias_enable = src->depth_bias_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
      if (dest->primitive_restart_enable != src->primitive_restart_enable) {
         dest->primitive_restart_enable = src->primitive_restart_enable;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
      if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
         dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
         dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
      if (dest->logic_op != src->logic_op) {
         dest->logic_op = src->logic_op;
         dest_mask |= RADV_DYNAMIC_LOGIC_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
      if (dest->color_write_enable != src->color_write_enable) {
         dest->color_write_enable = src->color_write_enable;
         dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
      }
   }

   cmd_buffer->state.dirty |= dest_mask;
}

bool
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
          cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
}

enum amd_ip_type
radv_queue_family_to_ring(struct radv_physical_device *physical_device,
                          enum radv_queue_family f)
{
   switch (f) {
   case RADV_QUEUE_GENERAL:
      return AMD_IP_GFX;
   case RADV_QUEUE_COMPUTE:
      return AMD_IP_COMPUTE;
   case RADV_QUEUE_TRANSFER:
      return AMD_IP_SDMA;
   default:
      unreachable("Unknown queue family");
   }
}

static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                            unsigned count, const uint32_t *data)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
   radeon_emit_array(cs, data, count);
}

static void
radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                     unsigned size)
{
   uint32_t *zeroes = alloca(size);
   memset(zeroes, 0, size);
   radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
}

static void
radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   util_dynarray_fini(&cmd_buffer->cached_vertex_formats);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->upload.upload_bo)
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);

   if (cmd_buffer->state.own_render_pass) {
      radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
                             radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
      cmd_buffer->state.own_render_pass = false;
   }

   if (cmd_buffer->cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
   if (cmd_buffer->ace_internal.cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->ace_internal.cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
      free(set->mapped_ptr);
      if (set->layout)
         vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->layout->vk);
      vk_object_base_finish(&set->base);
   }

   vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
}

static VkResult
radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
                       VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
{
   struct radv_cmd_buffer *cmd_buffer;
   unsigned ring;
   cmd_buffer = vk_zalloc(&pool->vk.alloc, sizeof(*cmd_buffer), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result =
      vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, level);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;

   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
   cmd_buffer->qf = vk_queue_to_radv(device->physical_device, pool->vk.queue_family_index);

   ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);

   cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
   if (!cmd_buffer->cs) {
      radv_destroy_cmd_buffer(cmd_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
                       VK_OBJECT_TYPE_DESCRIPTOR_SET);

   util_dynarray_init(&cmd_buffer->cached_vertex_formats, NULL);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
      vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
                          VK_OBJECT_TYPE_DESCRIPTOR_SET);

   *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);

   list_inithead(&cmd_buffer->upload.list);

   return VK_SUCCESS;
}

static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   vk_command_buffer_reset(&cmd_buffer->vk);

   cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
   if (cmd_buffer->ace_internal.cs)
      cmd_buffer->device->ws->cs_reset(cmd_buffer->ace_internal.cs);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->state.own_render_pass) {
      radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
                             radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
      cmd_buffer->state.own_render_pass = false;
   }

   cmd_buffer->push_constant_stages = 0;
   cmd_buffer->scratch_size_per_wave_needed = 0;
   cmd_buffer->scratch_waves_wanted = 0;
   cmd_buffer->compute_scratch_size_per_wave_needed = 0;
   cmd_buffer->compute_scratch_waves_wanted = 0;
   cmd_buffer->esgs_ring_size_needed = 0;
   cmd_buffer->gsvs_ring_size_needed = 0;
   cmd_buffer->tess_rings_needed = false;
   cmd_buffer->task_rings_needed = false;
   cmd_buffer->mesh_scratch_ring_needed = false;
   cmd_buffer->gds_needed = false;
   cmd_buffer->gds_oa_needed = false;
   cmd_buffer->sample_positions_needed = false;
   cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
   cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
   cmd_buffer->ace_internal.sem.va = 0;

   if (cmd_buffer->upload.upload_bo)
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
   cmd_buffer->upload.offset = 0;

   cmd_buffer->record_result = VK_SUCCESS;

   memset(cmd_buffer->vertex_binding_buffers, 0,
          sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
   cmd_buffer->used_vertex_bindings = 0;

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      cmd_buffer->descriptors[i].dirty = 0;
      cmd_buffer->descriptors[i].valid = 0;
      cmd_buffer->descriptors[i].push_dirty = false;
   }

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
      uint32_t pred_value = 0;
      uint32_t pred_offset;
      if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;

      cmd_buffer->mec_inv_pred_emitted = false;
      cmd_buffer->mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
   }

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
       cmd_buffer->qf == RADV_QUEUE_GENERAL) {
      unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
      unsigned fence_offset, eop_bug_offset;
      void *fence_ptr;

      radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
      memset(fence_ptr, 0, 8);

      cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
      cmd_buffer->gfx9_fence_va += fence_offset;

      radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);

      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
         /* Allocate a buffer for the EOP bug on GFX9. */
         radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
         memset(fence_ptr, 0, 16 * num_db);
         cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
         cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;

         radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
      }
   }

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
{
   uint64_t new_size;
   struct radeon_winsys_bo *bo = NULL;
   struct radv_cmd_buffer_upload *upload;
   struct radv_device *device = cmd_buffer->device;

   new_size = MAX2(min_needed, 16 * 1024);
   new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);

   VkResult result =
      device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
                                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                   RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
                                RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);

   if (result != VK_SUCCESS) {
      cmd_buffer->record_result = result;
      return false;
   }

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
   if (cmd_buffer->upload.upload_bo) {
      upload = malloc(sizeof(*upload));

      if (!upload) {
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
         device->ws->buffer_destroy(device->ws, bo);
         return false;
      }

      memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
      list_add(&upload->list, &cmd_buffer->upload.list);
   }

   cmd_buffer->upload.upload_bo = bo;
   cmd_buffer->upload.size = new_size;
   cmd_buffer->upload.offset = 0;
   cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);

   if (!cmd_buffer->upload.map) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      return false;
   }

   return true;
}

bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
                             unsigned *out_offset, void **ptr)
{
   assert(size % 4 == 0);

   struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;

   /* Align to the scalar cache line size if it results in this allocation
    * being placed in less of them.
    */
   unsigned offset = cmd_buffer->upload.offset;
   unsigned line_size = rad_info->gfx_level >= GFX10 ? 64 : 32;
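   /* Example of the heuristic above: with offset = 60, size = 8 and line_size = 64, the
    * gap to the next cache line is 4 bytes and (size & 63) = 8 > 4, so the allocation is
    * pushed to offset 64 and ends up in a single cache line instead of straddling two.
    */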
   unsigned gap = align(offset, line_size) - offset;
   if ((size & (line_size - 1)) > gap)
      offset = align(offset, line_size);

   if (offset + size > cmd_buffer->upload.size) {
      if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
         return false;
      offset = 0;
   }

   *out_offset = offset;
   *ptr = cmd_buffer->upload.map + offset;

   cmd_buffer->upload.offset = offset + size;
   return true;
}

bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
                            unsigned *out_offset)
{
   uint8_t *ptr;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
      return false;
   assert(ptr);

   memcpy(ptr, data, size);
   return true;
}

void
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
      va += 4;

   ++cmd_buffer->state.trace_id;
   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);

   radeon_check_space(cmd_buffer->device->ws, cs, 2);

   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}

static void
radv_ace_internal_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
                          VkPipelineStageFlags2 dst_stage_mask)
{
   /* Update flush bits from the main cmdbuf, except the stage flush. */
   cmd_buffer->ace_internal.flush_bits |=
      cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   /* Add stage flush only when necessary. */
   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV | VK_PIPELINE_STAGE_2_TRANSFER_BIT |
        VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
      cmd_buffer->ace_internal.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
        VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
      dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV : 0;

   /* Increment the GFX/ACE semaphore when task shaders are blocked. */
   if (dst_stage_mask &
       (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
        VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV))
      cmd_buffer->ace_internal.sem.gfx2ace_value++;
}

static void
radv_ace_internal_cache_flush(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
   const uint32_t flush_bits = cmd_buffer->ace_internal.flush_bits;
   enum rgp_flush_bits sqtt_flush_bits = 0;

   si_cs_emit_cache_flush(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
                          true, flush_bits, &sqtt_flush_bits, 0);

   cmd_buffer->ace_internal.flush_bits = 0;
}

static uint64_t
radv_ace_internal_sem_create(struct radv_cmd_buffer *cmd_buffer)
{
   /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, ie. ACE waits for GFX)
    * DWORD 1: ACE->GFX semaphore
    */
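   /* GFX signals DWORD 0 from radv_flush_gfx2ace_semaphore() and ACE waits on it in
    * radv_wait_gfx2ace_semaphore(); ACE signals DWORD 1 in radv_ace_internal_finalize()
    * so GFX knows when it is safe to clear both values again.
    */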
   uint64_t sem_init = 0;
   uint32_t va_off = 0;
   if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
      return 0;
   }

   return radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
}

static bool
radv_ace_internal_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->ace_internal.sem.gfx2ace_value !=
          cmd_buffer->ace_internal.sem.emitted_gfx2ace_value;
}

ALWAYS_INLINE static bool
radv_flush_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   if (!radv_ace_internal_sem_dirty(cmd_buffer))
      return false;

   if (!cmd_buffer->ace_internal.sem.va) {
      cmd_buffer->ace_internal.sem.va = radv_ace_internal_sem_create(cmd_buffer);
      if (!cmd_buffer->ace_internal.sem.va)
         return false;
   }

   /* GFX writes a value to the semaphore which ACE can wait for. */
   si_cs_emit_write_event_eop(
      cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
      radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
      EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->ace_internal.sem.va,
      cmd_buffer->ace_internal.sem.gfx2ace_value, cmd_buffer->gfx9_eop_bug_va);

   cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = cmd_buffer->ace_internal.sem.gfx2ace_value;
   return true;
}

ALWAYS_INLINE static void
radv_wait_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->ace_internal.sem.va);
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
   radeon_check_space(cmd_buffer->device->ws, ace_cs, 7);

   /* ACE waits for the semaphore which GFX wrote. */
   radv_cp_wait_mem(ace_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->ace_internal.sem.va,
                    cmd_buffer->ace_internal.sem.gfx2ace_value, 0xffffffff);
}

static struct radeon_cmdbuf *
radv_ace_internal_create(struct radv_cmd_buffer *cmd_buffer)
{
   assert(!cmd_buffer->ace_internal.cs);
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *ace_cs = device->ws->cs_create(device->ws, AMD_IP_COMPUTE);

   if (!ace_cs) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   return ace_cs;
}

static VkResult
radv_ace_internal_finalize(struct radv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->ace_internal.cs);
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;

   /* Emit pending cache flush. */
   radv_ace_internal_cache_flush(cmd_buffer);

   /* Clear the ACE semaphore if it exists.
    * This is necessary in case the same cmd buffer is submitted again in the future.
    */
   if (cmd_buffer->ace_internal.sem.va) {
      struct radeon_cmdbuf *main_cs = cmd_buffer->cs;
      uint64_t gfx2ace_va = cmd_buffer->ace_internal.sem.va;
      uint64_t ace2gfx_va = cmd_buffer->ace_internal.sem.va + 4;

      /* ACE: write 1 to the ACE->GFX semaphore. */
      si_cs_emit_write_event_eop(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                 true, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                                 EOP_DATA_SEL_VALUE_32BIT, ace2gfx_va, 1,
                                 cmd_buffer->gfx9_eop_bug_va);

      /* Wait for ACE to finish, otherwise we may risk writing 0 to the semaphore
       * when ACE is still waiting for it. This may not happen in practice, but
       * better safe than sorry.
       */
      radv_cp_wait_mem(main_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, ace2gfx_va, 1, 0xffffffff);

      /* GFX: clear GFX->ACE and ACE->GFX semaphores. */
      radv_emit_clear_data(cmd_buffer, V_370_ME, gfx2ace_va, 8);
   }

   device->ws->cs_add_buffers(ace_cs, cmd_buffer->cs);
   return device->ws->cs_finalize(ace_cs);
}

static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
{
   if (unlikely(cmd_buffer->device->thread_trace.bo)) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
   }

   if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
      enum rgp_flush_bits sqtt_flush_bits = 0;
      assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

      /* Force wait for graphics or compute engines to be idle. */
      si_cs_emit_cache_flush(cmd_buffer->cs,
                             cmd_buffer->device->physical_device->rad_info.gfx_level,
                             &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
                             radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
                             cmd_buffer->gfx9_eop_bug_va);

      if (cmd_buffer->state.graphics_pipeline && (flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) &&
          radv_pipeline_has_stage(cmd_buffer->state.graphics_pipeline, MESA_SHADER_TASK)) {
         /* Force wait for compute engines to be idle on the internal cmdbuf. */
         si_cs_emit_cache_flush(cmd_buffer->ace_internal.cs,
                                cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
                                true, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
      }
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);
}

static void
radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_device *device = cmd_buffer->device;
   enum amd_ip_type ring;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);

   ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);

   switch (ring) {
   case AMD_IP_GFX:
      va += 8;
      break;
   case AMD_IP_COMPUTE:
      va += 16;
      break;
   default:
      assert(!"invalid IP type");
   }

   uint64_t pipeline_address = (uintptr_t)pipeline;
   data[0] = pipeline_address;
   data[1] = pipeline_address >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

static void
radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
{
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   va += 24;

   data[0] = vb_ptr;
   data[1] = vb_ptr >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

static void
radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
{
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   va += 32;

   uint64_t prolog_address = (uintptr_t)prolog;
   data[0] = prolog_address;
   data[1] = prolog_address >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

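/* Bind a descriptor set to slot `idx` of the given bind point and mark it both valid
 * and dirty so that its pointer is (re-)emitted before the next draw or dispatch.
 */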
void
radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
                        struct radv_descriptor_set *set, unsigned idx)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->sets[idx] = set;

   descriptors_state->valid |= (1u << idx); /* active descriptors */
   descriptors_state->dirty |= (1u << idx);
}

static void
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[MAX_SETS * 2] = {0};
   uint64_t va;
   va = radv_buffer_get_va(device->trace_bo) + 40;

   u_foreach_bit(i, descriptors_state->valid)
   {
      struct radv_descriptor_set *set = descriptors_state->sets[i];
      data[i * 2] = (uint64_t)(uintptr_t)set;
      data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
   }

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
}

struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
{
   struct radv_shader *shader = radv_get_shader(pipeline, stage);
   return &shader->info.user_sgprs_locs.shader_data[idx];
}

static void
radv_emit_userdata_address(struct radv_device *device, struct radeon_cmdbuf *cs,
                           struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
                           uint64_t va)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   assert(loc->num_sgprs == 1);

   radv_emit_shader_pointer(device, cs, base_reg + loc->sgpr_idx * 4, va, false);
}

static void
radv_emit_descriptor_pointers(struct radv_device *device, struct radeon_cmdbuf *cs,
                              struct radv_pipeline *pipeline,
                              struct radv_descriptor_state *descriptors_state,
                              gl_shader_stage stage)
{
   uint32_t sh_base = pipeline->user_data_0[stage];
   struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
   unsigned mask = locs->descriptor_sets_enabled;

   mask &= descriptors_state->dirty & descriptors_state->valid;

   while (mask) {
      int start, count;

      u_bit_scan_consecutive_range(&mask, &start, &count);

      struct radv_userdata_info *loc = &locs->descriptor_sets[start];
      unsigned sh_offset = sh_base + loc->sgpr_idx * 4;

      radv_emit_shader_pointer_head(cs, sh_offset, count, true);
      for (int i = 0; i < count; i++) {
         struct radv_descriptor_set *set = descriptors_state->sets[start + i];

         radv_emit_shader_pointer_body(device, cs, set->header.va, true);
      }
   }
}

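/* Note: hardware sample locations used by the helpers below are signed 4-bit fixed-point
 * offsets in 1/16-pixel units, relative to the pixel center and clamped to [-8, 7]
 * (i.e. [-0.5, +0.4375]); the user-provided locations are converted into that encoding
 * and then packed into the PA_SC_* registers.
 */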
/**
 * Convert the user sample locations to hardware sample locations (the values
 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
 */
static void
radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
                              VkOffset2D *sample_locs)
{
   uint32_t x_offset = x % state->grid_size.width;
   uint32_t y_offset = y % state->grid_size.height;
   uint32_t num_samples = (uint32_t)state->per_pixel;
   VkSampleLocationEXT *user_locs;
   uint32_t pixel_offset;

   pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;

   assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
   user_locs = &state->locations[pixel_offset];

   for (uint32_t i = 0; i < num_samples; i++) {
      float shifted_pos_x = user_locs[i].x - 0.5;
      float shifted_pos_y = user_locs[i].y - 0.5;

      int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
      int32_t scaled_pos_y = floorf(shifted_pos_y * 16);

      sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
      sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
   }
}

/**
 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
 * locations.
 */
static void
radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
                               uint32_t *sample_locs_pixel)
{
   for (uint32_t i = 0; i < num_samples; i++) {
      uint32_t sample_reg_idx = i / 4;
      uint32_t sample_loc_idx = i % 4;
      int32_t pos_x = sample_locs[i].x;
      int32_t pos_y = sample_locs[i].y;

      uint32_t shift_x = 8 * sample_loc_idx;
      uint32_t shift_y = shift_x + 4;

      sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
      sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
   }
}

/**
 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
 * sample locations.
 */
static uint64_t
radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
                               uint32_t num_samples)
{
   uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
   uint32_t sample_mask = num_samples - 1;
   uint32_t *distances = alloca(num_samples * sizeof(*distances));
   uint64_t centroid_priority = 0;

   /* Compute the distances from center for each sample. */
   for (int i = 0; i < num_samples; i++) {
      distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
   }

   /* Compute the centroid priorities by looking at the distances array. */
   for (int i = 0; i < num_samples; i++) {
      uint32_t min_idx = 0;

      for (int j = 1; j < num_samples; j++) {
         if (distances[j] < distances[min_idx])
            min_idx = j;
      }

      centroid_priorities[i] = min_idx;
      distances[min_idx] = 0xffffffff;
   }

   /* Compute the final centroid priority. */
   for (int i = 0; i < 8; i++) {
      centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
   }

   return centroid_priority << 32 | centroid_priority;
}

/**
 * Emit the sample locations that are specified with VK_EXT_sample_locations.
 */
static void
radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
   uint32_t num_samples = (uint32_t)sample_location->per_pixel;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sample_locs_pixel[4][2] = {0};
   VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
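   /* sample_locs[0..3] hold the converted locations for pixels (0,0), (1,0), (0,1) and
    * (1,1) of the sample-location grid, matching the radv_convert_user_sample_locs()
    * calls below.
    */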
   uint32_t max_sample_dist = 0;
   uint64_t centroid_priority;

   if (!cmd_buffer->state.dynamic.sample_location.count)
      return;

   /* Convert the user sample locations to hardware sample locations. */
   radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
   radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
   radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
   radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);

   /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
   for (uint32_t i = 0; i < 4; i++) {
      radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
   }

   /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
   centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);

   /* Compute the maximum sample distance from the specified locations. */
   for (unsigned i = 0; i < 4; ++i) {
      for (uint32_t j = 0; j < num_samples; j++) {
         VkOffset2D offset = sample_locs[i][j];
         max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
      }
   }

   /* Emit the specified user sample locations. */
   switch (num_samples) {
   case 2:
   case 4:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      break;
   case 8:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
                             sample_locs_pixel[0][1]);
      radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
                             sample_locs_pixel[1][1]);
      radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
                             sample_locs_pixel[2][1]);
      radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
                             sample_locs_pixel[3][1]);
      break;
   default:
      unreachable("invalid number of samples");
   }

   /* Emit the maximum sample distance and the centroid priority. */
   radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
                              S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);

   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
                             struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
                             uint32_t *values)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);

   radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
   radeon_emit_array(cs, values, loc->num_sgprs);
}

static void
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
                              struct radv_graphics_pipeline *pipeline)
{
   int num_samples = pipeline->ms.num_samples;
   struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;

   if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
      cmd_buffer->sample_positions_needed = true;

   if (old_pipeline && num_samples == old_pipeline->ms.num_samples)
      return;

   radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer,
                          struct radv_graphics_pipeline *pipeline)
{
   const struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;

   if (pipeline->base.device->physical_device->rad_info.gfx_level < GFX9)
      return;

   if (old_pipeline &&
       old_pipeline->binning.pa_sc_binner_cntl_0 ==
          pipeline->binning.pa_sc_binner_cntl_0)
      return;

   bool binning_flush = false;
   if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
       cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
      binning_flush = !old_pipeline ||
                      G_028C44_BINNING_MODE(old_pipeline->binning.pa_sc_binner_cntl_0) !=
                         G_028C44_BINNING_MODE(pipeline->binning.pa_sc_binner_cntl_0);
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
                          pipeline->binning.pa_sc_binner_cntl_0 |
                             S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
{
   uint64_t va;

   if (!shader)
      return;

   va = radv_shader_get_va(shader);

   si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
}

static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
                      struct radv_graphics_pipeline *pipeline, bool first_stage_only)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t mask = state->prefetch_L2_mask;

   /* Fast prefetch path for starting draws as soon as possible. */
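   /* Bits are removed from prefetch_L2_mask once the corresponding prefetch has been
    * issued (see the end of this function), so a given shader or the VBO descriptor
    * range is not prefetched again until its bit is set again.
    */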
   if (first_stage_only)
      mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;

   if (mask & RADV_PREFETCH_VS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_VERTEX]);

   if (mask & RADV_PREFETCH_MS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_MESH]);

   if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
      si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);

   if (mask & RADV_PREFETCH_TCS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_CTRL]);

   if (mask & RADV_PREFETCH_TES)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_EVAL]);

   if (mask & RADV_PREFETCH_GS) {
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_GEOMETRY]);
      if (radv_pipeline_has_gs_copy_shader(&pipeline->base))
         radv_emit_shader_prefetch(cmd_buffer, pipeline->base.gs_copy_shader);
   }

   if (mask & RADV_PREFETCH_PS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_FRAGMENT]);

   state->prefetch_L2_mask &= ~mask;
}

static void
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
      return;

   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   unsigned sx_ps_downconvert = 0;
   unsigned sx_blend_opt_epsilon = 0;
   unsigned sx_blend_opt_control = 0;

   for (unsigned i = 0; i < subpass->color_count; ++i) {
      unsigned format, swap;
      bool has_alpha, has_rgb;
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         /* We don't set the DISABLE bits, because the HW can't have holes,
          * so the SPI color format is set to 32-bit 1-component. */
         sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      if (cmd_buffer->state.attachments) {
         struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;

         format = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
                     ? G_028C70_FORMAT_GFX11(cb->cb_color_info)
                     : G_028C70_FORMAT_GFX6(cb->cb_color_info);
         swap = G_028C70_COMP_SWAP(cb->cb_color_info);
         has_alpha = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
                        ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->cb_color_attrib)
                        : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->cb_color_attrib);
      } else {
         VkFormat fmt = cmd_buffer->state.pass->attachments[idx].format;
         format = radv_translate_colorformat(fmt);
         swap = radv_translate_colorswap(fmt, false);
         has_alpha = vk_format_description(fmt)->swizzle[3] != PIPE_SWIZZLE_1;
      }

      uint32_t spi_format = (pipeline->col_format >> (i * 4)) & 0xf;
      uint32_t colormask = (pipeline->cb_target_mask >> (i * 4)) & 0xf;

      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
         has_rgb = !has_alpha;
      else
         has_rgb = true;

      /* Check the colormask and export format. */
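      /* Channels that are never written (colormask) or never exported (SPI_SHADER_ZERO)
       * cannot influence the result, so the RB+ optimizations may ignore them.
       */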
      if (!(colormask & 0x7))
         has_rgb = false;
      if (!(colormask & 0x8))
         has_alpha = false;

      if (spi_format == V_028714_SPI_SHADER_ZERO) {
         has_rgb = false;
         has_alpha = false;
      }

      /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
       * optimization, even though it has no alpha. */
      if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
         has_alpha = true;

      /* Disable value checking for disabled channels. */
      if (!has_rgb)
         sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
      if (!has_alpha)
         sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

      /* Enable down-conversion for 32bpp and smaller formats. */
      switch (format) {
      case V_028C70_COLOR_8:
      case V_028C70_COLOR_8_8:
      case V_028C70_COLOR_8_8_8_8:
         /* For 1 and 2-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_5_6_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_1_5_5_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_4_4_4_4:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_32:
         if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
         break;

      case V_028C70_COLOR_16:
      case V_028C70_COLOR_16_16:
         /* For 1-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
            else
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
         }
         break;

      case V_028C70_COLOR_10_11_11:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
         break;

      case V_028C70_COLOR_2_10_10_10:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
         }
         break;
      case V_028C70_COLOR_5_9_9_9:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
         break;
      }
   }

   /* Do not set the DISABLE bits for the unused attachments, as that
    * breaks dual source blending in SkQP and does not seem to improve
    * performance. */

   if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
       sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
       sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
      return;

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
   radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
   radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;

   cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
   cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
   cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
}

static void
radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->pbb_allowed)
      return;

   struct radv_binning_settings settings =
      radv_get_binning_settings(cmd_buffer->device->physical_device);
   bool break_for_new_ps =
      (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
          cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
      (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
   bool break_for_new_cb_target_mask =
      (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
      settings.context_states_per_bin > 1;

   if (!break_for_new_ps && !break_for_new_cb_target_mask)
      return;

   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
}

static void
radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;

   if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
      return;

   radv_update_multisample_state(cmd_buffer, pipeline);
   radv_update_binning_state(cmd_buffer, pipeline);

   cmd_buffer->scratch_size_per_wave_needed =
      MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
   cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->base.max_waves);

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->negative_one_to_one != pipeline->negative_one_to_one ||
       cmd_buffer->state.emitted_graphics_pipeline->depth_clamp_mode != pipeline->depth_clamp_mode)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) !=
          radv_rast_prim_is_points_or_lines(pipeline->rast_prim) ||
       cmd_buffer->state.emitted_graphics_pipeline->line_width != pipeline->line_width)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->pa_su_sc_mode_cntl != pipeline->pa_su_sc_mode_cntl)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
                                 RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->pa_cl_clip_cntl != pipeline->pa_cl_clip_cntl)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->cb_color_control != pipeline->cb_color_control)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;

   if (!cmd_buffer->state.emitted_graphics_pipeline)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
                                 RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->db_depth_control != pipeline->db_depth_control)
      cmd_buffer->state.dirty |=
         RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
         RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
         RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

   if (!cmd_buffer->state.emitted_graphics_pipeline)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->cb_target_mask != pipeline->cb_target_mask) {
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
   }

   radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);

   if (pipeline->has_ngg_culling &&
       pipeline->last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
       !cmd_buffer->state.last_nggc_settings) {
      /* The already emitted RSRC2 contains the LDS required for NGG culling.
       * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
       * API GS always needs LDS, so this isn't useful there.
       */
1579 */ 1580 struct radv_shader *v = pipeline->base.shaders[pipeline->last_vgt_api_stage]; 1581 radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, 1582 (v->config.rsrc2 & C_00B22C_LDS_SIZE) | 1583 S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling)); 1584 } 1585 1586 if (!cmd_buffer->state.emitted_graphics_pipeline || 1587 cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.cdw != pipeline->base.ctx_cs.cdw || 1588 cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs_hash != pipeline->base.ctx_cs_hash || 1589 memcmp(cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.buf, 1590 pipeline->base.ctx_cs.cdw * 4)) { 1591 radeon_emit_array(cmd_buffer->cs, pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.cdw); 1592 cmd_buffer->state.context_roll_without_scissor_emitted = true; 1593 } 1594 1595 radv_emit_batch_break_on_new_ps(cmd_buffer); 1596 1597 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo); 1598 1599 if (unlikely(cmd_buffer->device->trace_bo)) 1600 radv_save_pipeline(cmd_buffer, &pipeline->base); 1601 1602 cmd_buffer->state.emitted_graphics_pipeline = pipeline; 1603 1604 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE; 1605} 1606 1607static void 1608radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer) 1609{ 1610 const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 1611 const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport; 1612 int i; 1613 const unsigned count = viewport->count; 1614 1615 assert(count); 1616 radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6); 1617 1618 for (i = 0; i < count; i++) { 1619 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0])); 1620 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0])); 1621 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1])); 1622 radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1])); 1623 1624 double scale_z, translate_z; 1625 if (pipeline->negative_one_to_one) { 1626 scale_z = viewport->xform[i].scale[2] * 0.5f; 1627 translate_z = (viewport->xform[i].translate[2] + viewport->viewports[i].maxDepth) * 0.5f; 1628 } else { 1629 scale_z = viewport->xform[i].scale[2]; 1630 translate_z = viewport->xform[i].translate[2]; 1631 1632 } 1633 radeon_emit(cmd_buffer->cs, fui(scale_z)); 1634 radeon_emit(cmd_buffer->cs, fui(translate_z)); 1635 } 1636 1637 radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2); 1638 for (i = 0; i < count; i++) { 1639 float zmin, zmax; 1640 1641 if (pipeline->depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) { 1642 zmin = 0.0f; 1643 zmax = 1.0f; 1644 } else { 1645 zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth); 1646 zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth); 1647 } 1648 1649 radeon_emit(cmd_buffer->cs, fui(zmin)); 1650 radeon_emit(cmd_buffer->cs, fui(zmax)); 1651 } 1652} 1653 1654void 1655radv_write_scissors(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs) 1656{ 1657 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 1658 uint32_t count = cmd_buffer->state.dynamic.scissor.count; 1659 unsigned rast_prim; 1660 1661 if (!(pipeline->dynamic_states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) || 1662 (pipeline->active_stages & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | 1663 VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT | 1664 VK_SHADER_STAGE_GEOMETRY_BIT | 1665 
VK_SHADER_STAGE_MESH_BIT_NV))) { 1666 /* Ignore dynamic primitive topology for TES/GS/MS stages. */ 1667 rast_prim = pipeline->rast_prim; 1668 } else { 1669 rast_prim = si_conv_prim_to_gs_out(cmd_buffer->state.dynamic.primitive_topology); 1670 } 1671 1672 si_write_scissors(cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors, 1673 cmd_buffer->state.dynamic.viewport.viewports, rast_prim, 1674 cmd_buffer->state.dynamic.line_width); 1675} 1676 1677static void 1678radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer) 1679{ 1680 radv_write_scissors(cmd_buffer, cmd_buffer->cs); 1681 1682 cmd_buffer->state.context_roll_without_scissor_emitted = false; 1683} 1684 1685static void 1686radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer) 1687{ 1688 if (!cmd_buffer->state.dynamic.discard_rectangle.count) 1689 return; 1690 1691 radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL, 1692 cmd_buffer->state.dynamic.discard_rectangle.count * 2); 1693 for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) { 1694 VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i]; 1695 radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y)); 1696 radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) | 1697 S_028214_BR_Y(rect.offset.y + rect.extent.height)); 1698 } 1699} 1700 1701static void 1702radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer) 1703{ 1704 unsigned width = cmd_buffer->state.dynamic.line_width * 8; 1705 1706 radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL, 1707 S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF))); 1708} 1709 1710static void 1711radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer) 1712{ 1713 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1714 1715 radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4); 1716 radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4); 1717} 1718 1719static void 1720radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer) 1721{ 1722 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1723 1724 radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2); 1725 radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) | 1726 S_028430_STENCILMASK(d->stencil_compare_mask.front) | 1727 S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) | 1728 S_028430_STENCILOPVAL(1)); 1729 radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) | 1730 S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) | 1731 S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) | 1732 S_028434_STENCILOPVAL_BF(1)); 1733} 1734 1735static void 1736radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer) 1737{ 1738 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1739 1740 radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2); 1741 radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min)); 1742 radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max)); 1743} 1744 1745static void 1746radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer) 1747{ 1748 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1749 unsigned slope = fui(d->depth_bias.slope * 16.0f); 1750 1751 radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5); 1752 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */ 1753 radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */ 1754 radeon_emit(cmd_buffer->cs, 
fui(d->depth_bias.bias)); /* FRONT OFFSET */ 1755 radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */ 1756 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* BACK OFFSET */ 1757} 1758 1759static void 1760radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer) 1761{ 1762 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1763 uint32_t auto_reset_cntl = 1; 1764 1765 if (d->primitive_topology == V_008958_DI_PT_LINESTRIP) 1766 auto_reset_cntl = 2; 1767 1768 radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE, 1769 S_028A0C_LINE_PATTERN(d->line_stipple.pattern) | 1770 S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) | 1771 S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl)); 1772} 1773 1774uint32_t 1775radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer) 1776{ 1777 unsigned pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl; 1778 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1779 1780 pa_su_sc_mode_cntl &= C_028814_CULL_FRONT & 1781 C_028814_CULL_BACK & 1782 C_028814_FACE & 1783 C_028814_POLY_OFFSET_FRONT_ENABLE & 1784 C_028814_POLY_OFFSET_BACK_ENABLE & 1785 C_028814_POLY_OFFSET_PARA_ENABLE; 1786 1787 pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) | 1788 S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) | 1789 S_028814_FACE(d->front_face) | 1790 S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) | 1791 S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) | 1792 S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable); 1793 return pa_su_sc_mode_cntl; 1794} 1795 1796static void 1797radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states) 1798{ 1799 unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer); 1800 1801 radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl); 1802} 1803 1804static void 1805radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer) 1806{ 1807 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1808 1809 assert(!cmd_buffer->state.mesh_shading); 1810 1811 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) { 1812 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs, 1813 R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology); 1814 } else { 1815 radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology); 1816 } 1817} 1818 1819static void 1820radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states) 1821{ 1822 unsigned db_depth_control = cmd_buffer->state.graphics_pipeline->db_depth_control; 1823 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1824 1825 db_depth_control &= C_028800_Z_ENABLE & 1826 C_028800_Z_WRITE_ENABLE & 1827 C_028800_ZFUNC & 1828 C_028800_DEPTH_BOUNDS_ENABLE & 1829 C_028800_STENCIL_ENABLE & 1830 C_028800_BACKFACE_ENABLE & 1831 C_028800_STENCILFUNC & 1832 C_028800_STENCILFUNC_BF; 1833 1834 db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) | 1835 S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) | 1836 S_028800_ZFUNC(d->depth_compare_op) | 1837 S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) | 1838 S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) | 1839 S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 
1 : 0) | 1840 S_028800_STENCILFUNC(d->stencil_op.front.compare_op) | 1841 S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op); 1842 1843 radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control); 1844} 1845 1846static void 1847radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer) 1848{ 1849 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1850 1851 radeon_set_context_reg( 1852 cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL, 1853 S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) | 1854 S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) | 1855 S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) | 1856 S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) | 1857 S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) | 1858 S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op))); 1859} 1860 1861static void 1862radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer) 1863{ 1864 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 1865 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 1866 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1867 uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1; 1868 uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1; 1869 uint32_t pa_cl_vrs_cntl = pipeline->vrs.pa_cl_vrs_cntl; 1870 uint32_t pipeline_comb_mode = d->fragment_shading_rate.combiner_ops[0]; 1871 uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1]; 1872 1873 assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3); 1874 1875 if (subpass && !subpass->vrs_attachment) { 1876 /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we 1877 * can cheat by tweaking the different combiner modes. 1878 */ 1879 switch (htile_comb_mode) { 1880 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR: 1881 /* The result of min(A, 1x1) is always 1x1. */ 1882 FALLTHROUGH; 1883 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR: 1884 /* Force the per-draw VRS rate to 1x1. */ 1885 rate_x = rate_y = 0; 1886 1887 /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate 1888 * combiner mode as passthrough. 1889 */ 1890 pipeline_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU; 1891 break; 1892 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR: 1893 /* The result of max(A, 1x1) is always A. */ 1894 FALLTHROUGH; 1895 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR: 1896 /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */ 1897 break; 1898 default: 1899 break; 1900 } 1901 } 1902 1903 /* Emit per-draw VRS rate which is the first combiner. */ 1904 radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE, 1905 S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y)); 1906 1907 /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the 1908 * draw rate and the vertex rate. 
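
// Illustrative sketch, not driver code: a simplified per-axis model of how a
// VRS combiner op resolves two rates, which is why min(A, 1x1) is always 1x1,
// replace(A, 1x1) is always 1x1, and max(A, 1x1) is always A, as exploited in
// the switch above when the subpass has no VRS attachment. Rates are log2
// values (0 = 1x, 1 = 2x); the helper name is hypothetical.
static uint32_t
vrs_combine_rate_sketch(VkFragmentShadingRateCombinerOpKHR op, uint32_t a, uint32_t b)
{
   switch (op) {
   case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
      return a;          /* passthrough: the incoming rate is kept       */
   case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
      return b;          /* replace(a, 1x1) == 1x1                       */
   case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
      return MIN2(a, b); /* finer of the two: min(a, 1x1) == 1x1         */
   case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
      return MAX2(a, b); /* coarser of the two: max(a, 1x1) == a         */
   case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR:
      return a + b;      /* log2 space: multiplying sizes adds exponents */
   default:
      return a;
   }
}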
1909 */ 1910 if (cmd_buffer->state.mesh_shading) { 1911 pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU) | 1912 S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode); 1913 } else { 1914 pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) | 1915 S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU); 1916 } 1917 1918 /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE 1919 * rate. 1920 */ 1921 pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode); 1922 1923 radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl); 1924} 1925 1926static void 1927radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer) 1928{ 1929 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1930 1931 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { 1932 radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN, 1933 d->primitive_restart_enable); 1934 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) { 1935 radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, 1936 d->primitive_restart_enable); 1937 } else { 1938 radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, 1939 d->primitive_restart_enable); 1940 } 1941} 1942 1943static void 1944radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer) 1945{ 1946 unsigned pa_cl_clip_cntl = cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl; 1947 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1948 1949 pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL; 1950 pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable); 1951 1952 radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl); 1953} 1954 1955static void 1956radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer) 1957{ 1958 unsigned cb_color_control = cmd_buffer->state.graphics_pipeline->cb_color_control; 1959 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1960 1961 cb_color_control &= C_028808_ROP3; 1962 cb_color_control |= S_028808_ROP3(d->logic_op); 1963 1964 radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control); 1965} 1966 1967static void 1968radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer) 1969{ 1970 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 1971 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 1972 1973 radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, 1974 pipeline->cb_target_mask & d->color_write_enable); 1975} 1976 1977static void 1978radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index, 1979 struct radv_color_buffer_info *cb, struct radv_image_view *iview, 1980 VkImageLayout layout, bool in_render_loop) 1981{ 1982 bool is_vi = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8; 1983 uint32_t cb_fdcc_control = cb->cb_dcc_control; 1984 uint32_t cb_color_info = cb->cb_color_info; 1985 struct radv_image *image = iview->image; 1986 1987 if (!radv_layout_dcc_compressed( 1988 cmd_buffer->device, image, iview->vk.base_mip_level, layout, in_render_loop, 1989 radv_image_queue_family_mask(image, cmd_buffer->qf, 1990 cmd_buffer->qf))) { 1991 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { 1992 cb_fdcc_control &= C_028C78_FDCC_ENABLE; 1993 } else { 1994 cb_color_info &= C_028C70_DCC_ENABLE; 1995 } 
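
// Illustrative sketch, not driver code: the clear-then-set idiom used all over
// this file (pa_su_sc_mode_cntl, db_depth_control, cb_color_info, ...). For
// each register field the generated headers provide S_xxx() to shift a value
// into the field, G_xxx() to extract it, and C_xxx as the complement mask of
// the field. The EXAMPLE_FIELD macros and helper below are hypothetical
// stand-ins that follow the same pattern.
#define S_EXAMPLE_FIELD(x) (((unsigned)(x)&0x03) << 4) /* 2-bit field at [5:4] */
#define G_EXAMPLE_FIELD(x) (((x) >> 4) & 0x03)
#define C_EXAMPLE_FIELD    0xFFFFFFCF

static uint32_t
update_example_field_sketch(uint32_t packed_reg, uint32_t new_value)
{
   packed_reg &= C_EXAMPLE_FIELD;            /* clear only this field          */
   packed_reg |= S_EXAMPLE_FIELD(new_value); /* insert the new value           */
   return packed_reg;                        /* all other fields are preserved */
}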
1996 } 1997 1998 if (!radv_layout_fmask_compressed( 1999 cmd_buffer->device, image, layout, 2000 radv_image_queue_family_mask(image, cmd_buffer->qf, 2001 cmd_buffer->qf))) { 2002 cb_color_info &= C_028C70_COMPRESSION; 2003 } 2004 2005 if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) || 2006 radv_is_dcc_decompress_pipeline(cmd_buffer))) { 2007 /* If this bit is set, the FMASK decompression operation 2008 * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS). 2009 */ 2010 cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY; 2011 } 2012 2013 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { 2014 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4); 2015 radeon_emit(cmd_buffer->cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ 2016 radeon_emit(cmd_buffer->cs, cb->cb_color_info); /* CB_COLOR0_INFO */ 2017 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); /* CB_COLOR0_ATTRIB */ 2018 radeon_emit(cmd_buffer->cs, cb_fdcc_control); /* CB_COLOR0_FDCC_CONTROL */ 2019 2020 radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->cb_color_base); 2021 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32); 2022 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); 2023 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32); 2024 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2); 2025 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3); 2026 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) { 2027 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 2028 radeon_emit(cmd_buffer->cs, cb->cb_color_base); 2029 radeon_emit(cmd_buffer->cs, 0); 2030 radeon_emit(cmd_buffer->cs, 0); 2031 radeon_emit(cmd_buffer->cs, cb->cb_color_view); 2032 radeon_emit(cmd_buffer->cs, cb_color_info); 2033 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 2034 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 2035 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 2036 radeon_emit(cmd_buffer->cs, 0); 2037 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 2038 radeon_emit(cmd_buffer->cs, 0); 2039 2040 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base); 2041 2042 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, 2043 cb->cb_color_base >> 32); 2044 radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4, 2045 cb->cb_color_cmask >> 32); 2046 radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4, 2047 cb->cb_color_fmask >> 32); 2048 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, 2049 cb->cb_dcc_base >> 32); 2050 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, 2051 cb->cb_color_attrib2); 2052 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, 2053 cb->cb_color_attrib3); 2054 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) { 2055 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 2056 radeon_emit(cmd_buffer->cs, cb->cb_color_base); 2057 radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32)); 2058 
radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2); 2059 radeon_emit(cmd_buffer->cs, cb->cb_color_view); 2060 radeon_emit(cmd_buffer->cs, cb_color_info); 2061 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 2062 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 2063 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 2064 radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32)); 2065 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 2066 radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32)); 2067 2068 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2); 2069 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base); 2070 radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32)); 2071 2072 radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, 2073 cb->cb_mrt_epitch); 2074 } else { 2075 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11); 2076 radeon_emit(cmd_buffer->cs, cb->cb_color_base); 2077 radeon_emit(cmd_buffer->cs, cb->cb_color_pitch); 2078 radeon_emit(cmd_buffer->cs, cb->cb_color_slice); 2079 radeon_emit(cmd_buffer->cs, cb->cb_color_view); 2080 radeon_emit(cmd_buffer->cs, cb_color_info); 2081 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); 2082 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control); 2083 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask); 2084 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice); 2085 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask); 2086 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice); 2087 2088 if (is_vi) { /* DCC BASE */ 2089 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2090 cb->cb_dcc_base); 2091 } 2092 } 2093 2094 if (G_028C70_DCC_ENABLE(cb_color_info)) { 2095 /* Drawing with DCC enabled also compresses colorbuffers. */ 2096 VkImageSubresourceRange range = { 2097 .aspectMask = iview->vk.aspects, 2098 .baseMipLevel = iview->vk.base_mip_level, 2099 .levelCount = iview->vk.level_count, 2100 .baseArrayLayer = iview->vk.base_array_layer, 2101 .layerCount = iview->vk.layer_count, 2102 }; 2103 2104 radv_update_dcc_metadata(cmd_buffer, image, &range, true); 2105 } 2106} 2107 2108static void 2109radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, 2110 const struct radv_image_view *iview, VkImageLayout layout, 2111 bool in_render_loop, bool requires_cond_exec) 2112{ 2113 const struct radv_image *image = iview->image; 2114 uint32_t db_z_info = ds->db_z_info; 2115 uint32_t db_z_info_reg; 2116 2117 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug || 2118 !radv_image_is_tc_compat_htile(image)) 2119 return; 2120 2121 if (!radv_layout_is_htile_compressed( 2122 cmd_buffer->device, image, layout, in_render_loop, 2123 radv_image_queue_family_mask(image, cmd_buffer->qf, 2124 cmd_buffer->qf))) { 2125 db_z_info &= C_028040_TILE_SURFACE_ENABLE; 2126 } 2127 2128 db_z_info &= C_028040_ZRANGE_PRECISION; 2129 2130 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) { 2131 db_z_info_reg = R_028038_DB_Z_INFO; 2132 } else { 2133 db_z_info_reg = R_028040_DB_Z_INFO; 2134 } 2135 2136 /* When we don't know the last fast clear value we need to emit a 2137 * conditional packet that will eventually skip the following 2138 * SET_CONTEXT_REG packet. 
2139 */ 2140 if (requires_cond_exec) { 2141 uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level); 2142 2143 radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0)); 2144 radeon_emit(cmd_buffer->cs, va); 2145 radeon_emit(cmd_buffer->cs, va >> 32); 2146 radeon_emit(cmd_buffer->cs, 0); 2147 radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */ 2148 } 2149 2150 radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info); 2151} 2152 2153static void 2154radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, 2155 struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop) 2156{ 2157 const struct radv_image *image = iview->image; 2158 uint32_t db_z_info = ds->db_z_info; 2159 uint32_t db_stencil_info = ds->db_stencil_info; 2160 uint32_t db_htile_surface = ds->db_htile_surface; 2161 2162 if (!radv_layout_is_htile_compressed( 2163 cmd_buffer->device, image, layout, in_render_loop, 2164 radv_image_queue_family_mask(image, cmd_buffer->qf, 2165 cmd_buffer->qf))) { 2166 db_z_info &= C_028040_TILE_SURFACE_ENABLE; 2167 db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1); 2168 } 2169 2170 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3 && 2171 !cmd_buffer->state.subpass->vrs_attachment) { 2172 db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING; 2173 } 2174 2175 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view); 2176 radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface); 2177 2178 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) { 2179 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); 2180 radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size); 2181 2182 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { 2183 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6); 2184 } else { 2185 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7); 2186 radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1)); 2187 } 2188 radeon_emit(cmd_buffer->cs, db_z_info); 2189 radeon_emit(cmd_buffer->cs, db_stencil_info); 2190 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); 2191 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); 2192 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); 2193 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); 2194 2195 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5); 2196 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32); 2197 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32); 2198 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32); 2199 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32); 2200 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32); 2201 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) { 2202 radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3); 2203 radeon_emit(cmd_buffer->cs, ds->db_htile_data_base); 2204 radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32)); 2205 radeon_emit(cmd_buffer->cs, ds->db_depth_size); 2206 2207 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10); 2208 radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */ 2209 radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */ 2210 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */ 2211 radeon_emit(cmd_buffer->cs, 2212 
S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */ 2213 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */ 2214 radeon_emit(cmd_buffer->cs, 2215 S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ 2216 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */ 2217 radeon_emit(cmd_buffer->cs, 2218 S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */ 2219 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */ 2220 radeon_emit(cmd_buffer->cs, 2221 S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ 2222 2223 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2); 2224 radeon_emit(cmd_buffer->cs, ds->db_z_info2); 2225 radeon_emit(cmd_buffer->cs, ds->db_stencil_info2); 2226 } else { 2227 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base); 2228 2229 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9); 2230 radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */ 2231 radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */ 2232 radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */ 2233 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */ 2234 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */ 2235 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */ 2236 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */ 2237 radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */ 2238 radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */ 2239 } 2240 2241 /* Update the ZRANGE_PRECISION value for the TC-compat bug. */ 2242 radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true); 2243 2244 radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, 2245 ds->pa_su_poly_offset_db_fmt_cntl); 2246} 2247 2248/** 2249 * Update the fast clear depth/stencil values if the image is bound as a 2250 * depth/stencil buffer. 2251 */ 2252static void 2253radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, 2254 const struct radv_image_view *iview, 2255 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 2256{ 2257 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 2258 const struct radv_image *image = iview->image; 2259 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2260 uint32_t att_idx; 2261 2262 if (!cmd_buffer->state.attachments || !subpass) 2263 return; 2264 2265 if (!subpass->depth_stencil_attachment) 2266 return; 2267 2268 att_idx = subpass->depth_stencil_attachment->attachment; 2269 if (cmd_buffer->state.attachments[att_idx].iview->image != image) 2270 return; 2271 2272 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 2273 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); 2274 radeon_emit(cs, ds_clear_value.stencil); 2275 radeon_emit(cs, fui(ds_clear_value.depth)); 2276 } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { 2277 radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth)); 2278 } else { 2279 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT); 2280 radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil); 2281 } 2282 2283 /* Update the ZRANGE_PRECISION value for the TC-compat bug. 
This is 2284 * only needed when clearing Z to 0.0. 2285 */ 2286 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) { 2287 VkImageLayout layout = subpass->depth_stencil_attachment->layout; 2288 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop; 2289 2290 radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview, 2291 layout, in_render_loop, false); 2292 } 2293 2294 cmd_buffer->state.context_roll_without_scissor_emitted = true; 2295} 2296 2297/** 2298 * Set the clear depth/stencil values to the image's metadata. 2299 */ 2300static void 2301radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2302 const VkImageSubresourceRange *range, 2303 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 2304{ 2305 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2306 uint32_t level_count = radv_get_levelCount(image, range); 2307 2308 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 2309 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel); 2310 2311 /* Use the fastest way when both aspects are used. */ 2312 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating)); 2313 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2314 radeon_emit(cs, va); 2315 radeon_emit(cs, va >> 32); 2316 2317 for (uint32_t l = 0; l < level_count; l++) { 2318 radeon_emit(cs, ds_clear_value.stencil); 2319 radeon_emit(cs, fui(ds_clear_value.depth)); 2320 } 2321 } else { 2322 /* Otherwise we need one WRITE_DATA packet per level. */ 2323 for (uint32_t l = 0; l < level_count; l++) { 2324 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l); 2325 unsigned value; 2326 2327 if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) { 2328 value = fui(ds_clear_value.depth); 2329 va += 4; 2330 } else { 2331 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT); 2332 value = ds_clear_value.stencil; 2333 } 2334 2335 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating)); 2336 radeon_emit(cs, 2337 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2338 radeon_emit(cs, va); 2339 radeon_emit(cs, va >> 32); 2340 radeon_emit(cs, value); 2341 } 2342 } 2343} 2344 2345/** 2346 * Update the TC-compat metadata value for this image. 
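
// Illustrative sketch, not driver code: the per-level layout that the writes
// in radv_set_ds_clear_metadata() above imply (an assumption inferred from
// this file, not taken from the driver headers). Each mip level stores a
// stencil dword followed by a depth dword, which is why the depth-only path
// advances the address by 4 bytes before writing its single dword. The struct
// and helper names are hypothetical.
struct ds_clear_value_sketch {
   uint32_t stencil; /* written first when both aspects are updated         */
   float depth;      /* at +4 bytes, the offset used for depth-only updates */
};

static uint64_t
ds_clear_value_va_sketch(uint64_t level_va, VkImageAspectFlags aspects)
{
   /* level_va would come from radv_get_ds_clear_value_va() for that level. */
   if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT)
      return level_va + offsetof(struct ds_clear_value_sketch, depth);
   return level_va; /* stencil-only and combined updates start at the stencil dword */
}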
2347 */ 2348static void 2349radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2350 const VkImageSubresourceRange *range, uint32_t value) 2351{ 2352 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2353 2354 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug) 2355 return; 2356 2357 uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel); 2358 uint32_t level_count = radv_get_levelCount(image, range); 2359 2360 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating)); 2361 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2362 radeon_emit(cs, va); 2363 radeon_emit(cs, va >> 32); 2364 2365 for (uint32_t l = 0; l < level_count; l++) 2366 radeon_emit(cs, value); 2367} 2368 2369static void 2370radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, 2371 const struct radv_image_view *iview, 2372 VkClearDepthStencilValue ds_clear_value) 2373{ 2374 VkImageSubresourceRange range = { 2375 .aspectMask = iview->vk.aspects, 2376 .baseMipLevel = iview->vk.base_mip_level, 2377 .levelCount = iview->vk.level_count, 2378 .baseArrayLayer = iview->vk.base_array_layer, 2379 .layerCount = iview->vk.layer_count, 2380 }; 2381 uint32_t cond_val; 2382 2383 /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last 2384 * depth clear value is 0.0f. 2385 */ 2386 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0; 2387 2388 radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val); 2389} 2390 2391/** 2392 * Update the clear depth/stencil values for this image. 2393 */ 2394void 2395radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, 2396 const struct radv_image_view *iview, 2397 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects) 2398{ 2399 VkImageSubresourceRange range = { 2400 .aspectMask = iview->vk.aspects, 2401 .baseMipLevel = iview->vk.base_mip_level, 2402 .levelCount = iview->vk.level_count, 2403 .baseArrayLayer = iview->vk.base_array_layer, 2404 .layerCount = iview->vk.layer_count, 2405 }; 2406 struct radv_image *image = iview->image; 2407 2408 assert(radv_htile_enabled(image, range.baseMipLevel)); 2409 2410 radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects); 2411 2412 if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { 2413 radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value); 2414 } 2415 2416 radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects); 2417} 2418 2419/** 2420 * Load the clear depth/stencil values from the image's metadata. 
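
// Illustrative sketch, not driver code: how the TC-compat zrange metadata
// written above interacts with the COND_EXEC emitted in
// radv_update_zrange_precision(). This assumes the usual predication
// behaviour that the guarded SET_CONTEXT_REG only executes when the dword at
// the given address is nonzero; the helper name is hypothetical.
static bool
zrange_precision_write_would_run_sketch(float last_clear_depth)
{
   /* Mirrors cond_val in radv_update_tc_compat_zrange_metadata(): UINT_MAX
    * when the last fast clear depth was 0.0f, 0 otherwise. */
   uint32_t predicate = last_clear_depth == 0.0f ? UINT_MAX : 0;

   /* Nonzero predicate -> the ZRANGE_PRECISION=0 register write executes,
    * zero -> the packet is skipped and the precision bit stays set. */
   return predicate != 0;
}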
2421 */ 2422static void 2423radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview) 2424{ 2425 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2426 const struct radv_image *image = iview->image; 2427 VkImageAspectFlags aspects = vk_format_aspects(image->vk.format); 2428 uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level); 2429 unsigned reg_offset = 0, reg_count = 0; 2430 2431 assert(radv_image_has_htile(image)); 2432 2433 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { 2434 ++reg_count; 2435 } else { 2436 ++reg_offset; 2437 va += 4; 2438 } 2439 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) 2440 ++reg_count; 2441 2442 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset; 2443 2444 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) { 2445 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0)); 2446 radeon_emit(cs, va); 2447 radeon_emit(cs, va >> 32); 2448 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); 2449 radeon_emit(cs, reg_count); 2450 } else { 2451 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 2452 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | 2453 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0)); 2454 radeon_emit(cs, va); 2455 radeon_emit(cs, va >> 32); 2456 radeon_emit(cs, reg >> 2); 2457 radeon_emit(cs, 0); 2458 2459 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 2460 radeon_emit(cs, 0); 2461 } 2462} 2463 2464/* 2465 * With DCC some colors don't require CMASK elimination before being 2466 * used as a texture. This sets a predicate value to determine if the 2467 * cmask eliminate is required. 2468 */ 2469void 2470radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2471 const VkImageSubresourceRange *range, bool value) 2472{ 2473 if (!image->fce_pred_offset) 2474 return; 2475 2476 uint64_t pred_val = value; 2477 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel); 2478 uint32_t level_count = radv_get_levelCount(image, range); 2479 uint32_t count = 2 * level_count; 2480 2481 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0)); 2482 radeon_emit(cmd_buffer->cs, 2483 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2484 radeon_emit(cmd_buffer->cs, va); 2485 radeon_emit(cmd_buffer->cs, va >> 32); 2486 2487 for (uint32_t l = 0; l < level_count; l++) { 2488 radeon_emit(cmd_buffer->cs, pred_val); 2489 radeon_emit(cmd_buffer->cs, pred_val >> 32); 2490 } 2491} 2492 2493/** 2494 * Update the DCC predicate to reflect the compression state. 
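
// Illustrative sketch, not driver code: where the "2 + count" in the
// PKT3(PKT3_WRITE_DATA, ...) headers above comes from. As the packets emitted
// throughout this file consistently suggest, the second PKT3() argument is the
// number of dwords following the packet header minus one; a WRITE_DATA has a
// fixed control word plus the low/high halves of the destination address, then
// the payload. The helper name is hypothetical.
static unsigned
write_data_pkt3_count_sketch(unsigned level_count)
{
   const unsigned payload_dwords = 2 * level_count; /* one 64-bit predicate per level */
   const unsigned fixed_dwords = 3;                 /* control word + va lo + va hi   */

   return (fixed_dwords + payload_dwords) - 1;      /* == 2 + 2 * level_count         */
}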
2495 */ 2496void 2497radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2498 const VkImageSubresourceRange *range, bool value) 2499{ 2500 if (image->dcc_pred_offset == 0) 2501 return; 2502 2503 uint64_t pred_val = value; 2504 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel); 2505 uint32_t level_count = radv_get_levelCount(image, range); 2506 uint32_t count = 2 * level_count; 2507 2508 assert(radv_dcc_enabled(image, range->baseMipLevel)); 2509 2510 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0)); 2511 radeon_emit(cmd_buffer->cs, 2512 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2513 radeon_emit(cmd_buffer->cs, va); 2514 radeon_emit(cmd_buffer->cs, va >> 32); 2515 2516 for (uint32_t l = 0; l < level_count; l++) { 2517 radeon_emit(cmd_buffer->cs, pred_val); 2518 radeon_emit(cmd_buffer->cs, pred_val >> 32); 2519 } 2520} 2521 2522/** 2523 * Update the fast clear color values if the image is bound as a color buffer. 2524 */ 2525static void 2526radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2527 int cb_idx, uint32_t color_values[2]) 2528{ 2529 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 2530 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2531 uint32_t att_idx; 2532 2533 if (!cmd_buffer->state.attachments || !subpass) 2534 return; 2535 2536 att_idx = subpass->color_attachments[cb_idx].attachment; 2537 if (att_idx == VK_ATTACHMENT_UNUSED) 2538 return; 2539 2540 if (cmd_buffer->state.attachments[att_idx].iview->image != image) 2541 return; 2542 2543 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2); 2544 radeon_emit(cs, color_values[0]); 2545 radeon_emit(cs, color_values[1]); 2546 2547 cmd_buffer->state.context_roll_without_scissor_emitted = true; 2548} 2549 2550/** 2551 * Set the clear color values to the image's metadata. 2552 */ 2553static void 2554radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 2555 const VkImageSubresourceRange *range, uint32_t color_values[2]) 2556{ 2557 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2558 uint32_t level_count = radv_get_levelCount(image, range); 2559 uint32_t count = 2 * level_count; 2560 2561 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)); 2562 2563 if (radv_image_has_clear_value(image)) { 2564 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel); 2565 2566 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating)); 2567 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 2568 radeon_emit(cs, va); 2569 radeon_emit(cs, va >> 32); 2570 2571 for (uint32_t l = 0; l < level_count; l++) { 2572 radeon_emit(cs, color_values[0]); 2573 radeon_emit(cs, color_values[1]); 2574 } 2575 } else { 2576 /* Some default value we can set in the update. */ 2577 assert(color_values[0] == 0 && color_values[1] == 0); 2578 } 2579} 2580 2581/** 2582 * Update the clear color values for this image. 
2583 */ 2584void 2585radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, 2586 const struct radv_image_view *iview, int cb_idx, 2587 uint32_t color_values[2]) 2588{ 2589 struct radv_image *image = iview->image; 2590 VkImageSubresourceRange range = { 2591 .aspectMask = iview->vk.aspects, 2592 .baseMipLevel = iview->vk.base_mip_level, 2593 .levelCount = iview->vk.level_count, 2594 .baseArrayLayer = iview->vk.base_array_layer, 2595 .layerCount = iview->vk.layer_count, 2596 }; 2597 2598 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level)); 2599 2600 /* Do not need to update the clear value for images that are fast cleared with the comp-to-single 2601 * mode because the hardware gets the value from the image directly. 2602 */ 2603 if (iview->image->support_comp_to_single) 2604 return; 2605 2606 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values); 2607 2608 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values); 2609} 2610 2611/** 2612 * Load the clear color values from the image's metadata. 2613 */ 2614static void 2615radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview, 2616 int cb_idx) 2617{ 2618 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2619 struct radv_image *image = iview->image; 2620 2621 if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level)) 2622 return; 2623 2624 if (iview->image->support_comp_to_single) 2625 return; 2626 2627 if (!radv_image_has_clear_value(image)) { 2628 uint32_t color_values[2] = {0, 0}; 2629 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values); 2630 return; 2631 } 2632 2633 uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level); 2634 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c; 2635 2636 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) { 2637 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating)); 2638 radeon_emit(cs, va); 2639 radeon_emit(cs, va >> 32); 2640 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); 2641 radeon_emit(cs, 2); 2642 } else { 2643 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); 2644 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | 2645 COPY_DATA_COUNT_SEL); 2646 radeon_emit(cs, va); 2647 radeon_emit(cs, va >> 32); 2648 radeon_emit(cs, reg >> 2); 2649 radeon_emit(cs, 0); 2650 2651 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); 2652 radeon_emit(cs, 0); 2653 } 2654} 2655 2656/* GFX9+ metadata cache flushing workaround. metadata cache coherency is 2657 * broken if the CB caches data of multiple mips of the same image at the 2658 * same time. 2659 * 2660 * Insert some flushes to avoid this. 
2661 */ 2662static void 2663radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer) 2664{ 2665 struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer; 2666 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 2667 bool color_mip_changed = false; 2668 2669 /* Entire workaround is not applicable before GFX9 */ 2670 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9) 2671 return; 2672 2673 if (!framebuffer) 2674 return; 2675 2676 for (int i = 0; i < subpass->color_count; ++i) { 2677 int idx = subpass->color_attachments[i].attachment; 2678 if (idx == VK_ATTACHMENT_UNUSED) 2679 continue; 2680 2681 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; 2682 2683 if ((radv_image_has_CB_metadata(iview->image) || 2684 radv_dcc_enabled(iview->image, iview->vk.base_mip_level) || 2685 radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) && 2686 cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level) 2687 color_mip_changed = true; 2688 2689 cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level; 2690 } 2691 2692 if (color_mip_changed) { 2693 cmd_buffer->state.flush_bits |= 2694 RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 2695 } 2696} 2697 2698/* This function does the flushes for mip changes if the levels are not zero for 2699 * all render targets. This way we can assume at the start of the next cmd_buffer 2700 * that rendering to mip 0 doesn't need any flushes. As that is the most common 2701 * case that saves some flushes. */ 2702static void 2703radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer) 2704{ 2705 /* Entire workaround is not applicable before GFX9 */ 2706 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9) 2707 return; 2708 2709 bool need_color_mip_flush = false; 2710 for (unsigned i = 0; i < 8; ++i) { 2711 if (cmd_buffer->state.cb_mip[i]) { 2712 need_color_mip_flush = true; 2713 break; 2714 } 2715 } 2716 2717 if (need_color_mip_flush) { 2718 cmd_buffer->state.flush_bits |= 2719 RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 2720 } 2721 2722 memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip)); 2723} 2724 2725static struct radv_image * 2726radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer) 2727{ 2728 struct radv_device *device = cmd_buffer->device; 2729 2730 if (!device->vrs.image) { 2731 VkResult result; 2732 2733 /* The global VRS state is initialized on-demand to avoid wasting VRAM. */ 2734 result = radv_device_init_vrs_state(device); 2735 if (result != VK_SUCCESS) { 2736 cmd_buffer->record_result = result; 2737 return NULL; 2738 } 2739 } 2740 2741 return device->vrs.image; 2742} 2743 2744static void 2745radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) 2746{ 2747 int i; 2748 struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer; 2749 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 2750 bool disable_constant_encode_ac01 = false; 2751 unsigned color_invalid = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 2752 ? 
G_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID) 2753 : G_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID); 2754 2755 for (i = 0; i < subpass->color_count; ++i) { 2756 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) { 2757 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid); 2758 continue; 2759 } 2760 2761 int idx = subpass->color_attachments[i].attachment; 2762 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; 2763 VkImageLayout layout = subpass->color_attachments[i].layout; 2764 bool in_render_loop = subpass->color_attachments[i].in_render_loop; 2765 2766 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[0].bo); 2767 2768 assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT | 2769 VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)); 2770 2771 if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) { 2772 for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) { 2773 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, 2774 iview->image->bindings[plane_id].bo); 2775 } 2776 } else { 2777 uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0; 2778 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, 2779 iview->image->bindings[plane_id].bo); 2780 } 2781 2782 radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout, 2783 in_render_loop); 2784 2785 radv_load_color_clear_metadata(cmd_buffer, iview, i); 2786 2787 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 && 2788 iview->image->dcc_sign_reinterpret) { 2789 /* Disable constant encoding with the clear value of "1" with different DCC signedness 2790 * because the hardware will fill "1" instead of the clear value. 2791 */ 2792 disable_constant_encode_ac01 = true; 2793 } 2794 } 2795 for (; i < cmd_buffer->state.last_subpass_color_count; i++) { 2796 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid); 2797 } 2798 cmd_buffer->state.last_subpass_color_count = subpass->color_count; 2799 2800 if (subpass->depth_stencil_attachment) { 2801 int idx = subpass->depth_stencil_attachment->attachment; 2802 VkImageLayout layout = subpass->depth_stencil_attachment->layout; 2803 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop; 2804 struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview; 2805 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, 2806 cmd_buffer->state.attachments[idx].iview->image->bindings[0].bo); 2807 2808 radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout, 2809 in_render_loop); 2810 2811 if (radv_layout_is_htile_compressed( 2812 cmd_buffer->device, iview->image, layout, in_render_loop, 2813 radv_image_queue_family_mask(iview->image, cmd_buffer->qf, 2814 cmd_buffer->qf))) { 2815 /* Only load the depth/stencil fast clear values when 2816 * compressed rendering is enabled. 2817 */ 2818 radv_load_ds_clear_metadata(cmd_buffer, iview); 2819 } 2820 } else if (subpass->vrs_attachment && radv_cmd_buffer_get_vrs_image(cmd_buffer)) { 2821 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to 2822 * bind our internal depth buffer that contains the VRS data as part of HTILE. 
2823 */ 2824 VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; 2825 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer; 2826 struct radv_image *image = cmd_buffer->device->vrs.image; 2827 struct radv_ds_buffer_info ds; 2828 struct radv_image_view iview; 2829 2830 radv_image_view_init(&iview, cmd_buffer->device, 2831 &(VkImageViewCreateInfo){ 2832 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, 2833 .image = radv_image_to_handle(image), 2834 .viewType = radv_meta_get_view_type(image), 2835 .format = image->vk.format, 2836 .subresourceRange = 2837 { 2838 .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT, 2839 .baseMipLevel = 0, 2840 .levelCount = 1, 2841 .baseArrayLayer = 0, 2842 .layerCount = 1, 2843 }, 2844 }, 2845 0, NULL); 2846 2847 radv_initialise_vrs_surface(image, htile_buffer, &ds); 2848 2849 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo); 2850 2851 radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false); 2852 2853 radv_image_view_finish(&iview); 2854 } else { 2855 unsigned num_samples = 0; 2856 2857 /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match the framebuffer samples. It affects 2858 * VRS and occlusion queries if depth and stencil are not bound. 2859 */ 2860 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX11) 2861 num_samples = util_logbase2(subpass->max_sample_count); 2862 2863 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) 2864 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2); 2865 else 2866 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2); 2867 2868 radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID) | /* DB_Z_INFO */ 2869 S_028040_NUM_SAMPLES(num_samples)); 2870 radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ 2871 } 2872 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, 2873 S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height)); 2874 2875 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8) { 2876 bool disable_constant_encode = 2877 cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode; 2878 enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level; 2879 uint8_t watermark = gfx_level >= GFX10 ? 6 : 4; 2880 2881 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { 2882 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL, 2883 S_028424_SAMPLE_MASK_TRACKER_WATERMARK(watermark)); 2884 } else { 2885 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL, 2886 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) | 2887 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | 2888 S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) | 2889 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode)); 2890 } 2891 } 2892 2893 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER; 2894} 2895 2896static void 2897radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect) 2898{ 2899 struct radeon_cmdbuf *cs = cmd_buffer->cs; 2900 struct radv_cmd_state *state = &cmd_buffer->state; 2901 2902 /* With indirect generated commands the index buffer bind may be part of the 2903 * indirect command buffer, in which case the app may not have bound any yet. 
 */
   if (state->index_type < 0)
      return;

   /* For the direct indexed draws we use DRAW_INDEX_2, which includes
    * the index_va and max_index_count already. */
   if (!indirect)
      return;

   if (state->max_index_count ||
       !cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) {
      radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
      radeon_emit(cs, state->index_va);
      radeon_emit(cs, state->index_va >> 32);

      radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
      radeon_emit(cs, state->max_index_count);
   }

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
}

void
radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer, bool enable_occlusion_queries)
{
   bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->ms.pa_sc_mode_cntl_1 : 0;
   uint32_t db_count_control;

   if (!enable_occlusion_queries) {
      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
             pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
            /* Re-enable out-of-order rasterization if the bound pipeline
             * supports it and if it has been disabled before starting any
             * perfect occlusion queries.
             */
            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
         }
      }
      db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
   } else {
      const struct radv_subpass *subpass = cmd_buffer->state.subpass;
      uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
      bool gfx10_perfect =
         cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10 && has_perfect_queries;

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
         /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
          * covered tiles, discards, and early depth testing. For more details,
          * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
                            S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
                            S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
                            S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);

         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
             pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
            /* If the bound pipeline has enabled out-of-order rasterization,
             * we should disable it before starting any perfect
             * occlusion queries.
2967 */ 2968 pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE; 2969 2970 radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1); 2971 } 2972 } else { 2973 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate); 2974 } 2975 } 2976 2977 radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control); 2978 2979 cmd_buffer->state.context_roll_without_scissor_emitted = true; 2980} 2981 2982unsigned 2983radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs) 2984{ 2985 /* instance_rate_vs_prologs is a flattened array of array of arrays of different sizes, or a 2986 * single array sorted in ascending order using: 2987 * - total number of attributes 2988 * - number of instanced attributes 2989 * - index of first instanced attribute 2990 */ 2991 2992 /* From total number of attributes to offset. */ 2993 static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84, 2994 120, 165, 220, 286, 364, 455, 560, 680}; 2995 unsigned start_index = total_to_offset[num_attributes - 1]; 2996 2997 /* From number of instanced attributes to offset. This would require a different LUT depending on 2998 * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total 2999 * attributes. 3000 */ 3001 static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91, 3002 100, 108, 115, 121, 126, 130, 133, 135}; 3003 unsigned count = util_bitcount(instance_rate_inputs); 3004 unsigned offset_from_start_index = 3005 count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1)); 3006 3007 unsigned first = ffs(instance_rate_inputs) - 1; 3008 return start_index + offset_from_start_index + first; 3009} 3010 3011union vs_prolog_key_header { 3012 struct { 3013 uint32_t key_size : 8; 3014 uint32_t num_attributes : 6; 3015 uint32_t as_ls : 1; 3016 uint32_t is_ngg : 1; 3017 uint32_t wave32 : 1; 3018 uint32_t next_stage : 3; 3019 uint32_t instance_rate_inputs : 1; 3020 uint32_t alpha_adjust_lo : 1; 3021 uint32_t alpha_adjust_hi : 1; 3022 uint32_t misaligned_mask : 1; 3023 uint32_t post_shuffle : 1; 3024 uint32_t nontrivial_divisors : 1; 3025 uint32_t zero_divisors : 1; 3026 /* We need this to ensure the padding is zero. It's useful even if it's unused. 
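       * The whole key (header included) is hashed with _mesa_hash_data() and compared
       * with memcmp() below, so any uninitialized bits would break prolog cache lookups.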
*/ 3027 uint32_t padding0 : 5; 3028 }; 3029 uint32_t v; 3030}; 3031 3032uint32_t 3033radv_hash_vs_prolog(const void *key_) 3034{ 3035 const uint32_t *key = key_; 3036 union vs_prolog_key_header header; 3037 header.v = key[0]; 3038 return _mesa_hash_data(key, header.key_size); 3039} 3040 3041bool 3042radv_cmp_vs_prolog(const void *a_, const void *b_) 3043{ 3044 const uint32_t *a = a_; 3045 const uint32_t *b = b_; 3046 if (a[0] != b[0]) 3047 return false; 3048 3049 union vs_prolog_key_header header; 3050 header.v = a[0]; 3051 return memcmp(a, b, header.key_size) == 0; 3052} 3053 3054static struct radv_shader_part * 3055lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader, 3056 uint32_t *nontrivial_divisors) 3057{ 3058 STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4); 3059 assert(vs_shader->info.vs.dynamic_inputs); 3060 3061 const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 3062 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 3063 struct radv_device *device = cmd_buffer->device; 3064 3065 unsigned num_attributes = pipeline->last_vertex_attrib_bit; 3066 uint32_t attribute_mask = BITFIELD_MASK(num_attributes); 3067 3068 uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask; 3069 uint32_t zero_divisors = state->zero_divisors & attribute_mask; 3070 *nontrivial_divisors = state->nontrivial_divisors & attribute_mask; 3071 uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask; 3072 if (cmd_buffer->state.vbo_misaligned_mask_invalid) { 3073 assert(device->physical_device->rad_info.gfx_level == GFX6 || 3074 device->physical_device->rad_info.gfx_level >= GFX10); 3075 3076 u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) { 3077 uint8_t binding = state->bindings[index]; 3078 if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding))) 3079 continue; 3080 uint8_t req = state->format_align_req_minus_1[index]; 3081 struct radv_vertex_binding *vb = &cmd_buffer->vertex_bindings[binding]; 3082 VkDeviceSize offset = vb->offset + state->offsets[index]; 3083 if ((offset & req) || (vb->stride & req)) 3084 misaligned_mask |= BITFIELD_BIT(index); 3085 } 3086 cmd_buffer->state.vbo_misaligned_mask = misaligned_mask; 3087 cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask; 3088 } 3089 3090 /* try to use a pre-compiled prolog first */ 3091 struct radv_shader_part *prolog = NULL; 3092 if (pipeline->can_use_simple_input && 3093 (!vs_shader->info.vs.as_ls || !instance_rate_inputs) && 3094 !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) { 3095 if (!instance_rate_inputs) { 3096 prolog = device->simple_vs_prologs[num_attributes - 1]; 3097 } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors && 3098 util_bitcount(instance_rate_inputs) == 3099 (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) { 3100 unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs); 3101 prolog = device->instance_rate_vs_prologs[index]; 3102 } 3103 } 3104 if (prolog) 3105 return prolog; 3106 3107 /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */ 3108 uint32_t key_words[17]; 3109 unsigned key_size = 1; 3110 3111 struct radv_vs_prolog_key key; 3112 key.state = state; 3113 key.num_attributes = num_attributes; 3114 key.misaligned_mask = misaligned_mask; 3115 /* The instance ID input VGPR is placed differently when as_ls=true. 
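    * The prolog only reads the instance ID when there are instance-rate inputs, which is
    * why as_ls is only set in the key when instance_rate_inputs is non-zero.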
*/ 3116 key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs; 3117 key.is_ngg = vs_shader->info.is_ngg; 3118 key.wave32 = vs_shader->info.wave_size == 32; 3119 key.next_stage = pipeline->next_vertex_stage; 3120 3121 union vs_prolog_key_header header; 3122 header.v = 0; 3123 header.num_attributes = num_attributes; 3124 header.as_ls = key.as_ls; 3125 header.is_ngg = key.is_ngg; 3126 header.wave32 = key.wave32; 3127 header.next_stage = key.next_stage; 3128 3129 if (instance_rate_inputs & ~*nontrivial_divisors) { 3130 header.instance_rate_inputs = true; 3131 key_words[key_size++] = instance_rate_inputs; 3132 } 3133 if (*nontrivial_divisors) { 3134 header.nontrivial_divisors = true; 3135 key_words[key_size++] = *nontrivial_divisors; 3136 } 3137 if (zero_divisors) { 3138 header.zero_divisors = true; 3139 key_words[key_size++] = zero_divisors; 3140 } 3141 if (misaligned_mask) { 3142 header.misaligned_mask = true; 3143 key_words[key_size++] = misaligned_mask; 3144 3145 uint8_t *formats = (uint8_t *)&key_words[key_size]; 3146 unsigned num_formats = 0; 3147 u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index]; 3148 while (num_formats & 0x3) 3149 formats[num_formats++] = 0; 3150 key_size += num_formats / 4u; 3151 3152 if (state->post_shuffle & attribute_mask) { 3153 header.post_shuffle = true; 3154 key_words[key_size++] = state->post_shuffle & attribute_mask; 3155 } 3156 } 3157 if (state->alpha_adjust_lo & attribute_mask) { 3158 header.alpha_adjust_lo = true; 3159 key_words[key_size++] = state->alpha_adjust_lo & attribute_mask; 3160 } 3161 if (state->alpha_adjust_hi & attribute_mask) { 3162 header.alpha_adjust_hi = true; 3163 key_words[key_size++] = state->alpha_adjust_hi & attribute_mask; 3164 } 3165 3166 header.key_size = key_size * sizeof(key_words[0]); 3167 key_words[0] = header.v; 3168 3169 uint32_t hash = radv_hash_vs_prolog(key_words); 3170 3171 if (cmd_buffer->state.emitted_vs_prolog && 3172 cmd_buffer->state.emitted_vs_prolog_key_hash == hash && 3173 radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key)) 3174 return cmd_buffer->state.emitted_vs_prolog; 3175 3176 u_rwlock_rdlock(&device->vs_prologs_lock); 3177 struct hash_entry *prolog_entry = 3178 _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); 3179 u_rwlock_rdunlock(&device->vs_prologs_lock); 3180 3181 if (!prolog_entry) { 3182 u_rwlock_wrlock(&device->vs_prologs_lock); 3183 prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); 3184 if (prolog_entry) { 3185 u_rwlock_wrunlock(&device->vs_prologs_lock); 3186 return prolog_entry->data; 3187 } 3188 3189 prolog = radv_create_vs_prolog(device, &key); 3190 uint32_t *key2 = malloc(key_size * 4); 3191 if (!prolog || !key2) { 3192 radv_shader_part_destroy(device, prolog); 3193 free(key2); 3194 u_rwlock_wrunlock(&device->vs_prologs_lock); 3195 return NULL; 3196 } 3197 memcpy(key2, key_words, key_size * 4); 3198 _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog); 3199 3200 u_rwlock_wrunlock(&device->vs_prologs_lock); 3201 return prolog; 3202 } 3203 3204 return prolog_entry->data; 3205} 3206 3207static void 3208emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader, 3209 struct radv_shader_part *prolog, bool pipeline_is_dirty) 3210{ 3211 /* no need to re-emit anything in this case */ 3212 if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty) 3213 return; 3214 3215 enum amd_gfx_level chip = 
cmd_buffer->device->physical_device->rad_info.gfx_level; 3216 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 3217 uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset; 3218 3219 assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline); 3220 3221 uint32_t rsrc1 = vs_shader->config.rsrc1; 3222 if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1)) 3223 rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS); 3224 3225 /* The main shader must not use less VGPRs than the prolog, otherwise shared vgprs might not 3226 * work. 3227 */ 3228 assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1)); 3229 3230 unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS; 3231 unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; 3232 if (vs_shader->info.is_ngg || pipeline->base.shaders[MESA_SHADER_GEOMETRY] == vs_shader) { 3233 pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES; 3234 rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; 3235 } else if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL] == vs_shader) { 3236 pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS; 3237 rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS; 3238 } else if (vs_shader->info.vs.as_ls) { 3239 pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS; 3240 rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS; 3241 } else if (vs_shader->info.vs.as_es) { 3242 pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES; 3243 rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES; 3244 } 3245 3246 radeon_set_sh_reg(cmd_buffer->cs, pgm_lo_reg, prolog_va >> 8); 3247 3248 if (chip < GFX10) 3249 radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1); 3250 else 3251 assert(rsrc1 == vs_shader->config.rsrc1); 3252 3253 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo); 3254} 3255 3256static void 3257emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader, 3258 uint32_t nontrivial_divisors, bool pipeline_is_dirty) 3259{ 3260 /* no need to re-emit anything in this case */ 3261 if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog && 3262 !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors) 3263 return; 3264 3265 const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 3266 uint64_t input_va = radv_shader_get_va(vs_shader); 3267 3268 if (nontrivial_divisors) { 3269 unsigned inputs_offset; 3270 uint32_t *inputs; 3271 unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8; 3272 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs)) 3273 return; 3274 3275 *(inputs++) = input_va; 3276 *(inputs++) = input_va >> 32; 3277 3278 u_foreach_bit(index, nontrivial_divisors) 3279 { 3280 uint32_t div = state->divisors[index]; 3281 if (div == 0) { 3282 *(inputs++) = 0; 3283 *(inputs++) = 1; 3284 } else if (util_is_power_of_two_or_zero(div)) { 3285 *(inputs++) = util_logbase2(div) | (1 << 8); 3286 *(inputs++) = 0xffffffffu; 3287 } else { 3288 struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32); 3289 *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16); 3290 *(inputs++) = info.multiplier; 3291 } 3292 } 3293 3294 input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset; 3295 } 3296 3297 struct radv_userdata_info *loc = 3298 
&vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS]; 3299 uint32_t base_reg = cmd_buffer->state.graphics_pipeline->base.user_data_0[MESA_SHADER_VERTEX]; 3300 assert(loc->sgpr_idx != -1); 3301 assert(loc->num_sgprs == 2); 3302 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3303 input_va, true); 3304} 3305 3306static void 3307radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 3308{ 3309 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 3310 struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX); 3311 3312 assert(!cmd_buffer->state.mesh_shading); 3313 3314 if (!vs_shader->info.vs.has_prolog) 3315 return; 3316 3317 uint32_t nontrivial_divisors; 3318 struct radv_shader_part *prolog = 3319 lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors); 3320 if (!prolog) { 3321 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 3322 return; 3323 } 3324 emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty); 3325 emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty); 3326 3327 cmd_buffer->state.emitted_vs_prolog = prolog; 3328 3329 if (unlikely(cmd_buffer->device->trace_bo)) 3330 radv_save_vs_prolog(cmd_buffer, prolog); 3331} 3332 3333static void 3334radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 3335{ 3336 uint64_t states = 3337 cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state; 3338 3339 if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT)) 3340 radv_emit_viewport(cmd_buffer); 3341 3342 if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) && 3343 !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug) 3344 radv_emit_scissor(cmd_buffer); 3345 3346 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) 3347 radv_emit_line_width(cmd_buffer); 3348 3349 if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) 3350 radv_emit_blend_constants(cmd_buffer); 3351 3352 if (states & 3353 (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | 3354 RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK)) 3355 radv_emit_stencil(cmd_buffer); 3356 3357 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS) 3358 radv_emit_depth_bounds(cmd_buffer); 3359 3360 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS) 3361 radv_emit_depth_bias(cmd_buffer); 3362 3363 if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE) 3364 radv_emit_discard_rectangle(cmd_buffer); 3365 3366 if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) 3367 radv_emit_sample_locations(cmd_buffer); 3368 3369 if (states & (RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE)) 3370 radv_emit_line_stipple(cmd_buffer); 3371 3372 if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE | 3373 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE)) 3374 radv_emit_culling(cmd_buffer, states); 3375 3376 if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) 3377 radv_emit_primitive_topology(cmd_buffer); 3378 3379 if (states & 3380 (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE | 3381 RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE | 3382 RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) 3383 radv_emit_depth_control(cmd_buffer, states); 3384 3385 if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP) 3386 
radv_emit_stencil_control(cmd_buffer); 3387 3388 if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE) 3389 radv_emit_fragment_shading_rate(cmd_buffer); 3390 3391 if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE) 3392 radv_emit_primitive_restart_enable(cmd_buffer); 3393 3394 if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) 3395 radv_emit_rasterizer_discard_enable(cmd_buffer); 3396 3397 if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP) 3398 radv_emit_logic_op(cmd_buffer); 3399 3400 if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) 3401 radv_emit_color_write_enable(cmd_buffer); 3402 3403 if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT) 3404 radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty); 3405 3406 cmd_buffer->state.dirty &= ~states; 3407} 3408 3409static void 3410radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point) 3411{ 3412 struct radv_descriptor_state *descriptors_state = 3413 radv_get_descriptors_state(cmd_buffer, bind_point); 3414 struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set; 3415 unsigned bo_offset; 3416 3417 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr, 3418 &bo_offset)) 3419 return; 3420 3421 set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3422 set->header.va += bo_offset; 3423} 3424 3425static void 3426radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer, 3427 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 3428{ 3429 struct radv_descriptor_state *descriptors_state = 3430 radv_get_descriptors_state(cmd_buffer, bind_point); 3431 uint32_t size = MAX_SETS * 4; 3432 uint32_t offset; 3433 void *ptr; 3434 3435 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr)) 3436 return; 3437 3438 for (unsigned i = 0; i < MAX_SETS; i++) { 3439 uint32_t *uptr = ((uint32_t *)ptr) + i; 3440 uint64_t set_va = 0; 3441 struct radv_descriptor_set *set = descriptors_state->sets[i]; 3442 if (descriptors_state->valid & (1u << i)) 3443 set_va = set->header.va; 3444 uptr[0] = set_va & 0xffffffff; 3445 } 3446 3447 struct radeon_cmdbuf *cs = cmd_buffer->cs; 3448 struct radv_device *device = cmd_buffer->device; 3449 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3450 va += offset; 3451 3452 if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { 3453 struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline); 3454 3455 if (pipeline->shaders[MESA_SHADER_VERTEX]) 3456 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_VERTEX, 3457 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3458 3459 if (pipeline->shaders[MESA_SHADER_FRAGMENT]) 3460 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_FRAGMENT, 3461 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3462 3463 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH)) 3464 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_MESH, 3465 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3466 3467 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) 3468 radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK, 3469 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3470 3471 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_GEOMETRY)) 3472 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_GEOMETRY, 3473 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3474 3475 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL)) 3476 
radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_CTRL, 3477 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3478 3479 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL)) 3480 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_EVAL, 3481 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3482 } else { 3483 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_COMPUTE, 3484 AC_UD_INDIRECT_DESCRIPTOR_SETS, va); 3485 } 3486} 3487 3488static void 3489radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, 3490 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 3491{ 3492 struct radv_descriptor_state *descriptors_state = 3493 radv_get_descriptors_state(cmd_buffer, bind_point); 3494 struct radv_device *device = cmd_buffer->device; 3495 struct radeon_cmdbuf *cs = cmd_buffer->cs; 3496 bool flush_indirect_descriptors; 3497 3498 if (!descriptors_state->dirty) 3499 return; 3500 3501 if (descriptors_state->push_dirty) 3502 radv_flush_push_descriptors(cmd_buffer, bind_point); 3503 3504 flush_indirect_descriptors = pipeline->need_indirect_descriptor_sets; 3505 3506 if (flush_indirect_descriptors) 3507 radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point); 3508 3509 ASSERTED unsigned cdw_max = 3510 radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4); 3511 3512 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { 3513 radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, MESA_SHADER_COMPUTE); 3514 } else { 3515 radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_NV) 3516 { 3517 if (!cmd_buffer->state.graphics_pipeline->base.shaders[stage]) 3518 continue; 3519 3520 radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, stage); 3521 } 3522 3523 if (stages & VK_SHADER_STAGE_TASK_BIT_NV) { 3524 radv_emit_descriptor_pointers(device, cmd_buffer->ace_internal.cs, pipeline, 3525 descriptors_state, MESA_SHADER_TASK); 3526 } 3527 } 3528 3529 descriptors_state->dirty = 0; 3530 descriptors_state->push_dirty = false; 3531 3532 assert(cmd_buffer->cs->cdw <= cdw_max); 3533 3534 if (unlikely(cmd_buffer->device->trace_bo)) 3535 radv_save_descriptors(cmd_buffer, bind_point); 3536} 3537 3538static bool 3539radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage) 3540{ 3541 struct radv_userdata_info *loc = 3542 radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS); 3543 return loc->sgpr_idx != -1; 3544} 3545 3546static void 3547radv_emit_all_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs, 3548 struct radv_pipeline *pipeline, gl_shader_stage stage, 3549 uint32_t *values, bool *need_push_constants) 3550{ 3551 const struct radv_shader *shader = radv_get_shader(pipeline, stage); 3552 if (!shader) 3553 return; 3554 3555 *need_push_constants |= radv_shader_loads_push_constants(pipeline, stage); 3556 3557 const uint64_t mask = shader->info.inline_push_constant_mask; 3558 if (!mask) 3559 return; 3560 3561 const uint8_t base = ffs(mask) - 1; 3562 if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) { 3563 /* consecutive inline push constants */ 3564 radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, 3565 values + base); 3566 } else { 3567 /* sparse inline push constants */ 3568 uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS]; 3569 unsigned num_consts = 0; 3570 u_foreach_bit64 (idx, mask) 3571 consts[num_consts++] = values[idx]; 3572 radv_emit_inline_push_consts(device, cs, 
pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, 3573 consts); 3574 } 3575} 3576 3577static void 3578radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, 3579 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) 3580{ 3581 struct radv_device *device = cmd_buffer->device; 3582 struct radeon_cmdbuf *cs = cmd_buffer->cs; 3583 struct radv_descriptor_state *descriptors_state = 3584 radv_get_descriptors_state(cmd_buffer, bind_point); 3585 struct radv_shader *shader, *prev_shader; 3586 bool need_push_constants = false; 3587 unsigned offset; 3588 void *ptr; 3589 uint64_t va; 3590 uint32_t internal_stages; 3591 uint32_t dirty_stages = 0; 3592 3593 stages &= cmd_buffer->push_constant_stages; 3594 if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count)) 3595 return; 3596 3597 internal_stages = stages; 3598 switch (bind_point) { 3599 case VK_PIPELINE_BIND_POINT_GRAPHICS: 3600 break; 3601 case VK_PIPELINE_BIND_POINT_COMPUTE: 3602 dirty_stages = RADV_RT_STAGE_BITS; 3603 break; 3604 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: 3605 internal_stages = VK_SHADER_STAGE_COMPUTE_BIT; 3606 dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT; 3607 break; 3608 default: 3609 unreachable("Unhandled bind point"); 3610 } 3611 3612 radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV) 3613 { 3614 radv_emit_all_inline_push_consts( 3615 device, cs, pipeline, stage, (uint32_t *)cmd_buffer->push_constants, &need_push_constants); 3616 } 3617 3618 if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) { 3619 radv_emit_all_inline_push_consts(device, cmd_buffer->ace_internal.cs, pipeline, 3620 MESA_SHADER_TASK, (uint32_t *)cmd_buffer->push_constants, 3621 &need_push_constants); 3622 } 3623 3624 if (need_push_constants) { 3625 if (!radv_cmd_buffer_upload_alloc( 3626 cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset, 3627 &ptr)) 3628 return; 3629 3630 memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size); 3631 memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers, 3632 16 * pipeline->dynamic_offset_count); 3633 3634 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3635 va += offset; 3636 3637 ASSERTED unsigned cdw_max = 3638 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4); 3639 3640 prev_shader = NULL; 3641 radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV) 3642 { 3643 shader = radv_get_shader(pipeline, stage); 3644 3645 /* Avoid redundantly emitting the address for merged stages. 
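          * On GFX9+ API stages can be merged into one hardware shader (e.g. VS+TCS or
          * VS+GS), in which case radv_get_shader() returns the same radv_shader for both.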
*/ 3646 if (shader && shader != prev_shader) { 3647 radv_emit_userdata_address(device, cs, pipeline, stage, AC_UD_PUSH_CONSTANTS, va); 3648 3649 prev_shader = shader; 3650 } 3651 } 3652 3653 if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) { 3654 radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK, 3655 AC_UD_PUSH_CONSTANTS, va); 3656 } 3657 3658 assert(cmd_buffer->cs->cdw <= cdw_max); 3659 } 3660 3661 cmd_buffer->push_constant_stages &= ~stages; 3662 cmd_buffer->push_constant_stages |= dirty_stages; 3663} 3664 3665enum radv_dst_sel { 3666 DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | 3667 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 3668 DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | 3669 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 3670 DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3671 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 3672 DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3673 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), 3674 DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3675 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), 3676 DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3677 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), 3678}; 3679 3680static const uint32_t data_format_dst_sel[] = { 3681 [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001, 3682 [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001, 3683 [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001, 3684 [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01, 3685 [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001, 3686 [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01, 3687 [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1, 3688 [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1, 3689 [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW, 3690 [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW, 3691 [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW, 3692 [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01, 3693 [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW, 3694 [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1, 3695 [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW, 3696}; 3697 3698void 3699radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, 3700 const struct radv_graphics_pipeline *pipeline, 3701 bool full_null_descriptors, void *vb_ptr) 3702{ 3703 struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX); 3704 enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level; 3705 unsigned desc_index = 0; 3706 uint32_t mask = pipeline->vb_desc_usage_mask; 3707 uint64_t va; 3708 const struct radv_vs_input_state *vs_state = 3709 vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL; 3710 assert(!vs_state || pipeline->use_per_attribute_vb_descs); 3711 3712 while (mask) { 3713 unsigned i = u_bit_scan(&mask); 3714 uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4]; 3715 uint32_t offset, rsrc_word3; 3716 unsigned binding = 3717 vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i] 3718 : (pipeline->use_per_attribute_vb_descs ? 
                                                            pipeline->attrib_bindings[i] : i);
      struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
      unsigned num_records;
      unsigned stride;

      if (vs_state) {
         unsigned format = vs_state->formats[i];
         unsigned dfmt = format & 0xf;
         unsigned nfmt = (format >> 4) & 0x7;

         rsrc_word3 = vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];

         if (chip >= GFX10)
            rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
         else
            rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
      } else {
         if (chip >= GFX10)
            rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
         else
            rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
      }

      if (pipeline->uses_dynamic_stride) {
         stride = cmd_buffer->vertex_bindings[binding].stride;
      } else {
         stride = pipeline->binding_stride[binding];
      }

      if (!buffer) {
         if (full_null_descriptors) {
            /* Put all the info in for the DGC generation shader in case the VBO gets overridden. */
            desc[0] = 0;
            desc[1] = S_008F04_STRIDE(stride);
            desc[2] = 0;
            desc[3] = rsrc_word3;
         } else if (vs_state) {
            /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
             * to include the format/word3 so that the alpha channel is 1 for formats without an
             * alpha channel.
             */
            desc[0] = 0;
            desc[1] = S_008F04_STRIDE(16);
            desc[2] = 0;
            desc[3] = rsrc_word3;
         } else {
            memset(desc, 0, 4 * 4);
         }

         continue;
      }

      va = radv_buffer_get_va(buffer->bo);

      offset = cmd_buffer->vertex_bindings[binding].offset;
      va += offset + buffer->offset;
      if (vs_state)
         va += vs_state->offsets[i];

      if (cmd_buffer->vertex_bindings[binding].size) {
         num_records = cmd_buffer->vertex_bindings[binding].size;
      } else {
         num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
      }

      if (pipeline->use_per_attribute_vb_descs) {
         uint32_t attrib_end =
            vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : pipeline->attrib_ends[i];

         if (num_records < attrib_end) {
            num_records = 0; /* not enough space for one vertex */
         } else if (stride == 0) {
            num_records = 1; /* only one vertex */
         } else {
            num_records = (num_records - attrib_end) / stride + 1;
            /* If attrib_offset>stride, then the compiler will increase the vertex index by
             * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
             * only allowed with static strides.
             */
            num_records += pipeline->attrib_index_offset[i];
         }

         /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
          * into bytes in that case. GFX8 always uses bytes.
          */
         if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
            num_records = (num_records - 1) * stride + attrib_end;
         } else if (!num_records) {
            /* On GFX9, it seems bounds checking is disabled if both
             * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
             * GFX10.3 but it doesn't hurt.
             */
            if (full_null_descriptors) {
               /* Put all the info in for the DGC generation shader in case the VBO gets overridden.
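                * Keeping the stride and dst_sel/format words means the DGC shader only
                * has to patch in the address and size.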
3813 */ 3814 desc[0] = 0; 3815 desc[1] = S_008F04_STRIDE(stride); 3816 desc[2] = 0; 3817 desc[3] = rsrc_word3; 3818 } else if (vs_state) { 3819 desc[0] = 0; 3820 desc[1] = S_008F04_STRIDE(16); 3821 desc[2] = 0; 3822 desc[3] = rsrc_word3; 3823 } else { 3824 memset(desc, 0, 16); 3825 } 3826 3827 continue; 3828 } 3829 } else { 3830 if (chip != GFX8 && stride) 3831 num_records = DIV_ROUND_UP(num_records, stride); 3832 } 3833 3834 if (chip >= GFX10) { 3835 /* OOB_SELECT chooses the out-of-bounds check: 3836 * - 1: index >= NUM_RECORDS (Structured) 3837 * - 3: offset >= NUM_RECORDS (Raw) 3838 */ 3839 int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW; 3840 rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11); 3841 } 3842 3843 desc[0] = va; 3844 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride); 3845 desc[2] = num_records; 3846 desc[3] = rsrc_word3; 3847 } 3848} 3849 3850static void 3851radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 3852{ 3853 if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) && 3854 cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) { 3855 /* Mesh shaders don't have vertex descriptors. */ 3856 assert(!cmd_buffer->state.mesh_shading); 3857 3858 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 3859 unsigned vb_offset; 3860 void *vb_ptr; 3861 uint64_t va; 3862 3863 /* allocate some descriptor state for vertex buffers */ 3864 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, 3865 &vb_ptr)) 3866 return; 3867 3868 radv_write_vertex_descriptors(cmd_buffer, pipeline, false, vb_ptr); 3869 3870 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3871 va += vb_offset; 3872 3873 radv_emit_userdata_address(cmd_buffer->device, cmd_buffer->cs, &pipeline->base, 3874 MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, va); 3875 3876 cmd_buffer->state.vb_va = va; 3877 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS; 3878 3879 if (unlikely(cmd_buffer->device->trace_bo)) 3880 radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr); 3881 } 3882 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER; 3883} 3884 3885static void 3886radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va) 3887{ 3888 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 3889 struct radv_userdata_info *loc; 3890 uint32_t base_reg; 3891 3892 for (unsigned stage = 0; stage < MESA_VULKAN_SHADER_STAGES; ++stage) { 3893 if (!radv_get_shader(&pipeline->base, stage)) 3894 continue; 3895 3896 loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_STREAMOUT_BUFFERS); 3897 if (loc->sgpr_idx == -1) 3898 continue; 3899 3900 base_reg = pipeline->base.user_data_0[stage]; 3901 3902 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, 3903 false); 3904 } 3905 3906 if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) { 3907 loc = &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS]; 3908 if (loc->sgpr_idx != -1) { 3909 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0; 3910 3911 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 3912 va, false); 3913 } 3914 } 3915} 3916 3917static void 3918radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer) 3919{ 3920 if (cmd_buffer->state.dirty & 
RADV_CMD_DIRTY_STREAMOUT_BUFFER) { 3921 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 3922 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 3923 unsigned so_offset; 3924 void *so_ptr; 3925 uint64_t va; 3926 3927 /* Allocate some descriptor state for streamout buffers. */ 3928 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr)) 3929 return; 3930 3931 for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) { 3932 struct radv_buffer *buffer = sb[i].buffer; 3933 uint32_t *desc = &((uint32_t *)so_ptr)[i * 4]; 3934 3935 if (!(so->enabled_mask & (1 << i))) 3936 continue; 3937 3938 va = radv_buffer_get_va(buffer->bo) + buffer->offset; 3939 3940 va += sb[i].offset; 3941 3942 /* Set the descriptor. 3943 * 3944 * On GFX8, the format must be non-INVALID, otherwise 3945 * the buffer will be considered not bound and store 3946 * instructions will be no-ops. 3947 */ 3948 uint32_t size = 0xffffffff; 3949 3950 /* Compute the correct buffer size for NGG streamout 3951 * because it's used to determine the max emit per 3952 * buffer. 3953 */ 3954 if (cmd_buffer->device->physical_device->use_ngg_streamout) 3955 size = buffer->vk.size - sb[i].offset; 3956 3957 uint32_t rsrc_word3 = 3958 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 3959 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 3960 3961 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { 3962 rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) | 3963 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); 3964 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) { 3965 rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 3966 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 3967 } else { 3968 rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 3969 } 3970 3971 desc[0] = va; 3972 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); 3973 desc[2] = size; 3974 desc[3] = rsrc_word3; 3975 } 3976 3977 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 3978 va += so_offset; 3979 3980 radv_emit_streamout_buffers(cmd_buffer, va); 3981 } 3982 3983 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER; 3984} 3985 3986static void 3987radv_flush_ngg_query_state(struct radv_cmd_buffer *cmd_buffer) 3988{ 3989 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 3990 const unsigned stage = pipeline->last_vgt_api_stage; 3991 struct radv_userdata_info *loc; 3992 uint32_t ngg_query_state = 0; 3993 uint32_t base_reg; 3994 3995 loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_NGG_QUERY_STATE); 3996 if (loc->sgpr_idx == -1) 3997 return; 3998 3999 assert(pipeline->is_ngg); 4000 4001 /* By default NGG queries are disabled but they are enabled if the command buffer has active GDS 4002 * queries or if it's a secondary command buffer that inherits the number of generated 4003 * primitives. 
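    * The value is passed through a user SGPR so the NGG shader can skip the query
    * accumulation (GDS atomics) entirely when no query is active.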
4004 */ 4005 if (cmd_buffer->state.active_pipeline_gds_queries || 4006 (cmd_buffer->state.inherited_pipeline_statistics & 4007 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)) 4008 ngg_query_state = 1; 4009 4010 base_reg = pipeline->base.user_data_0[stage]; 4011 assert(loc->sgpr_idx != -1); 4012 4013 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_query_state); 4014} 4015 4016static void 4017radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer) 4018{ 4019 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 4020 enum amd_gfx_level gfx_level = pipeline->base.device->physical_device->rad_info.gfx_level; 4021 const unsigned stage = pipeline->last_vgt_api_stage; 4022 struct radv_userdata_info *loc; 4023 uint32_t vrs_rates = 0; 4024 uint32_t base_reg; 4025 4026 if (!pipeline->force_vrs_per_vertex) { 4027 /* Un-set the SGPR index so we know to re-emit it later. */ 4028 cmd_buffer->state.last_vrs_rates_sgpr_idx = -1; 4029 return; 4030 } 4031 4032 loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_FORCE_VRS_RATES); 4033 assert(loc->sgpr_idx != -1); 4034 4035 base_reg = pipeline->base.user_data_0[stage]; 4036 4037 switch (cmd_buffer->device->force_vrs) { 4038 case RADV_FORCE_VRS_2x2: 4039 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4); 4040 break; 4041 case RADV_FORCE_VRS_2x1: 4042 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4); 4043 break; 4044 case RADV_FORCE_VRS_1x2: 4045 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4); 4046 break; 4047 default: 4048 break; 4049 } 4050 4051 if (cmd_buffer->state.last_vrs_rates != vrs_rates || 4052 cmd_buffer->state.last_vrs_rates_sgpr_idx != loc->sgpr_idx) { 4053 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, vrs_rates); 4054 } 4055 4056 cmd_buffer->state.last_vrs_rates = vrs_rates; 4057 cmd_buffer->state.last_vrs_rates_sgpr_idx = loc->sgpr_idx; 4058} 4059 4060static void 4061radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) 4062{ 4063 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 4064 4065 radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty); 4066 radv_flush_streamout_descriptors(cmd_buffer); 4067 4068 VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_NV; 4069 radv_flush_descriptors(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS); 4070 radv_flush_constants(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS); 4071 radv_flush_ngg_query_state(cmd_buffer); 4072 radv_flush_force_vrs_state(cmd_buffer); 4073} 4074 4075struct radv_draw_info { 4076 /** 4077 * Number of vertices. 4078 */ 4079 uint32_t count; 4080 4081 /** 4082 * First instance id. 4083 */ 4084 uint32_t first_instance; 4085 4086 /** 4087 * Number of instances. 4088 */ 4089 uint32_t instance_count; 4090 4091 /** 4092 * Whether it's an indexed draw. 4093 */ 4094 bool indexed; 4095 4096 /** 4097 * Indirect draw parameters resource. 4098 */ 4099 struct radv_buffer *indirect; 4100 uint64_t indirect_offset; 4101 uint32_t stride; 4102 4103 /** 4104 * Draw count parameters resource. 4105 */ 4106 struct radv_buffer *count_buffer; 4107 uint64_t count_buffer_offset; 4108 4109 /** 4110 * Stream output parameters resource. 
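    * Used for transform feedback draws (vkCmdDrawIndirectByteCountEXT), where the vertex
    * count is derived from the number of bytes written into this buffer.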
4111 */ 4112 struct radv_buffer *strmout_buffer; 4113 uint64_t strmout_buffer_offset; 4114}; 4115 4116static uint32_t 4117radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer) 4118{ 4119 uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type); 4120 switch (index_type) { 4121 case V_028A7C_VGT_INDEX_8: 4122 return 0xffu; 4123 case V_028A7C_VGT_INDEX_16: 4124 return 0xffffu; 4125 case V_028A7C_VGT_INDEX_32: 4126 return 0xffffffffu; 4127 default: 4128 unreachable("invalid index type"); 4129 } 4130} 4131 4132static void 4133si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, 4134 bool indirect_draw, bool count_from_stream_output, 4135 uint32_t draw_vertex_count) 4136{ 4137 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; 4138 struct radv_cmd_state *state = &cmd_buffer->state; 4139 unsigned topology = state->dynamic.primitive_topology; 4140 bool prim_restart_enable = state->dynamic.primitive_restart_enable; 4141 struct radeon_cmdbuf *cs = cmd_buffer->cs; 4142 unsigned ia_multi_vgt_param; 4143 4144 ia_multi_vgt_param = 4145 si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output, 4146 draw_vertex_count, topology, prim_restart_enable); 4147 4148 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) { 4149 if (info->gfx_level == GFX9) { 4150 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs, 4151 R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param); 4152 } else if (info->gfx_level >= GFX7) { 4153 radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); 4154 } else { 4155 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); 4156 } 4157 state->last_ia_multi_vgt_param = ia_multi_vgt_param; 4158 } 4159} 4160 4161static void 4162radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info) 4163{ 4164 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info; 4165 struct radv_cmd_state *state = &cmd_buffer->state; 4166 struct radeon_cmdbuf *cs = cmd_buffer->cs; 4167 uint32_t topology = state->dynamic.primitive_topology; 4168 bool disable_instance_packing = false; 4169 4170 /* Draw state. */ 4171 if (info->gfx_level < GFX10) { 4172 si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect, 4173 !!draw_info->strmout_buffer, 4174 draw_info->indirect ? 0 : draw_info->count); 4175 } 4176 4177 if (state->dynamic.primitive_restart_enable) { 4178 uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer); 4179 4180 if (primitive_reset_index != state->last_primitive_reset_index) { 4181 radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index); 4182 state->last_primitive_reset_index = primitive_reset_index; 4183 } 4184 } 4185 4186 if (draw_info->strmout_buffer) { 4187 uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo); 4188 4189 va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset; 4190 4191 radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride); 4192 4193 if (info->gfx_level >= GFX10) { 4194 /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption 4195 * (shadow memory) but for unknown reasons, it can lead to GPU hangs on GFX10+. 
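          * Use LOAD_CONTEXT_REG_INDEX with a preceding PFP_SYNC_ME instead.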
          */
         radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
         radeon_emit(cs, 0);

         radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
         radeon_emit(cs, 1); /* 1 DWORD */
      } else {
         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
                            COPY_DATA_WR_CONFIRM);
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
         radeon_emit(cs, 0); /* unused */
      }

      radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
   }

   /* RDNA2 is affected by a hardware bug: when instance packing is enabled for adjacent primitive
    * topologies and instance_count > 1, the pipeline stats generated by the GE are incorrect. The
    * workaround needs to be applied for both indexed and non-indexed draws.
    */
   if (info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
       (draw_info->instance_count > 1 || draw_info->indirect) &&
       (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
        topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
      disable_instance_packing = true;
   }

   if ((draw_info->indexed && state->index_type != state->last_index_type) ||
       (info->gfx_level == GFX10_3 &&
        (state->last_index_type == -1 ||
         disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
      uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
         radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
                                    R_03090C_VGT_INDEX_TYPE, 2, index_type);
      } else {
         radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
         radeon_emit(cs, index_type);
      }

      state->last_index_type = index_type;
   }
}

static void
radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
{
   /* For simplicity, if the barrier wants to wait for the task shader,
    * just make it wait for the mesh shader too.
    */
   if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV)
      src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV;

   if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT |
                         VK_PIPELINE_STAGE_2_RESOLVE_BIT |
                         VK_PIPELINE_STAGE_2_BLIT_BIT |
                         VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
      /* Be conservative for now.
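       * These stages are backed by meta operations that may run on either the graphics
       * or the compute pipe, so treat them like generic transfer work.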
       */
      src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
   }

   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
        VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
        VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
        VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
   }

   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
        VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
        VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
        VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
   } else if (src_stage_mask &
              (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
               VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
               VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
               VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
               VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
               VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
               VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
               VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
   }
}

static bool
can_skip_buffer_l2_flushes(struct radv_device *device)
{
   return device->physical_device->rad_info.gfx_level == GFX9 ||
          (device->physical_device->rad_info.gfx_level >= GFX10 &&
           !device->physical_device->rad_info.tcc_rb_non_coherent);
}

/*
 * In Vulkan, barriers have two kinds of operations:
 *
 * - availability (implemented with radv_src_access_flush)
 * - visibility (implemented with radv_dst_access_flush)
 *
 * For a memory operation to observe the result of a previous memory operation,
 * the previous writes first need to be made available (flushed from the source
 * caches) and then visible (invalidated in the destination caches).
 *
 * The complication is that the availability and visibility operations do not
 * need to be in the same barrier.
 *
 * The cleanest way to implement this is to define the availability operation to
 * bring the caches to a "state of rest", in which none of the caches below that
 * level are dirty.
 *
 * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
 *
 * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
 * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
 * images. However, given the existence of memory barriers which do not specify
 * the image/buffer, it often devolves to just VRAM/GTT anyway.
 *
 * To help reduce invalidations on GPUs that have L2 coherency between the RB and
 * the shader caches, we always invalidate L2 on the src side, as we can use our
 * knowledge of past usage to optimize flushes away.
 */

enum radv_cmd_flush_bits
radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 src_flags,
                      const struct radv_image *image)
{
   bool has_CB_meta = true, has_DB_meta = true;
   bool image_is_coherent = image ?
image->l2_coherent : false; 4335 enum radv_cmd_flush_bits flush_bits = 0; 4336 4337 if (image) { 4338 if (!radv_image_has_CB_metadata(image)) 4339 has_CB_meta = false; 4340 if (!radv_image_has_htile(image)) 4341 has_DB_meta = false; 4342 } 4343 4344 u_foreach_bit64(b, src_flags) 4345 { 4346 switch ((VkAccessFlags2)(1 << b)) { 4347 case VK_ACCESS_2_SHADER_WRITE_BIT: 4348 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT: 4349 /* since the STORAGE bit isn't set we know that this is a meta operation. 4350 * on the dst flush side we skip CB/DB flushes without the STORAGE bit, so 4351 * set it here. */ 4352 if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) { 4353 if (vk_format_is_depth_or_stencil(image->vk.format)) { 4354 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 4355 } else { 4356 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 4357 } 4358 } 4359 4360 if (!image_is_coherent) 4361 flush_bits |= RADV_CMD_FLAG_INV_L2; 4362 break; 4363 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR: 4364 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT: 4365 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: 4366 if (!image_is_coherent) 4367 flush_bits |= RADV_CMD_FLAG_WB_L2; 4368 break; 4369 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT: 4370 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 4371 if (has_CB_meta) 4372 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 4373 break; 4374 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: 4375 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 4376 if (has_DB_meta) 4377 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 4378 break; 4379 case VK_ACCESS_2_TRANSFER_WRITE_BIT: 4380 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB; 4381 4382 if (!image_is_coherent) 4383 flush_bits |= RADV_CMD_FLAG_INV_L2; 4384 if (has_CB_meta) 4385 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 4386 if (has_DB_meta) 4387 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 4388 break; 4389 case VK_ACCESS_2_MEMORY_WRITE_BIT: 4390 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB; 4391 4392 if (!image_is_coherent) 4393 flush_bits |= RADV_CMD_FLAG_INV_L2; 4394 if (has_CB_meta) 4395 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 4396 if (has_DB_meta) 4397 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 4398 break; 4399 default: 4400 break; 4401 } 4402 } 4403 return flush_bits; 4404} 4405 4406enum radv_cmd_flush_bits 4407radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_flags, 4408 const struct radv_image *image) 4409{ 4410 bool has_CB_meta = true, has_DB_meta = true; 4411 enum radv_cmd_flush_bits flush_bits = 0; 4412 bool flush_CB = true, flush_DB = true; 4413 bool image_is_coherent = image ? image->l2_coherent : false; 4414 4415 if (image) { 4416 if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) { 4417 flush_CB = false; 4418 flush_DB = false; 4419 } 4420 4421 if (!radv_image_has_CB_metadata(image)) 4422 has_CB_meta = false; 4423 if (!radv_image_has_htile(image)) 4424 has_DB_meta = false; 4425 } 4426 4427 /* All the L2 invalidations below are not the CB/DB. So if there are no incoherent images 4428 * in the L2 cache in CB/DB mode then they are already usable from all the other L2 clients. 
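    * rb_noncoherent_dirty tracks whether such an image may have been written through the
    * RBs since the last flush, so the L2 invalidation is only paid for when it can matter.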
*/ 4429 image_is_coherent |= 4430 can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty; 4431 4432 u_foreach_bit64(b, dst_flags) 4433 { 4434 switch ((VkAccessFlags2)(1 << b)) { 4435 case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT: 4436 /* SMEM loads are used to read compute dispatch size in shaders */ 4437 if (!cmd_buffer->device->load_grid_size_from_user_sgpr) 4438 flush_bits |= RADV_CMD_FLAG_INV_SCACHE; 4439 4440 /* Ensure the DGC meta shader can read the commands. */ 4441 if (cmd_buffer->device->uses_device_generated_commands) { 4442 flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE; 4443 4444 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9) 4445 flush_bits |= RADV_CMD_FLAG_INV_L2; 4446 } 4447 4448 break; 4449 case VK_ACCESS_2_INDEX_READ_BIT: 4450 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: 4451 break; 4452 case VK_ACCESS_2_UNIFORM_READ_BIT: 4453 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE; 4454 break; 4455 case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT: 4456 case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT: 4457 case VK_ACCESS_2_TRANSFER_READ_BIT: 4458 case VK_ACCESS_2_TRANSFER_WRITE_BIT: 4459 flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 4460 4461 if (has_CB_meta || has_DB_meta) 4462 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA; 4463 if (!image_is_coherent) 4464 flush_bits |= RADV_CMD_FLAG_INV_L2; 4465 break; 4466 case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR: 4467 case VK_ACCESS_2_SHADER_READ_BIT: 4468 case VK_ACCESS_2_SHADER_STORAGE_READ_BIT: 4469 flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 4470 /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to 4471 * invalidate the scalar cache. */ 4472 if (!cmd_buffer->device->physical_device->use_llvm && !image) 4473 flush_bits |= RADV_CMD_FLAG_INV_SCACHE; 4474 4475 if (has_CB_meta || has_DB_meta) 4476 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA; 4477 if (!image_is_coherent) 4478 flush_bits |= RADV_CMD_FLAG_INV_L2; 4479 break; 4480 case VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR: 4481 flush_bits |= RADV_CMD_FLAG_INV_VCACHE; 4482 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9) 4483 flush_bits |= RADV_CMD_FLAG_INV_L2; 4484 break; 4485 case VK_ACCESS_2_SHADER_WRITE_BIT: 4486 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT: 4487 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR: 4488 break; 4489 case VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT: 4490 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT: 4491 if (flush_CB) 4492 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 4493 if (has_CB_meta) 4494 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 4495 break; 4496 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT: 4497 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: 4498 if (flush_DB) 4499 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 4500 if (has_DB_meta) 4501 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 4502 break; 4503 case VK_ACCESS_2_MEMORY_READ_BIT: 4504 case VK_ACCESS_2_MEMORY_WRITE_BIT: 4505 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE; 4506 if (!image_is_coherent) 4507 flush_bits |= RADV_CMD_FLAG_INV_L2; 4508 if (flush_CB) 4509 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; 4510 if (has_CB_meta) 4511 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META; 4512 if (flush_DB) 4513 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB; 4514 if (has_DB_meta) 4515 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; 4516 break; 4517 default: 4518 break; 4519 } 4520 } 4521 return flush_bits; 4522} 4523 4524void 
void
radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
                          const struct radv_subpass_barrier *barrier)
{
   struct radv_render_pass *pass = cmd_buffer->state.pass;

   for (uint32_t i = 0; i < pass->attachment_count; i++) {
      struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;

      cmd_buffer->state.flush_bits |=
         radv_src_access_flush(cmd_buffer, barrier->src_access_mask, iview->image);
   }

   radv_stage_flush(cmd_buffer, barrier->src_stage_mask);

   for (uint32_t i = 0; i < pass->attachment_count; i++) {
      struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;

      cmd_buffer->state.flush_bits |=
         radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
   }

   radv_ace_internal_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
}

uint32_t
radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t subpass_id = state->subpass - state->pass->subpasses;

   /* The id of this subpass shouldn't exceed the number of subpasses in
    * this render pass minus 1.
    */
   assert(subpass_id < state->pass->subpass_count);
   return subpass_id;
}

static struct radv_sample_locations_state *
radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
                                     bool begin_subpass)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
   struct radv_image_view *view = state->attachments[att_idx].iview;

   if (view->image->info.samples == 1)
      return NULL;

   if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
      /* Return the initial sample locations if this is the initial
       * layout transition of the given subpass attachment.
       */
      if (state->attachments[att_idx].sample_location.count > 0)
         return &state->attachments[att_idx].sample_location;
   } else {
      /* Otherwise return the subpass sample locations if defined. */
      if (state->subpass_sample_locs) {
         /* Because the driver sets the current subpass before
          * initial layout transitions, we should use the sample
          * locations from the previous subpass to avoid an
          * off-by-one problem. Otherwise, use the sample
          * locations for the current subpass for final layout
          * transitions.
4588 */ 4589 if (begin_subpass) 4590 subpass_id--; 4591 4592 for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) { 4593 if (state->subpass_sample_locs[i].subpass_idx == subpass_id) 4594 return &state->subpass_sample_locs[i].sample_location; 4595 } 4596 } 4597 } 4598 4599 return NULL; 4600} 4601 4602static void 4603radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer, 4604 struct radv_subpass_attachment att, bool begin_subpass) 4605{ 4606 unsigned idx = att.attachment; 4607 struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview; 4608 struct radv_sample_locations_state *sample_locs; 4609 VkImageSubresourceRange range; 4610 range.aspectMask = view->vk.aspects; 4611 range.baseMipLevel = view->vk.base_mip_level; 4612 range.levelCount = 1; 4613 range.baseArrayLayer = view->vk.base_array_layer; 4614 range.layerCount = cmd_buffer->state.framebuffer->layers; 4615 4616 if (cmd_buffer->state.subpass->view_mask) { 4617 /* If the current subpass uses multiview, the driver might have 4618 * performed a fast color/depth clear to the whole image 4619 * (including all layers). To make sure the driver will 4620 * decompress the image correctly (if needed), we have to 4621 * account for the "real" number of layers. If the view mask is 4622 * sparse, this will decompress more layers than needed. 4623 */ 4624 range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask); 4625 } 4626 4627 /* Get the subpass sample locations for the given attachment, if NULL 4628 * is returned the driver will use the default HW locations. 4629 */ 4630 sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass); 4631 4632 /* Determine if the subpass uses separate depth/stencil layouts. */ 4633 bool uses_separate_depth_stencil_layouts = false; 4634 if ((cmd_buffer->state.attachments[idx].current_layout != 4635 cmd_buffer->state.attachments[idx].current_stencil_layout) || 4636 (att.layout != att.stencil_layout)) { 4637 uses_separate_depth_stencil_layouts = true; 4638 } 4639 4640 /* For separate layouts, perform depth and stencil transitions 4641 * separately. 4642 */ 4643 if (uses_separate_depth_stencil_layouts && 4644 (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) { 4645 /* Depth-only transitions. */ 4646 range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; 4647 radv_handle_image_transition(cmd_buffer, view->image, 4648 cmd_buffer->state.attachments[idx].current_layout, 4649 cmd_buffer->state.attachments[idx].current_in_render_loop, 4650 att.layout, att.in_render_loop, 0, 0, &range, sample_locs); 4651 4652 /* Stencil-only transitions. 
*/ 4653 range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; 4654 radv_handle_image_transition( 4655 cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout, 4656 cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout, 4657 att.in_render_loop, 0, 0, &range, sample_locs); 4658 } else { 4659 radv_handle_image_transition(cmd_buffer, view->image, 4660 cmd_buffer->state.attachments[idx].current_layout, 4661 cmd_buffer->state.attachments[idx].current_in_render_loop, 4662 att.layout, att.in_render_loop, 0, 0, &range, sample_locs); 4663 } 4664 4665 cmd_buffer->state.attachments[idx].current_layout = att.layout; 4666 cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout; 4667 cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop; 4668} 4669 4670void 4671radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass) 4672{ 4673 cmd_buffer->state.subpass = subpass; 4674 4675 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER; 4676} 4677 4678static VkResult 4679radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer, 4680 struct radv_render_pass *pass, 4681 const VkRenderPassBeginInfo *info) 4682{ 4683 const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs = 4684 vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT); 4685 struct radv_cmd_state *state = &cmd_buffer->state; 4686 4687 if (!sample_locs) { 4688 state->subpass_sample_locs = NULL; 4689 return VK_SUCCESS; 4690 } 4691 4692 for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) { 4693 const VkAttachmentSampleLocationsEXT *att_sample_locs = 4694 &sample_locs->pAttachmentInitialSampleLocations[i]; 4695 uint32_t att_idx = att_sample_locs->attachmentIndex; 4696 struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image; 4697 4698 assert(vk_format_is_depth_or_stencil(image->vk.format)); 4699 4700 /* From the Vulkan spec 1.1.108: 4701 * 4702 * "If the image referenced by the framebuffer attachment at 4703 * index attachmentIndex was not created with 4704 * VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT 4705 * then the values specified in sampleLocationsInfo are 4706 * ignored." 
4707 */ 4708 if (!(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT)) 4709 continue; 4710 4711 const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo; 4712 4713 state->attachments[att_idx].sample_location.per_pixel = 4714 sample_locs_info->sampleLocationsPerPixel; 4715 state->attachments[att_idx].sample_location.grid_size = 4716 sample_locs_info->sampleLocationGridSize; 4717 state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount; 4718 typed_memcpy(&state->attachments[att_idx].sample_location.locations[0], 4719 sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount); 4720 } 4721 4722 state->subpass_sample_locs = 4723 vk_alloc(&cmd_buffer->pool->vk.alloc, 4724 sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]), 4725 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 4726 if (state->subpass_sample_locs == NULL) { 4727 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 4728 return cmd_buffer->record_result; 4729 } 4730 4731 state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount; 4732 4733 for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) { 4734 const VkSubpassSampleLocationsEXT *subpass_sample_locs_info = 4735 &sample_locs->pPostSubpassSampleLocations[i]; 4736 const VkSampleLocationsInfoEXT *sample_locs_info = 4737 &subpass_sample_locs_info->sampleLocationsInfo; 4738 4739 state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex; 4740 state->subpass_sample_locs[i].sample_location.per_pixel = 4741 sample_locs_info->sampleLocationsPerPixel; 4742 state->subpass_sample_locs[i].sample_location.grid_size = 4743 sample_locs_info->sampleLocationGridSize; 4744 state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount; 4745 typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0], 4746 sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount); 4747 } 4748 4749 return VK_SUCCESS; 4750} 4751 4752static VkResult 4753radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass, 4754 const VkRenderPassBeginInfo *info) 4755{ 4756 struct radv_cmd_state *state = &cmd_buffer->state; 4757 const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL; 4758 4759 if (info) { 4760 attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO); 4761 } 4762 4763 if (pass->attachment_count == 0) { 4764 state->attachments = NULL; 4765 return VK_SUCCESS; 4766 } 4767 4768 state->attachments = 4769 vk_alloc(&cmd_buffer->pool->vk.alloc, pass->attachment_count * sizeof(state->attachments[0]), 4770 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 4771 if (state->attachments == NULL) { 4772 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 4773 return cmd_buffer->record_result; 4774 } 4775 4776 for (uint32_t i = 0; i < pass->attachment_count; ++i) { 4777 struct radv_render_pass_attachment *att = &pass->attachments[i]; 4778 VkImageAspectFlags att_aspects = vk_format_aspects(att->format); 4779 VkImageAspectFlags clear_aspects = 0; 4780 4781 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { 4782 /* color attachment */ 4783 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 4784 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; 4785 } 4786 } else { 4787 /* depthstencil attachment */ 4788 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && 4789 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) 
{ 4790 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; 4791 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 4792 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) 4793 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; 4794 } 4795 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 4796 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 4797 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; 4798 } 4799 } 4800 4801 state->attachments[i].pending_clear_aspects = clear_aspects; 4802 state->attachments[i].cleared_views = 0; 4803 if (clear_aspects && info) { 4804 assert(info->clearValueCount > i); 4805 state->attachments[i].clear_value = info->pClearValues[i]; 4806 } 4807 4808 state->attachments[i].current_layout = att->initial_layout; 4809 state->attachments[i].current_in_render_loop = false; 4810 state->attachments[i].current_stencil_layout = att->stencil_initial_layout; 4811 state->attachments[i].sample_location.count = 0; 4812 4813 struct radv_image_view *iview; 4814 if (attachment_info && attachment_info->attachmentCount > i) { 4815 iview = radv_image_view_from_handle(attachment_info->pAttachments[i]); 4816 } else { 4817 iview = radv_image_view_from_handle(state->framebuffer->attachments[i]); 4818 } 4819 4820 state->attachments[i].iview = iview; 4821 if (iview->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { 4822 radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview); 4823 } else { 4824 radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview); 4825 } 4826 } 4827 4828 return VK_SUCCESS; 4829} 4830 4831VKAPI_ATTR VkResult VKAPI_CALL 4832radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo, 4833 VkCommandBuffer *pCommandBuffers) 4834{ 4835 RADV_FROM_HANDLE(radv_device, device, _device); 4836 RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool); 4837 4838 VkResult result = VK_SUCCESS; 4839 uint32_t i; 4840 4841 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { 4842 4843 if (!list_is_empty(&pool->free_cmd_buffers)) { 4844 struct radv_cmd_buffer *cmd_buffer = 4845 list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link); 4846 4847 list_del(&cmd_buffer->pool_link); 4848 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); 4849 4850 result = radv_reset_cmd_buffer(cmd_buffer); 4851 vk_command_buffer_finish(&cmd_buffer->vk); 4852 VkResult init_result = 4853 vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, pAllocateInfo->level); 4854 if (init_result != VK_SUCCESS) 4855 result = init_result; 4856 4857 pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer); 4858 } else { 4859 result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]); 4860 } 4861 if (result != VK_SUCCESS) 4862 break; 4863 } 4864 4865 if (result != VK_SUCCESS) { 4866 radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers); 4867 4868 /* From the Vulkan 1.0.66 spec: 4869 * 4870 * "vkAllocateCommandBuffers can be used to create multiple 4871 * command buffers. If the creation of any of those command 4872 * buffers fails, the implementation must destroy all 4873 * successfully created command buffer objects from this 4874 * command, set all entries of the pCommandBuffers array to 4875 * NULL and return the error." 
4876 */ 4877 memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount); 4878 } 4879 4880 return result; 4881} 4882 4883VKAPI_ATTR void VKAPI_CALL 4884radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount, 4885 const VkCommandBuffer *pCommandBuffers) 4886{ 4887 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 4888 4889 for (uint32_t i = 0; i < commandBufferCount; i++) { 4890 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]); 4891 4892 if (!cmd_buffer) 4893 continue; 4894 assert(cmd_buffer->pool == pool); 4895 4896 list_del(&cmd_buffer->pool_link); 4897 list_addtail(&cmd_buffer->pool_link, &pool->free_cmd_buffers); 4898 } 4899} 4900 4901VKAPI_ATTR VkResult VKAPI_CALL 4902radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags) 4903{ 4904 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 4905 return radv_reset_cmd_buffer(cmd_buffer); 4906} 4907 4908static void 4909radv_inherit_dynamic_rendering(struct radv_cmd_buffer *cmd_buffer, 4910 const VkCommandBufferInheritanceInfo *inherit_info, 4911 const VkCommandBufferInheritanceRenderingInfo *dyn_info) 4912{ 4913 const VkAttachmentSampleCountInfoAMD *sample_info = 4914 vk_find_struct_const(inherit_info->pNext, ATTACHMENT_SAMPLE_COUNT_INFO_AMD); 4915 VkResult result; 4916 /* (normal + resolve) for color attachments and ds and a VRS attachment */ 4917 VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3]; 4918 VkAttachmentReference2 color_refs[MAX_RTS], ds_ref; 4919 unsigned att_count = 0; 4920 4921 VkSubpassDescription2 subpass = { 4922 .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2, 4923 .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, 4924 .viewMask = dyn_info->viewMask, 4925 .colorAttachmentCount = dyn_info->colorAttachmentCount, 4926 .pColorAttachments = color_refs, 4927 }; 4928 4929 for (unsigned i = 0; i < dyn_info->colorAttachmentCount; ++i) { 4930 if (dyn_info->pColorAttachmentFormats[i] == VK_FORMAT_UNDEFINED) { 4931 color_refs[i] = (VkAttachmentReference2){ 4932 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 4933 .attachment = VK_ATTACHMENT_UNUSED, 4934 }; 4935 continue; 4936 } 4937 4938 color_refs[i] = (VkAttachmentReference2){ 4939 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 4940 .attachment = att_count, 4941 .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */ 4942 .aspectMask = 0, /* Shouldn't be used */ 4943 }; 4944 4945 VkAttachmentDescription2 *att = att_desc + att_count++; 4946 memset(att, 0, sizeof(*att)); 4947 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2; 4948 att->format = dyn_info->pColorAttachmentFormats[i]; 4949 att->samples = 4950 sample_info ? sample_info->pColorAttachmentSamples[i] : dyn_info->rasterizationSamples; 4951 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; 4952 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE; 4953 att->initialLayout = VK_IMAGE_LAYOUT_GENERAL; 4954 att->finalLayout = VK_IMAGE_LAYOUT_GENERAL; 4955 } 4956 4957 if (dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED || 4958 dyn_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) { 4959 VkFormat fmt = dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED 4960 ? 
dyn_info->depthAttachmentFormat
                        : dyn_info->stencilAttachmentFormat;

      ds_ref = (VkAttachmentReference2){
         .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
         .attachment = att_count,
         .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
         .aspectMask = 0,                   /* Shouldn't be used */
      };
      subpass.pDepthStencilAttachment = &ds_ref;

      VkAttachmentDescription2 *att = att_desc + att_count++;

      memset(att, 0, sizeof(*att));
      att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
      att->format = fmt;
      att->samples =
         sample_info ? sample_info->depthStencilAttachmentSamples : dyn_info->rasterizationSamples;
      att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
      att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
      att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
      att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
   }

   VkRenderPassCreateInfo2 rp_create_info = {
      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
      .attachmentCount = att_count,
      .pAttachments = att_desc,
      .subpassCount = 1,
      .pSubpasses = &subpass,
   };

   VkRenderPass rp;
   result =
      radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
   if (result != VK_SUCCESS) {
      cmd_buffer->record_result = result;
      return;
   }

   cmd_buffer->state.pass = radv_render_pass_from_handle(rp);
   cmd_buffer->state.own_render_pass = true;
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VkResult result = VK_SUCCESS;

   if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
      /* If the command buffer has already been reset with
       * vkResetCommandBuffer, there is no need to do it again.
5013 */ 5014 result = radv_reset_cmd_buffer(cmd_buffer); 5015 if (result != VK_SUCCESS) 5016 return result; 5017 } 5018 5019 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); 5020 cmd_buffer->state.last_primitive_reset_en = -1; 5021 cmd_buffer->state.last_index_type = -1; 5022 cmd_buffer->state.last_num_instances = -1; 5023 cmd_buffer->state.last_vertex_offset = -1; 5024 cmd_buffer->state.last_first_instance = -1; 5025 cmd_buffer->state.last_drawid = -1; 5026 cmd_buffer->state.last_subpass_color_count = MAX_RTS; 5027 cmd_buffer->state.predication_type = -1; 5028 cmd_buffer->state.last_sx_ps_downconvert = -1; 5029 cmd_buffer->state.last_sx_blend_opt_epsilon = -1; 5030 cmd_buffer->state.last_sx_blend_opt_control = -1; 5031 cmd_buffer->state.last_nggc_settings = -1; 5032 cmd_buffer->state.last_nggc_settings_sgpr_idx = -1; 5033 cmd_buffer->state.mesh_shading = false; 5034 cmd_buffer->state.last_vrs_rates = -1; 5035 cmd_buffer->state.last_vrs_rates_sgpr_idx = -1; 5036 cmd_buffer->usage_flags = pBeginInfo->flags; 5037 5038 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && 5039 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { 5040 struct radv_subpass *subpass = NULL; 5041 5042 assert(pBeginInfo->pInheritanceInfo); 5043 5044 cmd_buffer->state.framebuffer = 5045 vk_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer); 5046 5047 if (pBeginInfo->pInheritanceInfo->renderPass) { 5048 cmd_buffer->state.pass = 5049 radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); 5050 assert(pBeginInfo->pInheritanceInfo->subpass < cmd_buffer->state.pass->subpass_count); 5051 subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; 5052 } else { 5053 const VkCommandBufferInheritanceRenderingInfo *dyn_info = 5054 vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, 5055 COMMAND_BUFFER_INHERITANCE_RENDERING_INFO); 5056 if (dyn_info) { 5057 radv_inherit_dynamic_rendering(cmd_buffer, pBeginInfo->pInheritanceInfo, dyn_info); 5058 subpass = &cmd_buffer->state.pass->subpasses[0]; 5059 } 5060 } 5061 5062 if (cmd_buffer->state.framebuffer) { 5063 result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL); 5064 if (result != VK_SUCCESS) 5065 return result; 5066 } 5067 5068 cmd_buffer->state.inherited_pipeline_statistics = 5069 pBeginInfo->pInheritanceInfo->pipelineStatistics; 5070 5071 if (cmd_buffer->state.pass) { 5072 cmd_buffer->state.subpass = subpass; 5073 if (cmd_buffer->state.framebuffer) 5074 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER; 5075 } 5076 } 5077 5078 if (unlikely(cmd_buffer->device->trace_bo)) 5079 radv_cmd_buffer_trace_emit(cmd_buffer); 5080 5081 radv_describe_begin_cmd_buffer(cmd_buffer); 5082 5083 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING; 5084 5085 return result; 5086} 5087 5088VKAPI_ATTR void VKAPI_CALL 5089radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding, 5090 uint32_t bindingCount, const VkBuffer *pBuffers, 5091 const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes, 5092 const VkDeviceSize *pStrides) 5093{ 5094 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5095 struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings; 5096 const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 5097 5098 /* We have to defer setting up vertex buffer since we need the buffer 5099 * stride from the pipeline. 
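    * The VBO descriptors themselves are only built later, when the
    * RADV_CMD_DIRTY_VERTEX_BUFFER / RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT bits
    * set at the end of this function are processed (typically at draw time).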
*/ 5100 5101 assert(firstBinding + bindingCount <= MAX_VBS); 5102 enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level; 5103 5104 if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings) 5105 cmd_buffer->used_vertex_bindings = firstBinding + bindingCount; 5106 5107 uint32_t misaligned_mask_invalid = 0; 5108 5109 for (uint32_t i = 0; i < bindingCount; i++) { 5110 RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]); 5111 uint32_t idx = firstBinding + i; 5112 VkDeviceSize size = pSizes ? pSizes[i] : 0; 5113 /* if pStrides=NULL, it shouldn't overwrite the strides specified by CmdSetVertexInputEXT */ 5114 VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride; 5115 5116 if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer || 5117 (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) || 5118 (vb[idx].stride & 0x3) != (stride & 0x3)))) { 5119 misaligned_mask_invalid |= state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff; 5120 } 5121 5122 cmd_buffer->vertex_binding_buffers[idx] = buffer; 5123 vb[idx].offset = pOffsets[i]; 5124 vb[idx].size = size; 5125 vb[idx].stride = stride; 5126 5127 uint32_t bit = BITFIELD_BIT(idx); 5128 if (buffer) { 5129 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo); 5130 cmd_buffer->state.vbo_bound_mask |= bit; 5131 } else { 5132 cmd_buffer->state.vbo_bound_mask &= ~bit; 5133 } 5134 } 5135 5136 if ((chip == GFX6 || chip >= GFX10) && misaligned_mask_invalid) { 5137 cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid; 5138 cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid; 5139 } 5140 5141 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER | 5142 RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT; 5143} 5144 5145static uint32_t 5146vk_to_index_type(VkIndexType type) 5147{ 5148 switch (type) { 5149 case VK_INDEX_TYPE_UINT8_EXT: 5150 return V_028A7C_VGT_INDEX_8; 5151 case VK_INDEX_TYPE_UINT16: 5152 return V_028A7C_VGT_INDEX_16; 5153 case VK_INDEX_TYPE_UINT32: 5154 return V_028A7C_VGT_INDEX_32; 5155 default: 5156 unreachable("invalid index type"); 5157 } 5158} 5159 5160uint32_t 5161radv_get_vgt_index_size(uint32_t type) 5162{ 5163 uint32_t index_type = G_028A7C_INDEX_TYPE(type); 5164 switch (index_type) { 5165 case V_028A7C_VGT_INDEX_8: 5166 return 1; 5167 case V_028A7C_VGT_INDEX_16: 5168 return 2; 5169 case V_028A7C_VGT_INDEX_32: 5170 return 4; 5171 default: 5172 unreachable("invalid index type"); 5173 } 5174} 5175 5176VKAPI_ATTR void VKAPI_CALL 5177radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, 5178 VkIndexType indexType) 5179{ 5180 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5181 RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer); 5182 5183 cmd_buffer->state.index_buffer = index_buffer; 5184 cmd_buffer->state.index_offset = offset; 5185 cmd_buffer->state.index_type = vk_to_index_type(indexType); 5186 cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo); 5187 cmd_buffer->state.index_va += index_buffer->offset + offset; 5188 5189 int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType)); 5190 cmd_buffer->state.max_index_count = 5191 (vk_buffer_range(&index_buffer->vk, offset, VK_WHOLE_SIZE)) / index_size; 5192 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; 5193 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo); 5194} 5195 5196static void 5197radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, 
VkPipelineBindPoint bind_point, 5198 struct radv_descriptor_set *set, unsigned idx) 5199{ 5200 struct radeon_winsys *ws = cmd_buffer->device->ws; 5201 5202 radv_set_descriptor_set(cmd_buffer, bind_point, set, idx); 5203 5204 assert(set); 5205 assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); 5206 5207 if (!cmd_buffer->device->use_global_bo_list) { 5208 for (unsigned j = 0; j < set->header.buffer_count; ++j) 5209 if (set->descriptors[j]) 5210 radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]); 5211 } 5212 5213 if (set->header.bo) 5214 radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo); 5215} 5216 5217VKAPI_ATTR void VKAPI_CALL 5218radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, 5219 VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount, 5220 const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount, 5221 const uint32_t *pDynamicOffsets) 5222{ 5223 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5224 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 5225 unsigned dyn_idx = 0; 5226 5227 const bool no_dynamic_bounds = 5228 cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS; 5229 struct radv_descriptor_state *descriptors_state = 5230 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint); 5231 5232 for (unsigned i = 0; i < descriptorSetCount; ++i) { 5233 unsigned set_idx = i + firstSet; 5234 RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]); 5235 5236 if (!set) { 5237 /* From the Vulkan spec 1.3.211: 5238 * 5239 * "VUID-vkCmdBindDescriptorSets-layout-06564 5240 * If layout was not created with VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT, each 5241 * element of pDescriptorSets must be a valid VkDescriptorSet" 5242 */ 5243 assert(layout->independent_sets); 5244 continue; 5245 } 5246 5247 /* If the set is already bound we only need to update the 5248 * (potentially changed) dynamic offsets. */ 5249 if (descriptors_state->sets[set_idx] != set || 5250 !(descriptors_state->valid & (1u << set_idx))) { 5251 radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx); 5252 } 5253 5254 for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) { 5255 unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start; 5256 uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4; 5257 assert(dyn_idx < dynamicOffsetCount); 5258 5259 struct radv_descriptor_range *range = set->header.dynamic_descriptors + j; 5260 5261 if (!range->va) { 5262 memset(dst, 0, 4 * 4); 5263 } else { 5264 uint64_t va = range->va + pDynamicOffsets[dyn_idx]; 5265 dst[0] = va; 5266 dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); 5267 dst[2] = no_dynamic_bounds ? 
0xffffffffu : range->size; 5268 dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 5269 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 5270 5271 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { 5272 dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) | 5273 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); 5274 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) { 5275 dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 5276 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 5277 } else { 5278 dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 5279 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 5280 } 5281 } 5282 5283 cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages; 5284 } 5285 } 5286} 5287 5288static bool 5289radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set, 5290 struct radv_descriptor_set_layout *layout, 5291 VkPipelineBindPoint bind_point) 5292{ 5293 struct radv_descriptor_state *descriptors_state = 5294 radv_get_descriptors_state(cmd_buffer, bind_point); 5295 set->header.size = layout->size; 5296 5297 if (set->header.layout != layout) { 5298 if (set->header.layout) 5299 vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->header.layout->vk); 5300 vk_descriptor_set_layout_ref(&layout->vk); 5301 set->header.layout = layout; 5302 } 5303 5304 if (descriptors_state->push_set.capacity < set->header.size) { 5305 size_t new_size = MAX2(set->header.size, 1024); 5306 new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity); 5307 new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS); 5308 5309 free(set->header.mapped_ptr); 5310 set->header.mapped_ptr = malloc(new_size); 5311 5312 if (!set->header.mapped_ptr) { 5313 descriptors_state->push_set.capacity = 0; 5314 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 5315 return false; 5316 } 5317 5318 descriptors_state->push_set.capacity = new_size; 5319 } 5320 5321 return true; 5322} 5323 5324void 5325radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, 5326 VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout, 5327 uint32_t set, uint32_t descriptorWriteCount, 5328 const VkWriteDescriptorSet *pDescriptorWrites) 5329{ 5330 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 5331 struct radv_descriptor_set *push_set = 5332 (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors; 5333 unsigned bo_offset; 5334 5335 assert(set == 0); 5336 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); 5337 5338 push_set->header.size = layout->set[set].layout->size; 5339 push_set->header.layout = layout->set[set].layout; 5340 5341 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset, 5342 (void **)&push_set->header.mapped_ptr)) 5343 return; 5344 5345 push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); 5346 push_set->header.va += bo_offset; 5347 5348 radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer, 5349 radv_descriptor_set_to_handle(push_set), descriptorWriteCount, 5350 pDescriptorWrites, 0, NULL); 5351 5352 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set); 5353} 5354 5355VKAPI_ATTR void VKAPI_CALL 5356radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, 5357 VkPipelineLayout _layout, uint32_t 
set, uint32_t descriptorWriteCount, 5358 const VkWriteDescriptorSet *pDescriptorWrites) 5359{ 5360 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5361 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 5362 struct radv_descriptor_state *descriptors_state = 5363 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint); 5364 struct radv_descriptor_set *push_set = 5365 (struct radv_descriptor_set *)&descriptors_state->push_set.set; 5366 5367 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); 5368 5369 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout, 5370 pipelineBindPoint)) 5371 return; 5372 5373 /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR() 5374 * because it is invalid, according to Vulkan spec. 5375 */ 5376 for (int i = 0; i < descriptorWriteCount; i++) { 5377 ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i]; 5378 assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK); 5379 } 5380 5381 radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer, 5382 radv_descriptor_set_to_handle(push_set), descriptorWriteCount, 5383 pDescriptorWrites, 0, NULL); 5384 5385 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set); 5386 descriptors_state->push_dirty = true; 5387} 5388 5389VKAPI_ATTR void VKAPI_CALL 5390radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer, 5391 VkDescriptorUpdateTemplate descriptorUpdateTemplate, 5392 VkPipelineLayout _layout, uint32_t set, const void *pData) 5393{ 5394 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5395 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout); 5396 RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate); 5397 struct radv_descriptor_state *descriptors_state = 5398 radv_get_descriptors_state(cmd_buffer, templ->bind_point); 5399 struct radv_descriptor_set *push_set = 5400 (struct radv_descriptor_set *)&descriptors_state->push_set.set; 5401 5402 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR); 5403 5404 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout, 5405 templ->bind_point)) 5406 return; 5407 5408 radv_cmd_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set, 5409 descriptorUpdateTemplate, pData); 5410 5411 radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set); 5412 descriptors_state->push_dirty = true; 5413} 5414 5415VKAPI_ATTR void VKAPI_CALL 5416radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, 5417 VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size, 5418 const void *pValues) 5419{ 5420 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5421 memcpy(cmd_buffer->push_constants + offset, pValues, size); 5422 cmd_buffer->push_constant_stages |= stageFlags; 5423} 5424 5425VKAPI_ATTR VkResult VKAPI_CALL 5426radv_EndCommandBuffer(VkCommandBuffer commandBuffer) 5427{ 5428 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5429 5430 radv_emit_mip_change_flush_default(cmd_buffer); 5431 5432 if (cmd_buffer->qf != RADV_QUEUE_TRANSFER) { 5433 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6) 5434 cmd_buffer->state.flush_bits |= 5435 RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2; 5436 5437 /* Make sure to sync all pending active queries at the end of 5438 
* command buffer. 5439 */ 5440 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits; 5441 5442 /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a 5443 * command buffer. 5444 */ 5445 if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device)) 5446 cmd_buffer->state.flush_bits |= radv_src_access_flush( 5447 cmd_buffer, 5448 VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | 5449 VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, 5450 NULL); 5451 5452 /* Since NGG streamout uses GDS, we need to make GDS idle when 5453 * we leave the IB, otherwise another process might overwrite 5454 * it while our shaders are busy. 5455 */ 5456 if (cmd_buffer->gds_needed) 5457 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH; 5458 5459 /* Finalize the internal compute command stream, if it exists. */ 5460 if (cmd_buffer->ace_internal.cs) { 5461 VkResult result = radv_ace_internal_finalize(cmd_buffer); 5462 if (result != VK_SUCCESS) 5463 return vk_error(cmd_buffer, result); 5464 } 5465 5466 si_emit_cache_flush(cmd_buffer); 5467 } 5468 5469 /* Make sure CP DMA is idle at the end of IBs because the kernel 5470 * doesn't wait for it. 5471 */ 5472 si_cp_dma_wait_for_idle(cmd_buffer); 5473 5474 radv_describe_end_cmd_buffer(cmd_buffer); 5475 5476 vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments); 5477 vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs); 5478 5479 VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs); 5480 if (result != VK_SUCCESS) 5481 return vk_error(cmd_buffer, result); 5482 5483 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE; 5484 5485 return cmd_buffer->record_result; 5486} 5487 5488static void 5489radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, 5490 struct radv_compute_pipeline *pipeline) 5491{ 5492 if (pipeline == cmd_buffer->state.emitted_compute_pipeline) 5493 return; 5494 5495 assert(!pipeline->base.ctx_cs.cdw); 5496 5497 cmd_buffer->state.emitted_compute_pipeline = pipeline; 5498 5499 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.cs.cdw); 5500 radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw); 5501 5502 cmd_buffer->compute_scratch_size_per_wave_needed = 5503 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave); 5504 cmd_buffer->compute_scratch_waves_wanted = 5505 MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->base.max_waves); 5506 5507 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo); 5508 5509 if (unlikely(cmd_buffer->device->trace_bo)) 5510 radv_save_pipeline(cmd_buffer, &pipeline->base); 5511} 5512 5513static void 5514radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point) 5515{ 5516 struct radv_descriptor_state *descriptors_state = 5517 radv_get_descriptors_state(cmd_buffer, bind_point); 5518 5519 descriptors_state->dirty |= descriptors_state->valid; 5520} 5521 5522VKAPI_ATTR void VKAPI_CALL 5523radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, 5524 VkPipeline _pipeline) 5525{ 5526 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5527 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); 5528 5529 switch (pipelineBindPoint) { 5530 case VK_PIPELINE_BIND_POINT_COMPUTE: { 5531 struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline); 5532 5533 if 
(cmd_buffer->state.compute_pipeline == compute_pipeline) 5534 return; 5535 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint); 5536 5537 cmd_buffer->state.compute_pipeline = compute_pipeline; 5538 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT; 5539 cmd_buffer->task_rings_needed |= 5540 pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.uses_task_rings; 5541 break; 5542 } 5543 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: { 5544 struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline); 5545 5546 if (cmd_buffer->state.rt_pipeline == compute_pipeline) 5547 return; 5548 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint); 5549 5550 cmd_buffer->state.rt_pipeline = compute_pipeline; 5551 cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS; 5552 if (compute_pipeline->dynamic_stack_size) 5553 radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size); 5554 break; 5555 } 5556 case VK_PIPELINE_BIND_POINT_GRAPHICS: { 5557 struct radv_graphics_pipeline *graphics_pipeline = 5558 pipeline ? radv_pipeline_to_graphics(pipeline) : NULL; 5559 5560 if (cmd_buffer->state.graphics_pipeline == graphics_pipeline) 5561 return; 5562 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint); 5563 5564 bool vtx_emit_count_changed = 5565 !pipeline || !cmd_buffer->state.graphics_pipeline || 5566 cmd_buffer->state.graphics_pipeline->vtx_emit_num != graphics_pipeline->vtx_emit_num || 5567 cmd_buffer->state.graphics_pipeline->vtx_base_sgpr != graphics_pipeline->vtx_base_sgpr; 5568 cmd_buffer->state.graphics_pipeline = graphics_pipeline; 5569 if (!pipeline) 5570 break; 5571 5572 bool mesh_shading = radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH); 5573 if (mesh_shading != cmd_buffer->state.mesh_shading) { 5574 /* Re-emit VRS state because the combiner is different (vertex vs primitive). 5575 * Re-emit primitive topology because the mesh shading pipeline clobbered it. 5576 */ 5577 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE | 5578 RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY; 5579 } 5580 5581 cmd_buffer->state.mesh_shading = mesh_shading; 5582 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT; 5583 cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages; 5584 5585 /* the new vertex shader might not have the same user regs */ 5586 if (vtx_emit_count_changed) { 5587 cmd_buffer->state.last_first_instance = -1; 5588 cmd_buffer->state.last_vertex_offset = -1; 5589 cmd_buffer->state.last_drawid = -1; 5590 } 5591 5592 /* Prefetch all pipeline shaders at first draw time. */ 5593 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS; 5594 5595 if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug && 5596 cmd_buffer->state.emitted_graphics_pipeline && 5597 cmd_buffer->state.emitted_graphics_pipeline->is_ngg && 5598 !cmd_buffer->state.graphics_pipeline->is_ngg) { 5599 /* Transitioning from NGG to legacy GS requires 5600 * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH 5601 * is also emitted at the beginning of IBs when legacy 5602 * GS ring pointers are set. 
5603 */ 5604 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH; 5605 } 5606 5607 radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state); 5608 5609 if (graphics_pipeline->esgs_ring_size > cmd_buffer->esgs_ring_size_needed) 5610 cmd_buffer->esgs_ring_size_needed = graphics_pipeline->esgs_ring_size; 5611 if (graphics_pipeline->gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed) 5612 cmd_buffer->gsvs_ring_size_needed = graphics_pipeline->gsvs_ring_size; 5613 5614 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL)) 5615 cmd_buffer->tess_rings_needed = true; 5616 if (mesh_shading) 5617 cmd_buffer->mesh_scratch_ring_needed |= 5618 pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring; 5619 5620 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) { 5621 if (!cmd_buffer->ace_internal.cs) { 5622 cmd_buffer->ace_internal.cs = radv_ace_internal_create(cmd_buffer); 5623 if (!cmd_buffer->ace_internal.cs) 5624 return; 5625 } 5626 5627 cmd_buffer->task_rings_needed = true; 5628 } 5629 break; 5630 } 5631 default: 5632 assert(!"invalid bind point"); 5633 break; 5634 } 5635} 5636 5637VKAPI_ATTR void VKAPI_CALL 5638radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, 5639 const VkViewport *pViewports) 5640{ 5641 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5642 struct radv_cmd_state *state = &cmd_buffer->state; 5643 ASSERTED const uint32_t total_count = firstViewport + viewportCount; 5644 5645 assert(firstViewport < MAX_VIEWPORTS); 5646 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS); 5647 5648 if (state->dynamic.viewport.count < total_count) 5649 state->dynamic.viewport.count = total_count; 5650 5651 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports, 5652 viewportCount * sizeof(*pViewports)); 5653 for (unsigned i = 0; i < viewportCount; i++) { 5654 radv_get_viewport_xform(&pViewports[i], 5655 state->dynamic.viewport.xform[i + firstViewport].scale, 5656 state->dynamic.viewport.xform[i + firstViewport].translate); 5657 } 5658 5659 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT; 5660} 5661 5662VKAPI_ATTR void VKAPI_CALL 5663radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, 5664 const VkRect2D *pScissors) 5665{ 5666 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5667 struct radv_cmd_state *state = &cmd_buffer->state; 5668 ASSERTED const uint32_t total_count = firstScissor + scissorCount; 5669 5670 assert(firstScissor < MAX_SCISSORS); 5671 assert(total_count >= 1 && total_count <= MAX_SCISSORS); 5672 5673 if (state->dynamic.scissor.count < total_count) 5674 state->dynamic.scissor.count = total_count; 5675 5676 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors, 5677 scissorCount * sizeof(*pScissors)); 5678 5679 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR; 5680} 5681 5682VKAPI_ATTR void VKAPI_CALL 5683radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth) 5684{ 5685 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5686 5687 if (cmd_buffer->state.dynamic.line_width != lineWidth) 5688 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR; 5689 5690 cmd_buffer->state.dynamic.line_width = lineWidth; 5691 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH; 5692} 5693 5694VKAPI_ATTR void VKAPI_CALL 5695radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor, 5696 float depthBiasClamp, float depthBiasSlopeFactor) 5697{ 
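   /* Only record the new bias/clamp/slope values in the dynamic state here;
    * the corresponding registers are expected to be emitted later, when the
    * RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS flag set below is processed. */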
5698 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5699 struct radv_cmd_state *state = &cmd_buffer->state; 5700 5701 state->dynamic.depth_bias.bias = depthBiasConstantFactor; 5702 state->dynamic.depth_bias.clamp = depthBiasClamp; 5703 state->dynamic.depth_bias.slope = depthBiasSlopeFactor; 5704 5705 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS; 5706} 5707 5708VKAPI_ATTR void VKAPI_CALL 5709radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4]) 5710{ 5711 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5712 struct radv_cmd_state *state = &cmd_buffer->state; 5713 5714 memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4); 5715 5716 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS; 5717} 5718 5719VKAPI_ATTR void VKAPI_CALL 5720radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds) 5721{ 5722 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5723 struct radv_cmd_state *state = &cmd_buffer->state; 5724 5725 state->dynamic.depth_bounds.min = minDepthBounds; 5726 state->dynamic.depth_bounds.max = maxDepthBounds; 5727 5728 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS; 5729} 5730 5731VKAPI_ATTR void VKAPI_CALL 5732radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, 5733 uint32_t compareMask) 5734{ 5735 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5736 struct radv_cmd_state *state = &cmd_buffer->state; 5737 5738 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 5739 state->dynamic.stencil_compare_mask.front = compareMask; 5740 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 5741 state->dynamic.stencil_compare_mask.back = compareMask; 5742 5743 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK; 5744} 5745 5746VKAPI_ATTR void VKAPI_CALL 5747radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, 5748 uint32_t writeMask) 5749{ 5750 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5751 struct radv_cmd_state *state = &cmd_buffer->state; 5752 5753 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 5754 state->dynamic.stencil_write_mask.front = writeMask; 5755 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 5756 state->dynamic.stencil_write_mask.back = writeMask; 5757 5758 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK; 5759} 5760 5761VKAPI_ATTR void VKAPI_CALL 5762radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, 5763 uint32_t reference) 5764{ 5765 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5766 5767 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 5768 cmd_buffer->state.dynamic.stencil_reference.front = reference; 5769 if (faceMask & VK_STENCIL_FACE_BACK_BIT) 5770 cmd_buffer->state.dynamic.stencil_reference.back = reference; 5771 5772 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE; 5773} 5774 5775VKAPI_ATTR void VKAPI_CALL 5776radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle, 5777 uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles) 5778{ 5779 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5780 struct radv_cmd_state *state = &cmd_buffer->state; 5781 ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount; 5782 5783 assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES); 5784 assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES); 5785 5786 
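   /* Only the requested range is overwritten below; discard rectangles outside
    * [firstDiscardRectangle, firstDiscardRectangle + discardRectangleCount)
    * keep whatever values were previously set. */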
typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle], 5787 pDiscardRectangles, discardRectangleCount); 5788 5789 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE; 5790} 5791 5792VKAPI_ATTR void VKAPI_CALL 5793radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, 5794 const VkSampleLocationsInfoEXT *pSampleLocationsInfo) 5795{ 5796 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5797 struct radv_cmd_state *state = &cmd_buffer->state; 5798 5799 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS); 5800 5801 state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel; 5802 state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize; 5803 state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount; 5804 typed_memcpy(&state->dynamic.sample_location.locations[0], 5805 pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount); 5806 5807 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS; 5808} 5809 5810VKAPI_ATTR void VKAPI_CALL 5811radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor, 5812 uint16_t lineStipplePattern) 5813{ 5814 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5815 struct radv_cmd_state *state = &cmd_buffer->state; 5816 5817 state->dynamic.line_stipple.factor = lineStippleFactor; 5818 state->dynamic.line_stipple.pattern = lineStipplePattern; 5819 5820 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE; 5821} 5822 5823VKAPI_ATTR void VKAPI_CALL 5824radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode) 5825{ 5826 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5827 struct radv_cmd_state *state = &cmd_buffer->state; 5828 5829 state->dynamic.cull_mode = cullMode; 5830 5831 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE; 5832} 5833 5834VKAPI_ATTR void VKAPI_CALL 5835radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace) 5836{ 5837 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5838 struct radv_cmd_state *state = &cmd_buffer->state; 5839 5840 state->dynamic.front_face = frontFace; 5841 5842 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE; 5843} 5844 5845VKAPI_ATTR void VKAPI_CALL 5846radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology) 5847{ 5848 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5849 struct radv_cmd_state *state = &cmd_buffer->state; 5850 unsigned primitive_topology = si_translate_prim(primitiveTopology); 5851 5852 if ((state->dynamic.primitive_topology == V_008958_DI_PT_LINESTRIP) != 5853 (primitive_topology == V_008958_DI_PT_LINESTRIP)) 5854 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE; 5855 5856 if (radv_prim_is_points_or_lines(state->dynamic.primitive_topology) != 5857 radv_prim_is_points_or_lines(primitive_topology)) 5858 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR; 5859 5860 state->dynamic.primitive_topology = primitive_topology; 5861 5862 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY; 5863} 5864 5865VKAPI_ATTR void VKAPI_CALL 5866radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount, 5867 const VkViewport *pViewports) 5868{ 5869 radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports); 5870} 5871 5872VKAPI_ATTR void VKAPI_CALL 5873radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount, 5874 const VkRect2D *pScissors) 
5875{ 5876 radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors); 5877} 5878 5879VKAPI_ATTR void VKAPI_CALL 5880radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable) 5881 5882{ 5883 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5884 struct radv_cmd_state *state = &cmd_buffer->state; 5885 5886 state->dynamic.depth_test_enable = depthTestEnable; 5887 5888 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE; 5889} 5890 5891VKAPI_ATTR void VKAPI_CALL 5892radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable) 5893{ 5894 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5895 struct radv_cmd_state *state = &cmd_buffer->state; 5896 5897 state->dynamic.depth_write_enable = depthWriteEnable; 5898 5899 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE; 5900} 5901 5902VKAPI_ATTR void VKAPI_CALL 5903radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp) 5904{ 5905 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5906 struct radv_cmd_state *state = &cmd_buffer->state; 5907 5908 state->dynamic.depth_compare_op = depthCompareOp; 5909 5910 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP; 5911} 5912 5913VKAPI_ATTR void VKAPI_CALL 5914radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable) 5915{ 5916 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5917 struct radv_cmd_state *state = &cmd_buffer->state; 5918 5919 state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable; 5920 5921 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE; 5922} 5923 5924VKAPI_ATTR void VKAPI_CALL 5925radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable) 5926{ 5927 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5928 struct radv_cmd_state *state = &cmd_buffer->state; 5929 5930 state->dynamic.stencil_test_enable = stencilTestEnable; 5931 5932 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE; 5933} 5934 5935VKAPI_ATTR void VKAPI_CALL 5936radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, 5937 VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp, 5938 VkCompareOp compareOp) 5939{ 5940 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5941 struct radv_cmd_state *state = &cmd_buffer->state; 5942 5943 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) { 5944 state->dynamic.stencil_op.front.fail_op = failOp; 5945 state->dynamic.stencil_op.front.pass_op = passOp; 5946 state->dynamic.stencil_op.front.depth_fail_op = depthFailOp; 5947 state->dynamic.stencil_op.front.compare_op = compareOp; 5948 } 5949 5950 if (faceMask & VK_STENCIL_FACE_BACK_BIT) { 5951 state->dynamic.stencil_op.back.fail_op = failOp; 5952 state->dynamic.stencil_op.back.pass_op = passOp; 5953 state->dynamic.stencil_op.back.depth_fail_op = depthFailOp; 5954 state->dynamic.stencil_op.back.compare_op = compareOp; 5955 } 5956 5957 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP; 5958} 5959 5960VKAPI_ATTR void VKAPI_CALL 5961radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize, 5962 const VkFragmentShadingRateCombinerOpKHR combinerOps[2]) 5963{ 5964 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5965 struct radv_cmd_state *state = &cmd_buffer->state; 5966 5967 state->dynamic.fragment_shading_rate.size = *pFragmentSize; 5968 for (unsigned i = 0; i < 2; i++) 5969 
state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i]; 5970 5971 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE; 5972} 5973 5974VKAPI_ATTR void VKAPI_CALL 5975radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable) 5976{ 5977 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5978 struct radv_cmd_state *state = &cmd_buffer->state; 5979 5980 state->dynamic.depth_bias_enable = depthBiasEnable; 5981 5982 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE; 5983} 5984 5985VKAPI_ATTR void VKAPI_CALL 5986radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable) 5987{ 5988 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 5989 struct radv_cmd_state *state = &cmd_buffer->state; 5990 5991 state->dynamic.primitive_restart_enable = primitiveRestartEnable; 5992 5993 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE; 5994} 5995 5996VKAPI_ATTR void VKAPI_CALL 5997radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable) 5998{ 5999 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6000 struct radv_cmd_state *state = &cmd_buffer->state; 6001 6002 state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable; 6003 6004 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE; 6005} 6006 6007VKAPI_ATTR void VKAPI_CALL 6008radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints) 6009{ 6010 /* not implemented */ 6011} 6012 6013VKAPI_ATTR void VKAPI_CALL 6014radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp) 6015{ 6016 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6017 struct radv_cmd_state *state = &cmd_buffer->state; 6018 unsigned logic_op = si_translate_blend_logic_op(logicOp); 6019 6020 state->dynamic.logic_op = logic_op; 6021 6022 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP; 6023} 6024 6025VKAPI_ATTR void VKAPI_CALL 6026radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount, 6027 const VkBool32 *pColorWriteEnables) 6028{ 6029 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6030 struct radv_cmd_state *state = &cmd_buffer->state; 6031 uint32_t color_write_enable = 0; 6032 6033 assert(attachmentCount <= MAX_RTS); 6034 6035 for (uint32_t i = 0; i < attachmentCount; i++) { 6036 color_write_enable |= pColorWriteEnables[i] ? 
(0xfu << (i * 4)) : 0; 6037 } 6038 6039 state->dynamic.color_write_enable = color_write_enable; 6040 6041 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE; 6042} 6043 6044VKAPI_ATTR void VKAPI_CALL 6045radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount, 6046 const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions, 6047 uint32_t vertexAttributeDescriptionCount, 6048 const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions) 6049{ 6050 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6051 struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; 6052 6053 const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS]; 6054 for (unsigned i = 0; i < vertexBindingDescriptionCount; i++) 6055 bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i]; 6056 6057 cmd_buffer->state.vbo_misaligned_mask = 0; 6058 cmd_buffer->state.vbo_misaligned_mask_invalid = 0; 6059 6060 memset(state, 0, sizeof(*state)); 6061 state->bindings_match_attrib = true; 6062 6063 enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level; 6064 for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) { 6065 const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i]; 6066 const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding]; 6067 unsigned loc = attrib->location; 6068 6069 state->attribute_mask |= 1u << loc; 6070 state->bindings[loc] = attrib->binding; 6071 if (attrib->binding != loc) 6072 state->bindings_match_attrib = false; 6073 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) { 6074 state->instance_rate_inputs |= 1u << loc; 6075 state->divisors[loc] = binding->divisor; 6076 if (binding->divisor == 0) { 6077 state->zero_divisors |= 1u << loc; 6078 } else if (binding->divisor > 1) { 6079 state->nontrivial_divisors |= 1u << loc; 6080 } 6081 } 6082 cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride; 6083 state->offsets[loc] = attrib->offset; 6084 6085 struct dynamic_vertex_format_cache *found = NULL; 6086 util_dynarray_foreach(&cmd_buffer->cached_vertex_formats, 6087 struct dynamic_vertex_format_cache, 6088 vf) { 6089 if (vf->format == attrib->format) { 6090 found = vf; 6091 break; 6092 } 6093 } 6094 if (!found) { 6095 unsigned nfmt, dfmt; 6096 bool post_shuffle; 6097 enum radv_vs_input_alpha_adjust alpha_adjust; 6098 const struct util_format_description *format_desc = vk_format_description(attrib->format); 6099 6100 found = util_dynarray_grow(&cmd_buffer->cached_vertex_formats, 6101 struct dynamic_vertex_format_cache, 1); 6102 radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc, 6103 &dfmt, &nfmt, &post_shuffle, &alpha_adjust); 6104 found->format = attrib->format; 6105 found->hw_fmt = dfmt | (nfmt << 4); 6106 const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 
3 : (format_desc->block.bits / 8u - 1);
         found->fmt_align_req_minus_1 = format_align_req_minus_1;
         found->fmt_size = format_desc->block.bits / 8u;
         found->post_shuffle = post_shuffle;
         found->alpha_adjust_lo = alpha_adjust & 0x1;
         found->alpha_adjust_hi = (alpha_adjust >> 1) & 0x1;
      }

      state->formats[loc] = found->hw_fmt;
      state->format_align_req_minus_1[loc] = found->fmt_align_req_minus_1;
      state->format_sizes[loc] = found->fmt_size;
      state->alpha_adjust_lo |= found->alpha_adjust_lo << loc;
      state->alpha_adjust_hi |= found->alpha_adjust_hi << loc;
      if (found->post_shuffle)
         state->post_shuffle |= 1u << loc;

      if ((chip == GFX6 || chip >= GFX10) &&
          cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
         if (binding->stride & found->fmt_align_req_minus_1) {
            cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
         } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + state->offsets[loc]) &
                    found->fmt_align_req_minus_1) {
            cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
         }
      }
   }

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
                        const VkCommandBuffer *pCmdBuffers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);

   assert(commandBufferCount > 0);

   radv_emit_mip_change_flush_default(primary);

   /* Emit pending flushes on primary prior to executing secondary */
   si_emit_cache_flush(primary);

   /* Make sure CP DMA is idle on primary prior to executing secondary. */
   si_cp_dma_wait_for_idle(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
      bool allow_ib2 = true;

      if (secondary->device->physical_device->rad_info.gfx_level == GFX7 &&
          secondary->state.uses_draw_indirect_multi) {
         /* Do not launch an IB2 for secondary command buffers that contain
          * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hangs the GPU.
          */
         allow_ib2 = false;
      }

      if (secondary->qf == RADV_QUEUE_COMPUTE) {
         /* IB2 packets are not supported on compute queues according to PAL.
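          * In both cases allow_ib2 is cleared below, so ws->cs_execute_secondary is
          * expected to splice/copy the secondary's commands into the primary stream
          * instead of launching them as an IB2.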
*/ 6168 allow_ib2 = false; 6169 } 6170 6171 primary->scratch_size_per_wave_needed = 6172 MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed); 6173 primary->scratch_waves_wanted = 6174 MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted); 6175 primary->compute_scratch_size_per_wave_needed = 6176 MAX2(primary->compute_scratch_size_per_wave_needed, 6177 secondary->compute_scratch_size_per_wave_needed); 6178 primary->compute_scratch_waves_wanted = 6179 MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted); 6180 6181 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed) 6182 primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed; 6183 if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed) 6184 primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed; 6185 if (secondary->tess_rings_needed) 6186 primary->tess_rings_needed = true; 6187 if (secondary->task_rings_needed) 6188 primary->task_rings_needed = true; 6189 if (secondary->mesh_scratch_ring_needed) 6190 primary->mesh_scratch_ring_needed = true; 6191 if (secondary->sample_positions_needed) 6192 primary->sample_positions_needed = true; 6193 if (secondary->gds_needed) 6194 primary->gds_needed = true; 6195 6196 if (!secondary->state.framebuffer && primary->state.pass && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) { 6197 /* Emit the framebuffer state from primary if secondary 6198 * has been recorded without a framebuffer, otherwise 6199 * fast color/depth clears can't work. 6200 */ 6201 radv_emit_fb_mip_change_flush(primary); 6202 radv_emit_framebuffer_state(primary); 6203 } 6204 6205 if (secondary->ace_internal.cs) { 6206 if (!primary->ace_internal.cs) { 6207 primary->ace_internal.cs = radv_ace_internal_create(primary); 6208 if (!primary->ace_internal.cs) 6209 return; 6210 } 6211 6212 struct radeon_cmdbuf *ace_primary = primary->ace_internal.cs; 6213 struct radeon_cmdbuf *ace_secondary = secondary->ace_internal.cs; 6214 6215 /* Emit pending flushes on primary prior to executing secondary. */ 6216 radv_ace_internal_cache_flush(primary); 6217 6218 /* Wait for primary GFX->ACE semaphore, if necessary. */ 6219 if (radv_flush_gfx2ace_semaphore(primary)) 6220 radv_wait_gfx2ace_semaphore(primary); 6221 6222 /* Execute the secondary compute cmdbuf. 6223 * Don't use IB2 packets because they are not supported on compute queues. 6224 */ 6225 primary->device->ws->cs_execute_secondary(ace_primary, ace_secondary, false); 6226 } 6227 6228 /* Update pending ACE internal flush bits from the secondary cmdbuf */ 6229 primary->ace_internal.flush_bits |= secondary->ace_internal.flush_bits; 6230 6231 /* Increment primary semaphore if secondary was dirty. 6232 * This happens when the secondary cmdbuf has a barrier which 6233 * isn't consumed by a draw call. 6234 */ 6235 if (radv_ace_internal_sem_dirty(secondary)) 6236 primary->ace_internal.sem.gfx2ace_value++; 6237 6238 primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2); 6239 6240 /* When the secondary command buffer is compute only we don't 6241 * need to re-emit the current graphics pipeline. 6242 */ 6243 if (secondary->state.emitted_graphics_pipeline) { 6244 primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline; 6245 } 6246 6247 /* When the secondary command buffer is graphics only we don't 6248 * need to re-emit the current compute pipeline. 
6249 */ 6250 if (secondary->state.emitted_compute_pipeline) { 6251 primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline; 6252 } 6253 6254 /* Only re-emit the draw packets when needed. */ 6255 if (secondary->state.last_primitive_reset_en != -1) { 6256 primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en; 6257 } 6258 6259 if (secondary->state.last_primitive_reset_index) { 6260 primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index; 6261 } 6262 6263 if (secondary->state.last_ia_multi_vgt_param) { 6264 primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param; 6265 } 6266 6267 primary->state.last_first_instance = secondary->state.last_first_instance; 6268 primary->state.last_num_instances = secondary->state.last_num_instances; 6269 primary->state.last_drawid = secondary->state.last_drawid; 6270 primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count; 6271 primary->state.last_vertex_offset = secondary->state.last_vertex_offset; 6272 primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert; 6273 primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon; 6274 primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control; 6275 6276 if (secondary->state.last_index_type != -1) { 6277 primary->state.last_index_type = secondary->state.last_index_type; 6278 } 6279 6280 primary->state.last_nggc_settings = secondary->state.last_nggc_settings; 6281 primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx; 6282 primary->state.last_nggc_skip = secondary->state.last_nggc_skip; 6283 6284 primary->state.last_vrs_rates = secondary->state.last_vrs_rates; 6285 primary->state.last_vrs_rates_sgpr_idx = secondary->state.last_vrs_rates_sgpr_idx; 6286 } 6287 6288 /* After executing commands from secondary buffers we have to dirty 6289 * some states. 
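 * The secondaries may have bound arbitrary pipelines, descriptor sets, index
 * buffers and dynamic state, so the primary's shadowed copies of these states
 * can no longer be trusted and must be re-emitted before the next draw/dispatch.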
6290 */ 6291 primary->state.dirty |= 6292 RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL; 6293 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS); 6294 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE); 6295} 6296 6297VKAPI_ATTR VkResult VKAPI_CALL 6298radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo, 6299 const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool) 6300{ 6301 RADV_FROM_HANDLE(radv_device, device, _device); 6302 struct radv_cmd_pool *pool; 6303 6304 pool = 6305 vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 6306 if (pool == NULL) 6307 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 6308 6309 VkResult result = vk_command_pool_init(&pool->vk, &device->vk, pCreateInfo, pAllocator); 6310 if (result != VK_SUCCESS) { 6311 vk_free2(&device->vk.alloc, pAllocator, pool); 6312 return result; 6313 } 6314 6315 list_inithead(&pool->cmd_buffers); 6316 list_inithead(&pool->free_cmd_buffers); 6317 6318 *pCmdPool = radv_cmd_pool_to_handle(pool); 6319 6320 return VK_SUCCESS; 6321} 6322 6323VKAPI_ATTR void VKAPI_CALL 6324radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool, 6325 const VkAllocationCallbacks *pAllocator) 6326{ 6327 RADV_FROM_HANDLE(radv_device, device, _device); 6328 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 6329 6330 if (!pool) 6331 return; 6332 6333 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link) 6334 { 6335 radv_destroy_cmd_buffer(cmd_buffer); 6336 } 6337 6338 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link) 6339 { 6340 radv_destroy_cmd_buffer(cmd_buffer); 6341 } 6342 6343 vk_command_pool_finish(&pool->vk); 6344 vk_free2(&device->vk.alloc, pAllocator, pool); 6345} 6346 6347VKAPI_ATTR VkResult VKAPI_CALL 6348radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags) 6349{ 6350 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 6351 VkResult result; 6352 6353 list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link) 6354 { 6355 result = radv_reset_cmd_buffer(cmd_buffer); 6356 if (result != VK_SUCCESS) 6357 return result; 6358 } 6359 6360 return VK_SUCCESS; 6361} 6362 6363VKAPI_ATTR void VKAPI_CALL 6364radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags) 6365{ 6366 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool); 6367 6368 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link) 6369 { 6370 radv_destroy_cmd_buffer(cmd_buffer); 6371 } 6372} 6373 6374static void 6375radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id) 6376{ 6377 struct radv_cmd_state *state = &cmd_buffer->state; 6378 struct radv_subpass *subpass = &state->pass->subpasses[subpass_id]; 6379 6380 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096); 6381 6382 radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier); 6383 6384 radv_cmd_buffer_set_subpass(cmd_buffer, subpass); 6385 6386 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC); 6387 6388 for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 6389 const uint32_t a = subpass->attachments[i].attachment; 6390 if (a == VK_ATTACHMENT_UNUSED) 6391 continue; 6392 6393 radv_handle_subpass_image_transition(cmd_buffer, 
subpass->attachments[i], true); 6394 } 6395 6396 radv_ace_internal_barrier(cmd_buffer, 0, 0); 6397 radv_describe_barrier_end(cmd_buffer); 6398 6399 radv_cmd_buffer_clear_subpass(cmd_buffer); 6400 6401 if (subpass->vrs_attachment) { 6402 int idx = subpass->vrs_attachment->attachment; 6403 struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview; 6404 6405 if (subpass->depth_stencil_attachment) { 6406 /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to 6407 * copy the VRS rates to the HTILE buffer of the attachment. 6408 */ 6409 int ds_idx = subpass->depth_stencil_attachment->attachment; 6410 struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview; 6411 struct radv_image *ds_image = ds_iview->image; 6412 uint32_t level = ds_iview->vk.base_mip_level; 6413 6414 VkExtent2D extent = { 6415 .width = radv_minify(ds_image->info.width, level), 6416 .height = radv_minify(ds_image->info.height, level), 6417 }; 6418 6419 /* HTILE buffer */ 6420 uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset + 6421 ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset; 6422 uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size; 6423 struct radv_buffer htile_buffer; 6424 6425 radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bindings[0].bo, htile_size, htile_offset); 6426 6427 /* Copy the VRS rates to the HTILE buffer. */ 6428 radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true); 6429 6430 radv_buffer_finish(&htile_buffer); 6431 } else { 6432 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have 6433 * to copy the VRS rates to our internal HTILE buffer. 6434 */ 6435 struct vk_framebuffer *fb = cmd_buffer->state.framebuffer; 6436 struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer); 6437 6438 if (ds_image) { 6439 /* HTILE buffer */ 6440 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer; 6441 6442 VkExtent2D extent = { 6443 .width = MIN2(fb->width, ds_image->info.width), 6444 .height = MIN2(fb->height, ds_image->info.height), 6445 }; 6446 6447 /* Copy the VRS rates to the HTILE buffer. */ 6448 radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false); 6449 } 6450 } 6451 } 6452 6453 assert(cmd_buffer->cs->cdw <= cdw_max); 6454} 6455 6456static void 6457radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer) 6458{ 6459 const struct radv_subpass *subpass = cmd_buffer->state.subpass; 6460 6461 /* Have to be conservative in cmdbuffers with inherited attachments. 
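 * Without the attachment list we cannot tell whether every bound color/depth
 * target is L2-coherent, so assume the render backends wrote non-coherently.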
*/ 6462 if (!cmd_buffer->state.attachments) { 6463 cmd_buffer->state.rb_noncoherent_dirty = true; 6464 return; 6465 } 6466 6467 for (uint32_t i = 0; i < subpass->color_count; ++i) { 6468 const uint32_t a = subpass->color_attachments[i].attachment; 6469 if (a == VK_ATTACHMENT_UNUSED) 6470 continue; 6471 if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) { 6472 cmd_buffer->state.rb_noncoherent_dirty = true; 6473 return; 6474 } 6475 } 6476 if (subpass->depth_stencil_attachment && 6477 !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment] 6478 .iview->image->l2_coherent) 6479 cmd_buffer->state.rb_noncoherent_dirty = true; 6480} 6481 6482void 6483radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer, 6484 const struct radv_subpass *subpass) 6485{ 6486 radv_mark_noncoherent_rb(cmd_buffer); 6487 radv_cmd_buffer_set_subpass(cmd_buffer, subpass); 6488} 6489 6490static void 6491radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer) 6492{ 6493 struct radv_cmd_state *state = &cmd_buffer->state; 6494 const struct radv_subpass *subpass = state->subpass; 6495 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer); 6496 6497 radv_cmd_buffer_resolve_subpass(cmd_buffer); 6498 6499 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC); 6500 6501 for (uint32_t i = 0; i < subpass->attachment_count; ++i) { 6502 const uint32_t a = subpass->attachments[i].attachment; 6503 if (a == VK_ATTACHMENT_UNUSED) 6504 continue; 6505 6506 if (state->pass->attachments[a].last_subpass_idx != subpass_id) 6507 continue; 6508 6509 VkImageLayout layout = state->pass->attachments[a].final_layout; 6510 VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout; 6511 struct radv_subpass_attachment att = {a, layout, stencil_layout}; 6512 radv_handle_subpass_image_transition(cmd_buffer, att, false); 6513 } 6514 6515 radv_ace_internal_barrier(cmd_buffer, 0, 0); 6516 radv_describe_barrier_end(cmd_buffer); 6517} 6518 6519VKAPI_ATTR void VKAPI_CALL 6520radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, 6521 const VkRenderPassBeginInfo *pRenderPassBeginInfo, 6522 const VkSubpassBeginInfo *pSubpassBeginInfo) 6523{ 6524 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6525 RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBeginInfo->renderPass); 6526 RADV_FROM_HANDLE(vk_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer); 6527 VkResult result; 6528 6529 cmd_buffer->state.framebuffer = framebuffer; 6530 cmd_buffer->state.pass = pass; 6531 cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea; 6532 6533 result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBeginInfo); 6534 if (result != VK_SUCCESS) 6535 return; 6536 6537 result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBeginInfo); 6538 if (result != VK_SUCCESS) 6539 return; 6540 6541 radv_cmd_buffer_begin_subpass(cmd_buffer, 0); 6542} 6543 6544VKAPI_ATTR void VKAPI_CALL 6545radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo, 6546 const VkSubpassEndInfo *pSubpassEndInfo) 6547{ 6548 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 6549 6550 radv_mark_noncoherent_rb(cmd_buffer); 6551 6552 uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer); 6553 radv_cmd_buffer_end_subpass(cmd_buffer); 6554 radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); 6555} 6556 6557static void 6558radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, struct 
radv_graphics_pipeline *pipeline, 6559 unsigned stage, unsigned index) 6560{ 6561 struct radv_userdata_info *loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_VIEW_INDEX); 6562 if (loc->sgpr_idx == -1) 6563 return; 6564 uint32_t base_reg = pipeline->base.user_data_0[stage]; 6565 radeon_set_sh_reg(cs, base_reg + loc->sgpr_idx * 4, index); 6566} 6567 6568static void 6569radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index) 6570{ 6571 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 6572 6573 radv_foreach_stage(stage, pipeline->active_stages & ~VK_SHADER_STAGE_TASK_BIT_NV) { 6574 radv_emit_view_index_per_stage(cmd_buffer->cs, pipeline, stage, index); 6575 } 6576 if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) { 6577 struct radv_userdata_info *loc = 6578 &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX]; 6579 if (loc->sgpr_idx != -1) { 6580 uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0; 6581 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index); 6582 } 6583 } 6584 if (pipeline->active_stages & VK_SHADER_STAGE_TASK_BIT_NV) { 6585 radv_emit_view_index_per_stage(cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK, 6586 index); 6587 } 6588} 6589 6590/** 6591 * Emulates predication for MEC using COND_EXEC. 6592 * When the current command buffer is predicating, emit a COND_EXEC packet 6593 * so that the MEC skips the next few dwords worth of packets. 6594 * 6595 * To make it work with inverted conditional rendering, we allocate 6596 * space in the upload BO and emit some packets to invert the condition. 6597 */ 6598static void 6599radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs, 6600 uint64_t inv_va, bool *inv_emitted, unsigned dwords) 6601{ 6602 if (!state->predicating) 6603 return; 6604 6605 uint64_t va = state->predication_va; 6606 6607 if (!state->predication_type) { 6608 /* Invert the condition the first time it is needed. */ 6609 if (!*inv_emitted) { 6610 *inv_emitted = true; 6611 6612 /* Write 1 to the inverted predication VA. */ 6613 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 6614 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 6615 COPY_DATA_WR_CONFIRM); 6616 radeon_emit(cs, 1); 6617 radeon_emit(cs, 0); 6618 radeon_emit(cs, inv_va); 6619 radeon_emit(cs, inv_va >> 32); 6620 6621 /* If the API predication VA == 0, skip next command. */ 6622 radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); 6623 radeon_emit(cs, va); 6624 radeon_emit(cs, va >> 32); 6625 radeon_emit(cs, 0); 6626 radeon_emit(cs, 6); /* 1x COPY_DATA size */ 6627 6628 /* Write 0 to the new predication VA (when the API condition != 0) */ 6629 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 6630 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 6631 COPY_DATA_WR_CONFIRM); 6632 radeon_emit(cs, 0); 6633 radeon_emit(cs, 0); 6634 radeon_emit(cs, inv_va); 6635 radeon_emit(cs, inv_va >> 32); 6636 } 6637 6638 va = inv_va; 6639 } 6640 6641 radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); 6642 radeon_emit(cs, va); 6643 radeon_emit(cs, va >> 32); 6644 radeon_emit(cs, 0); /* Cache policy */ 6645 radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. 
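 * For example, the COND_EXEC emitted above for the condition inversion passes 6,
 * the size in dwords of the single COPY_DATA packet it predicates.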
*/ 6646} 6647 6648static void 6649radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count, 6650 uint32_t use_opaque) 6651{ 6652 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating)); 6653 radeon_emit(cmd_buffer->cs, vertex_count); 6654 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); 6655} 6656 6657/** 6658 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count` vertices. 6659 * 6660 * The starting address "index_va" may point anywhere within the index buffer. The number of 6661 * indexes allocated in the index buffer *past that point* is specified by "max_index_count". 6662 * Hardware uses this information to return 0 for out-of-bounds reads. 6663 */ 6664static void 6665radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va, 6666 uint32_t max_index_count, uint32_t index_count, bool not_eop) 6667{ 6668 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating)); 6669 radeon_emit(cmd_buffer->cs, max_index_count); 6670 radeon_emit(cmd_buffer->cs, index_va); 6671 radeon_emit(cmd_buffer->cs, index_va >> 32); 6672 radeon_emit(cmd_buffer->cs, index_count); 6673 /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs 6674 * can be changed between draws and GS fast launch must be disabled. 6675 * NOT_EOP doesn't work on gfx9 and older. 6676 */ 6677 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop)); 6678} 6679 6680/* MUST inline this function to avoid massive perf loss in drawoverhead */ 6681ALWAYS_INLINE static void 6682radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed, 6683 uint32_t draw_count, uint64_t count_va, uint32_t stride) 6684{ 6685 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6686 const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX; 6687 bool draw_id_enable = cmd_buffer->state.graphics_pipeline->uses_drawid; 6688 uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr; 6689 uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0; 6690 bool predicating = cmd_buffer->state.predicating; 6691 bool mesh = cmd_buffer->state.mesh_shading; 6692 assert(base_reg); 6693 6694 /* just reset draw state for vertex data */ 6695 cmd_buffer->state.last_first_instance = -1; 6696 cmd_buffer->state.last_num_instances = -1; 6697 cmd_buffer->state.last_drawid = -1; 6698 cmd_buffer->state.last_vertex_offset = -1; 6699 6700 vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2; 6701 if (cmd_buffer->state.graphics_pipeline->uses_baseinstance) 6702 start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2; 6703 if (draw_id_enable) 6704 draw_id_reg = ((base_reg + mesh * 12 + 4) - SI_SH_REG_OFFSET) >> 2; 6705 6706 if (draw_count == 1 && !count_va && !draw_id_enable) { 6707 radeon_emit(cs, 6708 PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating)); 6709 radeon_emit(cs, 0); 6710 radeon_emit(cs, vertex_offset_reg); 6711 radeon_emit(cs, start_instance_reg); 6712 radeon_emit(cs, di_src_sel); 6713 } else { 6714 radeon_emit(cs, PKT3(indexed ? 
PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, 6715 predicating)); 6716 radeon_emit(cs, 0); 6717 radeon_emit(cs, vertex_offset_reg); 6718 radeon_emit(cs, start_instance_reg); 6719 radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | 6720 S_2C3_COUNT_INDIRECT_ENABLE(!!count_va)); 6721 radeon_emit(cs, draw_count); /* count */ 6722 radeon_emit(cs, count_va); /* count_addr */ 6723 radeon_emit(cs, count_va >> 32); 6724 radeon_emit(cs, stride); /* stride */ 6725 radeon_emit(cs, di_src_sel); 6726 6727 cmd_buffer->state.uses_draw_indirect_multi = true; 6728 } 6729} 6730 6731ALWAYS_INLINE static void 6732radv_cs_emit_dispatch_taskmesh_direct_ace_packet(struct radv_cmd_buffer *cmd_buffer, 6733 const uint32_t x, const uint32_t y, 6734 const uint32_t z) 6735{ 6736 struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base; 6737 struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK); 6738 struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs; 6739 const bool predicating = cmd_buffer->state.predicating; 6740 const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task | 6741 S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32); 6742 6743 struct radv_userdata_info *ring_entry_loc = 6744 radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY); 6745 assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1); 6746 6747 uint32_t ring_entry_reg = 6748 (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2; 6749 6750 radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1)); 6751 radeon_emit(cs, x); 6752 radeon_emit(cs, y); 6753 radeon_emit(cs, z); 6754 radeon_emit(cs, dispatch_initiator); 6755 radeon_emit(cs, ring_entry_reg & 0xFFFF); 6756} 6757 6758ALWAYS_INLINE static void 6759radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(struct radv_cmd_buffer *cmd_buffer, 6760 uint64_t data_va, uint32_t draw_count, 6761 uint64_t count_va, uint32_t stride) 6762{ 6763 assert((data_va & 0x03) == 0); 6764 assert((count_va & 0x03) == 0); 6765 6766 struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base; 6767 struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK); 6768 struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs; 6769 6770 const uint32_t count_indirect_enable = !!count_va; 6771 const uint32_t xyz_dim_enable = compute_shader->info.cs.uses_grid_size; 6772 const uint32_t draw_id_enable = compute_shader->info.vs.needs_draw_id; 6773 const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task | 6774 S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32); 6775 6776 const struct radv_userdata_info *ring_entry_loc = 6777 radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY); 6778 const struct radv_userdata_info *xyz_dim_loc = 6779 radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE); 6780 const struct radv_userdata_info *draw_id_loc = 6781 radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID); 6782 6783 assert(ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1); 6784 assert(!xyz_dim_enable || (xyz_dim_loc->sgpr_idx != -1 && xyz_dim_loc->num_sgprs == 3)); 6785 assert(!draw_id_enable || (draw_id_loc->sgpr_idx != -1 && draw_id_loc->num_sgprs == 1)); 6786 6787 const uint32_t ring_entry_reg = 6788 (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - 
SI_SH_REG_OFFSET) >> 2; 6789 const uint32_t xyz_dim_reg = 6790 !xyz_dim_enable 6791 ? 0 6792 : (R_00B900_COMPUTE_USER_DATA_0 + xyz_dim_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2; 6793 const uint32_t draw_id_reg = 6794 !draw_id_enable 6795 ? 0 6796 : (R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2; 6797 6798 radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1)); 6799 radeon_emit(cs, data_va); 6800 radeon_emit(cs, data_va >> 32); 6801 radeon_emit(cs, ring_entry_reg & 0xFFFF); 6802 radeon_emit(cs, (count_indirect_enable << 1) | (draw_id_enable << 2) | (xyz_dim_enable << 3) | 6803 (draw_id_reg << 16)); 6804 radeon_emit(cs, xyz_dim_reg & 0xFFFF); 6805 radeon_emit(cs, draw_count); 6806 radeon_emit(cs, count_va); 6807 radeon_emit(cs, count_va >> 32); 6808 radeon_emit(cs, stride); 6809 radeon_emit(cs, dispatch_initiator); 6810} 6811 6812ALWAYS_INLINE static void 6813radv_cs_emit_dispatch_taskmesh_gfx_packet(struct radv_cmd_buffer *cmd_buffer) 6814{ 6815 struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base; 6816 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6817 bool predicating = cmd_buffer->state.predicating; 6818 6819 struct radv_userdata_info *ring_entry_loc = 6820 radv_lookup_user_sgpr(pipeline, MESA_SHADER_MESH, AC_UD_TASK_RING_ENTRY); 6821 6822 assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1); 6823 6824 uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr; 6825 uint32_t xyz_dim_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2; 6826 uint32_t ring_entry_reg = ((base_reg + ring_entry_loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2; 6827 6828 radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating)); 6829 radeon_emit(cs, (ring_entry_reg << 16) | (xyz_dim_reg & 0xFFFF)); 6830 radeon_emit(cs, 0); 6831 radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX); 6832} 6833 6834static inline void 6835radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer, 6836 const struct radv_draw_info *info, const uint32_t vertex_offset) 6837{ 6838 struct radv_cmd_state *state = &cmd_buffer->state; 6839 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6840 const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance; 6841 const bool uses_drawid = state->graphics_pipeline->uses_drawid; 6842 6843 radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num); 6844 6845 radeon_emit(cs, vertex_offset); 6846 state->last_vertex_offset = vertex_offset; 6847 if (uses_drawid) { 6848 radeon_emit(cs, 0); 6849 state->last_drawid = 0; 6850 } 6851 if (uses_baseinstance) { 6852 radeon_emit(cs, info->first_instance); 6853 state->last_first_instance = info->first_instance; 6854 } 6855} 6856 6857ALWAYS_INLINE static void 6858radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 6859 const uint32_t vertex_offset) 6860{ 6861 const struct radv_cmd_state *state = &cmd_buffer->state; 6862 const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance; 6863 const bool uses_drawid = state->graphics_pipeline->uses_drawid; 6864 6865 /* this looks very dumb, but it allows the compiler to optimize better and yields 6866 * ~3-4% perf increase in drawoverhead 6867 */ 6868 if (vertex_offset != state->last_vertex_offset) { 6869 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset); 6870 } else if (uses_drawid && 0 != state->last_drawid) { 6871 radv_emit_userdata_vertex_internal(cmd_buffer, info, 
vertex_offset); 6872 } else if (uses_baseinstance && info->first_instance != state->last_first_instance) { 6873 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset); 6874 } 6875} 6876 6877ALWAYS_INLINE static void 6878radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid) 6879{ 6880 struct radv_cmd_state *state = &cmd_buffer->state; 6881 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6882 radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, 1 + !!drawid); 6883 radeon_emit(cs, vertex_offset); 6884 state->last_vertex_offset = vertex_offset; 6885 if (drawid) 6886 radeon_emit(cs, drawid); 6887 6888} 6889 6890ALWAYS_INLINE static void 6891radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer, 6892 const uint32_t x, const uint32_t y, const uint32_t z, 6893 const uint32_t first_task) 6894{ 6895 struct radv_cmd_state *state = &cmd_buffer->state; 6896 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6897 const bool uses_drawid = state->graphics_pipeline->uses_drawid; 6898 6899 radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num); 6900 radeon_emit(cs, first_task); 6901 radeon_emit(cs, x); 6902 radeon_emit(cs, y); 6903 radeon_emit(cs, z); 6904 6905 if (uses_drawid) { 6906 radeon_emit(cs, 0); 6907 state->last_drawid = 0; 6908 } 6909} 6910 6911ALWAYS_INLINE static void 6912radv_emit_userdata_mesh_first_task_0_draw_id_0(struct radv_cmd_buffer *cmd_buffer) 6913{ 6914 struct radv_cmd_state *state = &cmd_buffer->state; 6915 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6916 struct radv_graphics_pipeline *pipeline = state->graphics_pipeline; 6917 const bool uses_drawid = pipeline->uses_drawid; 6918 6919 radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr, 1); 6920 radeon_emit(cs, 0); 6921 6922 if (uses_drawid) { 6923 radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr + (pipeline->vtx_emit_num - 1) * 4, 1); 6924 radeon_emit(cs, 0); 6925 } 6926} 6927 6928ALWAYS_INLINE static void 6929radv_emit_userdata_task_ib_only(struct radv_cmd_buffer *cmd_buffer, uint64_t ib_va, 6930 uint32_t ib_stride) 6931{ 6932 struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base; 6933 struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs; 6934 6935 struct radv_userdata_info *task_ib_loc = 6936 radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_IB); 6937 6938 if (task_ib_loc->sgpr_idx != -1) { 6939 assert(task_ib_loc->num_sgprs == 3); 6940 unsigned task_ib_reg = R_00B900_COMPUTE_USER_DATA_0 + task_ib_loc->sgpr_idx * 4; 6941 6942 radeon_set_sh_reg_seq(cs, task_ib_reg, 3); 6943 radeon_emit(cs, ib_va); 6944 radeon_emit(cs, ib_va >> 32); 6945 radeon_emit(cs, ib_stride); 6946 } 6947} 6948 6949ALWAYS_INLINE static void 6950radv_emit_userdata_task(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z, 6951 uint32_t draw_id, uint32_t first_task, uint64_t ib_va) 6952{ 6953 struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base; 6954 struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs; 6955 6956 struct radv_userdata_info *xyz_loc = 6957 radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE); 6958 struct radv_userdata_info *draw_id_loc = 6959 radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID); 6960 6961 if (xyz_loc->sgpr_idx != -1) { 6962 assert(xyz_loc->num_sgprs == 3); 6963 unsigned xyz_reg = R_00B900_COMPUTE_USER_DATA_0 + xyz_loc->sgpr_idx * 4; 6964 6965 radeon_set_sh_reg_seq(cs, xyz_reg, 3); 6966 
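      /* Write the task shader's grid size into three consecutive user SGPRs (x, y, z). */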
radeon_emit(cs, x); 6967 radeon_emit(cs, y); 6968 radeon_emit(cs, z); 6969 } 6970 6971 if (draw_id_loc->sgpr_idx != -1) { 6972 assert(draw_id_loc->num_sgprs == 1); 6973 unsigned draw_id_reg = R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4; 6974 6975 radeon_set_sh_reg_seq(cs, draw_id_reg, 1); 6976 radeon_emit(cs, draw_id); 6977 } 6978 6979 radv_emit_userdata_task_ib_only(cmd_buffer, ib_va, first_task ? 8 : 0); 6980} 6981 6982ALWAYS_INLINE static void 6983radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer, 6984 const struct radv_draw_info *info, 6985 uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo, 6986 uint32_t stride, 6987 const int32_t *vertexOffset) 6988 6989{ 6990 struct radv_cmd_state *state = &cmd_buffer->state; 6991 struct radeon_cmdbuf *cs = cmd_buffer->cs; 6992 const int index_size = radv_get_vgt_index_size(state->index_type); 6993 unsigned i = 0; 6994 const bool uses_drawid = state->graphics_pipeline->uses_drawid; 6995 const bool can_eop = 6996 !uses_drawid && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10; 6997 6998 if (uses_drawid) { 6999 if (vertexOffset) { 7000 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset); 7001 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 7002 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 7003 7004 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 7005 if (!remaining_indexes && 7006 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 7007 continue; 7008 7009 if (i > 0) 7010 radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i); 7011 7012 const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 7013 7014 if (!state->subpass->view_mask) { 7015 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 7016 } else { 7017 u_foreach_bit(view, state->subpass->view_mask) { 7018 radv_emit_view_index(cmd_buffer, view); 7019 7020 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 7021 } 7022 } 7023 } 7024 } else { 7025 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 7026 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 7027 7028 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 7029 if (!remaining_indexes && 7030 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 7031 continue; 7032 7033 if (i > 0) { 7034 if (state->last_vertex_offset != draw->vertexOffset) 7035 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i); 7036 else 7037 radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i); 7038 } else 7039 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset); 7040 7041 const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 7042 7043 if (!state->subpass->view_mask) { 7044 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 7045 } else { 7046 u_foreach_bit(view, state->subpass->view_mask) { 7047 radv_emit_view_index(cmd_buffer, view); 7048 7049 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 7050 } 7051 } 7052 } 7053 } 7054 if (drawCount > 1) { 7055 state->last_drawid = drawCount - 1; 7056 } 7057 } else { 7058 if (vertexOffset) { 7059 if 
(cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10) { 7060 /* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have 7061 * count == 0 for the last draw that doesn't have NOT_EOP. 7062 */ 7063 while (drawCount > 1) { 7064 const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride); 7065 if (last->indexCount) 7066 break; 7067 drawCount--; 7068 } 7069 } 7070 7071 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset); 7072 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 7073 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 7074 7075 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 7076 if (!remaining_indexes && 7077 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 7078 continue; 7079 7080 const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 7081 7082 if (!state->subpass->view_mask) { 7083 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1); 7084 } else { 7085 u_foreach_bit(view, state->subpass->view_mask) { 7086 radv_emit_view_index(cmd_buffer, view); 7087 7088 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 7089 } 7090 } 7091 } 7092 } else { 7093 vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) { 7094 const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex; 7095 7096 /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */ 7097 if (!remaining_indexes && 7098 cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) 7099 continue; 7100 7101 const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL); 7102 const bool offset_changes = next && next->vertexOffset != draw->vertexOffset; 7103 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset); 7104 7105 const uint64_t index_va = state->index_va + draw->firstIndex * index_size; 7106 7107 if (!state->subpass->view_mask) { 7108 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1); 7109 } else { 7110 u_foreach_bit(view, state->subpass->view_mask) { 7111 radv_emit_view_index(cmd_buffer, view); 7112 7113 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false); 7114 } 7115 } 7116 } 7117 } 7118 if (drawCount > 1) { 7119 state->last_drawid = drawCount - 1; 7120 } 7121 } 7122} 7123 7124ALWAYS_INLINE static void 7125radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 7126 uint32_t drawCount, const VkMultiDrawInfoEXT *minfo, 7127 uint32_t use_opaque, uint32_t stride) 7128{ 7129 unsigned i = 0; 7130 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask; 7131 const bool uses_drawid = cmd_buffer->state.graphics_pipeline->uses_drawid; 7132 uint32_t last_start = 0; 7133 7134 vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) { 7135 if (!i) 7136 radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex); 7137 else 7138 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? 
i : 0); 7139 7140 if (!view_mask) { 7141 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque); 7142 } else { 7143 u_foreach_bit(view, view_mask) { 7144 radv_emit_view_index(cmd_buffer, view); 7145 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque); 7146 } 7147 } 7148 last_start = draw->firstVertex; 7149 } 7150 if (drawCount > 1) { 7151 struct radv_cmd_state *state = &cmd_buffer->state; 7152 state->last_vertex_offset = last_start; 7153 if (uses_drawid) 7154 state->last_drawid = drawCount - 1; 7155 } 7156} 7157 7158ALWAYS_INLINE static void 7159radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, 7160 uint32_t x, uint32_t y, uint32_t z, 7161 uint32_t first_task) 7162{ 7163 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask; 7164 const uint32_t count = x * y * z; 7165 7166 radv_emit_userdata_mesh(cmd_buffer, x, y, z, first_task); 7167 7168 if (!view_mask) { 7169 radv_cs_emit_draw_packet(cmd_buffer, count, 0); 7170 } else { 7171 u_foreach_bit(view, view_mask) { 7172 radv_emit_view_index(cmd_buffer, view); 7173 radv_cs_emit_draw_packet(cmd_buffer, count, 0); 7174 } 7175 } 7176} 7177 7178ALWAYS_INLINE static void 7179radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, 7180 uint32_t z, uint32_t first_task) 7181{ 7182 uint64_t fake_ib_va = 0; 7183 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask; 7184 const unsigned num_views = MAX2(1, util_bitcount(view_mask)); 7185 unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */ 7186 7187 if (first_task) { 7188 /* Pass this as the IB to the shader for emulating firstTask in task shaders. */ 7189 uint32_t fake_ib_dwords[2] = {x, first_task}; 7190 unsigned fake_ib_offset; 7191 radv_cmd_buffer_upload_data(cmd_buffer, 8, fake_ib_dwords, &fake_ib_offset); 7192 fake_ib_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + fake_ib_offset; 7193 } 7194 7195 radv_emit_userdata_task(cmd_buffer, x, y, z, 0, first_task, fake_ib_va); 7196 radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer); 7197 radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs, 7198 cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted, 7199 ace_predication_size); 7200 7201 if (!view_mask) { 7202 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z); 7203 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer); 7204 } else { 7205 u_foreach_bit (view, view_mask) { 7206 radv_emit_view_index(cmd_buffer, view); 7207 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z); 7208 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer); 7209 } 7210 } 7211} 7212 7213static void 7214radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, 7215 const struct radv_draw_info *info, uint64_t nv_ib_va, 7216 uint32_t nv_ib_stride) 7217{ 7218 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask; 7219 struct radeon_winsys *ws = cmd_buffer->device->ws; 7220 const unsigned num_views = MAX2(1, util_bitcount(view_mask)); 7221 unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */ 7222 struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs; 7223 7224 const uint64_t va = 7225 radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset; 7226 const uint64_t count_va = !info->count_buffer 7227 ? 
0 7228 : radv_buffer_get_va(info->count_buffer->bo) + 7229 info->count_buffer->offset + info->count_buffer_offset; 7230 uint64_t workaround_cond_va = 0; 7231 7232 if (count_va) { 7233 radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->count_buffer->bo); 7234 7235 /* MEC firmware bug workaround. 7236 * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs. 7237 * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE 7238 * is only executed when the count buffer contains non-zero. 7239 * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet 7240 * has a matching ACE packet. 7241 * 7242 * As a workaround: 7243 * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround 7244 * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch 7245 * - When workaround BO != 0 (count was 0), execute an empty direct dispatch 7246 */ 7247 7248 uint32_t workaround_cond_init = 0; 7249 uint32_t workaround_cond_off; 7250 if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off)) 7251 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; 7252 7253 workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off; 7254 7255 radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0)); 7256 radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 7257 COPY_DATA_WR_CONFIRM); 7258 radeon_emit(ace_cs, 1); 7259 radeon_emit(ace_cs, 0); 7260 radeon_emit(ace_cs, workaround_cond_va); 7261 radeon_emit(ace_cs, workaround_cond_va >> 32); 7262 7263 /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */ 7264 ace_predication_size += 2 * 5 + 6 + 6 * num_views; 7265 } 7266 7267 radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->indirect->bo); 7268 radv_emit_userdata_task_ib_only(cmd_buffer, nv_ib_va, nv_ib_stride); 7269 radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer); 7270 radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs, 7271 cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted, 7272 ace_predication_size); 7273 7274 if (workaround_cond_va) { 7275 radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0)); 7276 radeon_emit(ace_cs, count_va); 7277 radeon_emit(ace_cs, count_va >> 32); 7278 radeon_emit(ace_cs, 0); 7279 radeon_emit(ace_cs, 7280 6 + 11 * num_views); /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */ 7281 7282 radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0)); 7283 radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 7284 COPY_DATA_WR_CONFIRM); 7285 radeon_emit(ace_cs, 0); 7286 radeon_emit(ace_cs, 0); 7287 radeon_emit(ace_cs, workaround_cond_va); 7288 radeon_emit(ace_cs, workaround_cond_va >> 32); 7289 } 7290 7291 if (!view_mask) { 7292 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count, 7293 count_va, info->stride); 7294 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer); 7295 } else { 7296 u_foreach_bit (view, view_mask) { 7297 radv_emit_view_index(cmd_buffer, view); 7298 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count, 7299 count_va, info->stride); 7300 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer); 7301 } 7302 } 7303 7304 if (workaround_cond_va) { 7305 radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0)); 7306 radeon_emit(ace_cs, workaround_cond_va); 7307 radeon_emit(ace_cs, workaround_cond_va >> 32); 7308 radeon_emit(ace_cs, 0); 7309 
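      /* Exec count: one empty direct dispatch per view (emitted below), so every
       * DISPATCH_TASKMESH_GFX still has a matching ACE packet when the count was zero.
       */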
radeon_emit(ace_cs, 6 * num_views); /* Nx DISPATCH_TASKMESH_DIRECT_ACE */ 7310 7311 for (unsigned v = 0; v < num_views; ++v) { 7312 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, 0, 0, 0); 7313 } 7314 } 7315} 7316 7317static void 7318radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer, 7319 const struct radv_draw_info *info) 7320{ 7321 const struct radv_cmd_state *state = &cmd_buffer->state; 7322 struct radeon_winsys *ws = cmd_buffer->device->ws; 7323 struct radeon_cmdbuf *cs = cmd_buffer->cs; 7324 const uint64_t va = 7325 radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset; 7326 const uint64_t count_va = info->count_buffer 7327 ? radv_buffer_get_va(info->count_buffer->bo) + 7328 info->count_buffer->offset + info->count_buffer_offset 7329 : 0; 7330 7331 radv_cs_add_buffer(ws, cs, info->indirect->bo); 7332 7333 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); 7334 radeon_emit(cs, 1); 7335 radeon_emit(cs, va); 7336 radeon_emit(cs, va >> 32); 7337 7338 if (info->count_buffer) { 7339 radv_cs_add_buffer(ws, cs, info->count_buffer->bo); 7340 } 7341 7342 if (!state->subpass->view_mask) { 7343 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, 7344 info->stride); 7345 } else { 7346 u_foreach_bit(i, state->subpass->view_mask) 7347 { 7348 radv_emit_view_index(cmd_buffer, i); 7349 7350 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, 7351 info->stride); 7352 } 7353 } 7354} 7355 7356/* 7357 * Vega and raven have a bug which triggers if there are multiple context 7358 * register contexts active at the same time with different scissor values. 7359 * 7360 * There are two possible workarounds: 7361 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way 7362 * there is only ever 1 active set of scissor values at the same time. 7363 * 7364 * 2) Whenever the hardware switches contexts we have to set the scissor 7365 * registers again even if it is a noop. That way the new context gets 7366 * the correct scissor values. 7367 * 7368 * This implements option 2. radv_need_late_scissor_emission needs to 7369 * return true on affected HW if radv_emit_all_graphics_states sets 7370 * any context registers. 7371 */ 7372static bool 7373radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, 7374 const struct radv_draw_info *info) 7375{ 7376 struct radv_cmd_state *state = &cmd_buffer->state; 7377 7378 if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug) 7379 return false; 7380 7381 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer) 7382 return true; 7383 7384 uint64_t used_states = 7385 cmd_buffer->state.graphics_pipeline->needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL; 7386 7387 /* Index, vertex and streamout buffers don't change context regs, and 7388 * pipeline is already handled. 
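 * They are therefore masked out of used_states below; any remaining dirty bit
 * implies a context roll and thus requires the late scissor emission.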
    */
   used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
                    RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER |
                    RADV_CMD_DIRTY_PIPELINE);

   if (cmd_buffer->state.dirty & used_states)
      return true;

   uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);

   if (info->indexed && state->dynamic.primitive_restart_enable &&
       primitive_reset_index != state->last_primitive_reset_index)
      return true;

   return false;
}

ALWAYS_INLINE static bool
radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt, bool indirect)
{
   /* If we have to draw only a few vertices, we get better latency if
    * we disable NGG culling.
    *
    * When tessellation is used, what matters is the number of tessellated
    * vertices, so let's always assume it's not a small draw.
    */
   return !has_tess && !indirect && vtx_cnt < 128;
}

ALWAYS_INLINE static uint32_t
radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
{
   const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   /* Cull every triangle when rasterizer discard is enabled. */
   if (d->rasterizer_discard_enable ||
       G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl))
      return radv_nggc_front_face | radv_nggc_back_face;

   uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
   uint32_t nggc_settings = radv_nggc_none;

   /* The culling code needs to know whether face is CW or CCW. */
   bool ccw = (pipeline->needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE)
                 ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
                 : G_028814_FACE(pa_su_sc_mode_cntl) == 0;

   /* Take inverted viewport into account. */
   ccw ^= vp_y_inverted;

   if (ccw)
      nggc_settings |= radv_nggc_face_is_ccw;

   /* Face culling settings. */
   if ((pipeline->needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
          ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
          : G_028814_CULL_FRONT(pa_su_sc_mode_cntl))
      nggc_settings |= radv_nggc_front_face;
   if ((pipeline->needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
          ? (d->cull_mode & VK_CULL_MODE_BACK_BIT)
          : G_028814_CULL_BACK(pa_su_sc_mode_cntl))
      nggc_settings |= radv_nggc_back_face;

   /* Small primitive culling is only valid when conservative overestimation is not used. It's
    * also disabled for user sample locations because small primitive culling assumes a sample
    * position at (0.5, 0.5). */
   if (!pipeline->uses_conservative_overestimate && !pipeline->uses_user_sample_locations) {
      nggc_settings |= radv_nggc_small_primitives;

      /* small_prim_precision = num_samples / 2^subpixel_bits
       * num_samples is also always a power of two, so the small prim precision can only be
       * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
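       *
       * For example (illustrative numbers): with 4x MSAA and the 1/256 subpixel
       * granularity assumed below, util_logbase2(4) - util_logbase2(256) = 2 - 8 = -6,
       * i.e. a small prim precision of 2^-6, and only that exponent is packed into
       * bits [31:24] of the culling settings SGPR.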
7463 */ 7464 unsigned subpixel_bits = 256; 7465 int32_t small_prim_precision_log2 = util_logbase2(pipeline->ms.num_samples) - util_logbase2(subpixel_bits); 7466 nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u); 7467 } 7468 7469 return nggc_settings; 7470} 7471 7472static void 7473radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info) 7474{ 7475 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 7476 const unsigned stage = pipeline->last_vgt_api_stage; 7477 const bool nggc_supported = pipeline->has_ngg_culling; 7478 7479 if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) { 7480 /* Current shader doesn't support culling and culling was already disabled: 7481 * No further steps needed, just remember the SGPR's location is not set. 7482 */ 7483 cmd_buffer->state.last_nggc_settings_sgpr_idx = -1; 7484 return; 7485 } 7486 7487 /* Check dirty flags: 7488 * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed). 7489 * - Dirty dynamic flags: culling settings may have changed. 7490 */ 7491 const bool dirty = 7492 cmd_buffer->state.dirty & 7493 (RADV_CMD_DIRTY_PIPELINE | 7494 RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE | 7495 RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT); 7496 7497 /* Check small draw status: 7498 * For small draw calls, we disable culling by setting the SGPR to 0. 7499 */ 7500 const bool skip = 7501 radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect); 7502 7503 /* See if anything changed. */ 7504 if (!dirty && skip == cmd_buffer->state.last_nggc_skip) 7505 return; 7506 7507 /* Remember small draw state. */ 7508 cmd_buffer->state.last_nggc_skip = skip; 7509 const struct radv_shader *v = pipeline->base.shaders[stage]; 7510 assert(v->info.has_ngg_culling == nggc_supported); 7511 7512 /* Find the user SGPR. */ 7513 const uint32_t base_reg = pipeline->base.user_data_0[stage]; 7514 const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx; 7515 assert(!nggc_supported || nggc_sgpr_idx != -1); 7516 7517 /* Get viewport transform. */ 7518 float vp_scale[2], vp_translate[2]; 7519 memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float)); 7520 memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float)); 7521 bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]); 7522 7523 /* Get current culling settings. */ 7524 uint32_t nggc_settings = nggc_supported && !skip 7525 ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted) 7526 : radv_nggc_none; 7527 7528 bool emit_viewport = nggc_settings && 7529 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT || 7530 cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx || 7531 !cmd_buffer->state.last_nggc_settings); 7532 7533 if (emit_viewport) { 7534 /* Correction for inverted Y */ 7535 if (vp_y_inverted) { 7536 vp_scale[1] = -vp_scale[1]; 7537 vp_translate[1] = -vp_translate[1]; 7538 } 7539 7540 /* Correction for number of samples per pixel. 
*/ 7541 for (unsigned i = 0; i < 2; ++i) { 7542 vp_scale[i] *= (float) pipeline->ms.num_samples; 7543 vp_translate[i] *= (float) pipeline->ms.num_samples; 7544 } 7545 7546 uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])}; 7547 const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx; 7548 assert(vp_sgpr_idx != -1); 7549 radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4); 7550 radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4); 7551 } 7552 7553 bool emit_settings = nggc_supported && 7554 (cmd_buffer->state.last_nggc_settings != nggc_settings || 7555 cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx); 7556 7557 /* This needs to be emitted when culling is turned on 7558 * and when it's already on but some settings change. 7559 */ 7560 if (emit_settings) { 7561 assert(nggc_sgpr_idx >= 0); 7562 radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings); 7563 } 7564 7565 /* These only need to be emitted when culling is turned on or off, 7566 * but not when it stays on and just some settings change. 7567 */ 7568 if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) { 7569 uint32_t rsrc2 = v->config.rsrc2; 7570 7571 if (!nggc_settings) { 7572 /* Allocate less LDS when culling is disabled. (But GS always needs it.) */ 7573 if (stage != MESA_SHADER_GEOMETRY) 7574 rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling); 7575 } 7576 7577 /* When the pipeline is dirty and not yet emitted, don't write it here 7578 * because radv_emit_graphics_pipeline will overwrite this register. 7579 */ 7580 if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) || 7581 cmd_buffer->state.emitted_graphics_pipeline == pipeline) { 7582 radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); 7583 } 7584 } 7585 7586 cmd_buffer->state.last_nggc_settings = nggc_settings; 7587 cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx; 7588} 7589 7590static void 7591radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 7592 bool pipeline_is_dirty) 7593{ 7594 bool late_scissor_emission; 7595 7596 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) || 7597 cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline) 7598 radv_emit_rbplus_state(cmd_buffer); 7599 7600 if (cmd_buffer->device->physical_device->use_ngg_culling && 7601 cmd_buffer->state.graphics_pipeline->is_ngg) 7602 radv_emit_ngg_culling_state(cmd_buffer, info); 7603 7604 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) 7605 radv_emit_graphics_pipeline(cmd_buffer); 7606 7607 /* This should be before the cmd_buffer->state.dirty is cleared 7608 * (excluding RADV_CMD_DIRTY_PIPELINE) and after 7609 * cmd_buffer->state.context_roll_without_scissor_emitted is set. */ 7610 late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info); 7611 7612 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) 7613 radv_emit_framebuffer_state(cmd_buffer); 7614 7615 if (info->indexed) { 7616 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER) 7617 radv_emit_index_buffer(cmd_buffer, info->indirect); 7618 } else { 7619 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, 7620 * so the state must be re-emitted before the next indexed 7621 * draw. 
7622 */ 7623 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) { 7624 cmd_buffer->state.last_index_type = -1; 7625 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; 7626 } 7627 } 7628 7629 if (cmd_buffer->device->force_vrs != RADV_FORCE_VRS_1x1) { 7630 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; 7631 uint64_t dynamic_states = 7632 cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state; 7633 7634 if ((dynamic_states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE) && 7635 d->fragment_shading_rate.size.width == 1 && 7636 d->fragment_shading_rate.size.height == 1 && 7637 d->fragment_shading_rate.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR && 7638 d->fragment_shading_rate.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) { 7639 /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore 7640 * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic. 7641 */ 7642 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE; 7643 } 7644 } 7645 7646 radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty); 7647 7648 radv_emit_draw_registers(cmd_buffer, info); 7649 7650 if (late_scissor_emission) 7651 radv_emit_scissor(cmd_buffer); 7652} 7653 7654/* MUST inline this function to avoid massive perf loss in drawoverhead */ 7655ALWAYS_INLINE static bool 7656radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount) 7657{ 7658 const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7; 7659 const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) && 7660 cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline; 7661 7662 ASSERTED const unsigned cdw_max = 7663 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1)); 7664 7665 if (likely(!info->indirect)) { 7666 /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is 7667 * no workaround for indirect draws, but we can at least skip 7668 * direct draws. 7669 */ 7670 if (unlikely(!info->instance_count)) 7671 return false; 7672 7673 /* Handle count == 0. */ 7674 if (unlikely(!info->count && !info->strmout_buffer)) 7675 return false; 7676 } 7677 7678 /* Need to apply this workaround early as it can set flush flags. */ 7679 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) 7680 radv_emit_fb_mip_change_flush(cmd_buffer); 7681 7682 /* Use optimal packet order based on whether we need to sync the 7683 * pipeline. 7684 */ 7685 if (cmd_buffer->state.flush_bits & 7686 (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB | 7687 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { 7688 /* If we have to wait for idle, set all states first, so that 7689 * all SET packets are processed in parallel with previous draw 7690 * calls. Then upload descriptors, set shader pointers, and 7691 * draw, and prefetch at the end. This ensures that the time 7692 * the CUs are idle is very short. 
(there are only SET_SH 7693 * packets between the wait and the draw) 7694 */ 7695 radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty); 7696 si_emit_cache_flush(cmd_buffer); 7697 /* <-- CUs are idle here --> */ 7698 7699 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty); 7700 } else { 7701 /* If we don't wait for idle, start prefetches first, then set 7702 * states, and draw at the end. 7703 */ 7704 si_emit_cache_flush(cmd_buffer); 7705 7706 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) { 7707 /* Only prefetch the vertex shader and VBO descriptors 7708 * in order to start the draw as soon as possible. 7709 */ 7710 radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, true); 7711 } 7712 7713 radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty); 7714 7715 radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty); 7716 } 7717 7718 radv_describe_draw(cmd_buffer); 7719 if (likely(!info->indirect)) { 7720 struct radv_cmd_state *state = &cmd_buffer->state; 7721 struct radeon_cmdbuf *cs = cmd_buffer->cs; 7722 assert(state->graphics_pipeline->vtx_base_sgpr); 7723 if (state->last_num_instances != info->instance_count) { 7724 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false)); 7725 radeon_emit(cs, info->instance_count); 7726 state->last_num_instances = info->instance_count; 7727 } 7728 } 7729 assert(cmd_buffer->cs->cdw <= cdw_max); 7730 7731 return true; 7732} 7733 7734ALWAYS_INLINE static bool 7735radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, 7736 uint32_t drawCount) 7737{ 7738 struct radv_descriptor_state *descriptors_state = 7739 radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS); 7740 const bool pipeline_is_dirty = 7741 cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE && 7742 cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline; 7743 const bool push_dirty = descriptors_state->push_dirty; 7744 const uint32_t desc_dirty = descriptors_state->dirty; 7745 7746 const bool gfx_result = radv_before_draw(cmd_buffer, info, drawCount); 7747 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 7748 struct radv_shader *task_shader = radv_get_shader(&pipeline->base, MESA_SHADER_TASK); 7749 7750 /* If there is no task shader, no need to do anything special. */ 7751 if (!task_shader) 7752 return gfx_result; 7753 7754 /* Need to check the count even for indirect draws to work around 7755 * an issue with DISPATCH_TASKMESH_INDIRECT_MULTI_ACE. 7756 */ 7757 if (!info->count || !gfx_result) 7758 return false; 7759 7760 const bool need_task_semaphore = radv_flush_gfx2ace_semaphore(cmd_buffer); 7761 struct radv_physical_device *pdevice = cmd_buffer->device->physical_device; 7762 struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs; 7763 struct radeon_winsys *ws = cmd_buffer->device->ws; 7764 7765 assert(ace_cs); 7766 ASSERTED const unsigned ace_cdw_max = 7767 radeon_check_space(ws, ace_cs, 4096 + 128 * (drawCount - 1)); 7768 7769 if (need_task_semaphore) 7770 radv_wait_gfx2ace_semaphore(cmd_buffer); 7771 7772 if (pipeline_is_dirty) { 7773 radv_pipeline_emit_hw_cs(pdevice, ace_cs, task_shader); 7774 radv_pipeline_emit_compute_state(pdevice, ace_cs, task_shader); 7775 } 7776 7777 radv_ace_internal_cache_flush(cmd_buffer); 7778 7779 /* Restore dirty state of descriptors 7780 * They were marked non-dirty in radv_before_draw, 7781 * but they need to be re-emitted now to the ACE cmdbuf. 
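    * (radv_before_draw flushed them for the graphics stages, which cleared the
    * dirty bits; restoring the bits lets the flushes below emit the same
    * descriptors and push constants again for the task shader on the ACE cs.)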
7782 */ 7783 descriptors_state->push_dirty = push_dirty; 7784 descriptors_state->dirty = desc_dirty; 7785 7786 /* Flush descriptors and push constants for task shaders. */ 7787 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base, 7788 VK_PIPELINE_BIND_POINT_GRAPHICS); 7789 radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base, 7790 VK_PIPELINE_BIND_POINT_GRAPHICS); 7791 7792 assert(ace_cs->cdw <= ace_cdw_max); 7793 return true; 7794} 7795 7796static void 7797radv_after_draw(struct radv_cmd_buffer *cmd_buffer) 7798{ 7799 const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info; 7800 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7; 7801 /* Start prefetches after the draw has been started. Both will 7802 * run in parallel, but starting the draw first is more 7803 * important. 7804 */ 7805 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) { 7806 radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, false); 7807 } 7808 7809 /* Workaround for a VGT hang when streamout is enabled. 7810 * It must be done after drawing. 7811 */ 7812 if (radv_is_streamout_enabled(cmd_buffer) && 7813 (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA || 7814 rad_info->family == CHIP_FIJI)) { 7815 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC; 7816 } 7817 7818 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH); 7819} 7820 7821static struct radv_buffer 7822radv_nv_mesh_indirect_bo(struct radv_cmd_buffer *cmd_buffer, 7823 struct radv_buffer *buffer, VkDeviceSize offset, 7824 uint32_t draw_count, uint32_t stride) 7825{ 7826 /* Translates the indirect BO format used by NV_mesh_shader API 7827 * to the BO format used by DRAW_INDIRECT / DRAW_INDIRECT_MULTI. 7828 */ 7829 7830 struct radeon_cmdbuf *cs = cmd_buffer->cs; 7831 struct radeon_winsys *ws = cmd_buffer->device->ws; 7832 7833 const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV)); 7834 const size_t dst_stride = sizeof(VkDrawIndirectCommand); 7835 const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount); 7836 const size_t src_off_first_task = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask); 7837 const size_t dst_off_vertex_count = offsetof(VkDrawIndirectCommand, vertexCount); 7838 const size_t dst_off_first_vertex = offsetof(VkDrawIndirectCommand, firstVertex); 7839 7840 /* Fill the buffer with all zeroes except instanceCount = 1. 7841 * This helps emit fewer copy packets below. 7842 */ 7843 VkDrawIndirectCommand *fill_data = (VkDrawIndirectCommand *) alloca(dst_stride * draw_count); 7844 const VkDrawIndirectCommand filler = { .instanceCount = 1 }; 7845 for (unsigned i = 0; i < draw_count; ++i) 7846 fill_data[i] = filler; 7847 7848 /* We'll have to copy data from the API BO. */ 7849 uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset; 7850 radv_cs_add_buffer(ws, cs, buffer->bo); 7851 7852 /* Allocate some space in the upload BO. 
*/ 7853 unsigned out_offset; 7854 radv_cmd_buffer_upload_data(cmd_buffer, dst_stride * draw_count, fill_data, &out_offset); 7855 const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset; 7856 7857 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 12 * draw_count + 2); 7858 7859 /* Copy data from the API BO so that the format is suitable for the 7860 * indirect draw packet: 7861 * - vertexCount = taskCount (copied here) 7862 * - instanceCount = 1 (filled by CPU above) 7863 * - firstVertex = firstTask (copied here) 7864 * - firstInstance = 0 (filled by CPU above) 7865 */ 7866 for (unsigned i = 0; i < draw_count; ++i) { 7867 const uint64_t src_task_count = va + i * src_stride + src_off_task_count; 7868 const uint64_t src_first_task = va + i * src_stride + src_off_first_task; 7869 const uint64_t dst_vertex_count = new_va + i * dst_stride + dst_off_vertex_count; 7870 const uint64_t dst_first_vertex = new_va + i * dst_stride + dst_off_first_vertex; 7871 7872 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); 7873 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 7874 COPY_DATA_WR_CONFIRM); 7875 radeon_emit(cs, src_task_count); 7876 radeon_emit(cs, src_task_count >> 32); 7877 radeon_emit(cs, dst_vertex_count); 7878 radeon_emit(cs, dst_vertex_count >> 32); 7879 7880 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); 7881 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 7882 COPY_DATA_WR_CONFIRM); 7883 radeon_emit(cs, src_first_task); 7884 radeon_emit(cs, src_first_task >> 32); 7885 radeon_emit(cs, dst_first_vertex); 7886 radeon_emit(cs, dst_first_vertex >> 32); 7887 } 7888 7889 /* Wait for the copies to finish */ 7890 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); 7891 radeon_emit(cs, 0); 7892 7893 /* The draw packet can now use this buffer: */ 7894 struct radv_buffer buf = *buffer; 7895 buf.bo = cmd_buffer->upload.upload_bo; 7896 buf.offset = out_offset; 7897 7898 assert(cmd_buffer->cs->cdw <= cdw_max); 7899 7900 return buf; 7901} 7902 7903static struct radv_buffer 7904radv_nv_task_indirect_bo(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buffer, 7905 VkDeviceSize offset, uint32_t draw_count, uint32_t stride) 7906{ 7907 /* Translates the indirect BO format used by NV_mesh_shader API 7908 * to the BO format used by DISPATCH_TASKMESH_INDIRECT_MULTI_ACE. 7909 */ 7910 7911 assert(draw_count); 7912 static_assert(sizeof(VkDispatchIndirectCommand) == 12, "Incorrect size of taskmesh command."); 7913 7914 struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs; 7915 struct radeon_winsys *ws = cmd_buffer->device->ws; 7916 7917 const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV)); 7918 const size_t dst_stride = sizeof(VkDispatchIndirectCommand); 7919 const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount); 7920 const size_t dst_off_x = offsetof(VkDispatchIndirectCommand, x); 7921 7922 const unsigned new_disp_size = dst_stride * draw_count; 7923 7924 const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset; 7925 radv_cs_add_buffer(ws, cs, buffer->bo); 7926 7927 /* Fill the buffer with X=0, Y=1, Z=1. 
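    * Entries that are never overwritten by the copies below (e.g. because the
    * real draw count read from an indirect count buffer is smaller) then
    * dispatch zero task workgroups, which is harmless.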
*/ 7928 VkDispatchIndirectCommand *fill_data = (VkDispatchIndirectCommand *)alloca(new_disp_size); 7929 for (unsigned i = 0; i < draw_count; ++i) { 7930 fill_data[i].x = 0; 7931 fill_data[i].y = 1; 7932 fill_data[i].z = 1; 7933 } 7934 7935 /* Allocate space in the upload BO. */ 7936 unsigned out_offset; 7937 ASSERTED bool uploaded = 7938 radv_cmd_buffer_upload_data(cmd_buffer, new_disp_size, fill_data, &out_offset); 7939 const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset; 7940 assert(uploaded); 7941 7942 /* Clamp draw count to fit the actual size of the buffer. 7943 * This is to avoid potential out of bounds copies (eg. for draws with an indirect count buffer). 7944 * The remaining indirect draws will stay filled with X=0, Y=1, Z=1 which is harmless. 7945 */ 7946 draw_count = MIN2(draw_count, (buffer->vk.size - buffer->offset - offset) / src_stride); 7947 7948 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 6 * draw_count + 2); 7949 7950 /* Copy taskCount from the NV API BO to the X dispatch size of the compatible BO. */ 7951 for (unsigned i = 0; i < draw_count; ++i) { 7952 const uint64_t src_task_count = va + i * src_stride + src_off_task_count; 7953 const uint64_t dst_x = new_va + i * dst_stride + dst_off_x; 7954 7955 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); 7956 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 7957 COPY_DATA_WR_CONFIRM); 7958 radeon_emit(cs, src_task_count); 7959 radeon_emit(cs, src_task_count >> 32); 7960 radeon_emit(cs, dst_x); 7961 radeon_emit(cs, dst_x >> 32); 7962 } 7963 7964 assert(cs->cdw <= cdw_max); 7965 7966 /* The draw packet can now use this buffer: */ 7967 struct radv_buffer buf = *buffer; 7968 buf.bo = cmd_buffer->upload.upload_bo; 7969 buf.offset = out_offset; 7970 7971 return buf; 7972} 7973 7974VKAPI_ATTR void VKAPI_CALL 7975radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, 7976 uint32_t firstVertex, uint32_t firstInstance) 7977{ 7978 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 7979 struct radv_draw_info info; 7980 7981 info.count = vertexCount; 7982 info.instance_count = instanceCount; 7983 info.first_instance = firstInstance; 7984 info.strmout_buffer = NULL; 7985 info.indirect = NULL; 7986 info.indexed = false; 7987 7988 if (!radv_before_draw(cmd_buffer, &info, 1)) 7989 return; 7990 const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount }; 7991 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0); 7992 radv_after_draw(cmd_buffer); 7993} 7994 7995VKAPI_ATTR void VKAPI_CALL 7996radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo, 7997 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride) 7998{ 7999 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8000 struct radv_draw_info info; 8001 8002 if (!drawCount) 8003 return; 8004 8005 info.count = pVertexInfo->vertexCount; 8006 info.instance_count = instanceCount; 8007 info.first_instance = firstInstance; 8008 info.strmout_buffer = NULL; 8009 info.indirect = NULL; 8010 info.indexed = false; 8011 8012 if (!radv_before_draw(cmd_buffer, &info, drawCount)) 8013 return; 8014 radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride); 8015 radv_after_draw(cmd_buffer); 8016} 8017 8018VKAPI_ATTR void VKAPI_CALL 8019radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, 8020 uint32_t 
firstIndex, int32_t vertexOffset, uint32_t firstInstance) 8021{ 8022 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8023 struct radv_draw_info info; 8024 8025 info.indexed = true; 8026 info.count = indexCount; 8027 info.instance_count = instanceCount; 8028 info.first_instance = firstInstance; 8029 info.strmout_buffer = NULL; 8030 info.indirect = NULL; 8031 8032 if (!radv_before_draw(cmd_buffer, &info, 1)) 8033 return; 8034 const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset }; 8035 radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL); 8036 radv_after_draw(cmd_buffer); 8037} 8038 8039VKAPI_ATTR void VKAPI_CALL 8040radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo, 8041 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset) 8042{ 8043 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8044 struct radv_draw_info info; 8045 8046 if (!drawCount) 8047 return; 8048 8049 const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo; 8050 info.indexed = true; 8051 info.count = minfo->indexCount; 8052 info.instance_count = instanceCount; 8053 info.first_instance = firstInstance; 8054 info.strmout_buffer = NULL; 8055 info.indirect = NULL; 8056 8057 if (!radv_before_draw(cmd_buffer, &info, drawCount)) 8058 return; 8059 radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset); 8060 radv_after_draw(cmd_buffer); 8061} 8062 8063VKAPI_ATTR void VKAPI_CALL 8064radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 8065 uint32_t drawCount, uint32_t stride) 8066{ 8067 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8068 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 8069 struct radv_draw_info info; 8070 8071 info.count = drawCount; 8072 info.indirect = buffer; 8073 info.indirect_offset = offset; 8074 info.stride = stride; 8075 info.strmout_buffer = NULL; 8076 info.count_buffer = NULL; 8077 info.indexed = false; 8078 info.instance_count = 0; 8079 8080 if (!radv_before_draw(cmd_buffer, &info, 1)) 8081 return; 8082 radv_emit_indirect_draw_packets(cmd_buffer, &info); 8083 radv_after_draw(cmd_buffer); 8084} 8085 8086VKAPI_ATTR void VKAPI_CALL 8087radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 8088 uint32_t drawCount, uint32_t stride) 8089{ 8090 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8091 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 8092 struct radv_draw_info info; 8093 8094 info.indexed = true; 8095 info.count = drawCount; 8096 info.indirect = buffer; 8097 info.indirect_offset = offset; 8098 info.stride = stride; 8099 info.count_buffer = NULL; 8100 info.strmout_buffer = NULL; 8101 info.instance_count = 0; 8102 8103 if (!radv_before_draw(cmd_buffer, &info, 1)) 8104 return; 8105 radv_emit_indirect_draw_packets(cmd_buffer, &info); 8106 radv_after_draw(cmd_buffer); 8107} 8108 8109VKAPI_ATTR void VKAPI_CALL 8110radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, 8111 VkBuffer _countBuffer, VkDeviceSize countBufferOffset, 8112 uint32_t maxDrawCount, uint32_t stride) 8113{ 8114 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8115 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 8116 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 8117 struct radv_draw_info info; 8118 8119 info.count = maxDrawCount; 8120 info.indirect = 
buffer; 8121 info.indirect_offset = offset; 8122 info.count_buffer = count_buffer; 8123 info.count_buffer_offset = countBufferOffset; 8124 info.stride = stride; 8125 info.strmout_buffer = NULL; 8126 info.indexed = false; 8127 info.instance_count = 0; 8128 8129 if (!radv_before_draw(cmd_buffer, &info, 1)) 8130 return; 8131 radv_emit_indirect_draw_packets(cmd_buffer, &info); 8132 radv_after_draw(cmd_buffer); 8133} 8134 8135VKAPI_ATTR void VKAPI_CALL 8136radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, 8137 VkDeviceSize offset, VkBuffer _countBuffer, 8138 VkDeviceSize countBufferOffset, uint32_t maxDrawCount, 8139 uint32_t stride) 8140{ 8141 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8142 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 8143 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 8144 struct radv_draw_info info; 8145 8146 info.indexed = true; 8147 info.count = maxDrawCount; 8148 info.indirect = buffer; 8149 info.indirect_offset = offset; 8150 info.count_buffer = count_buffer; 8151 info.count_buffer_offset = countBufferOffset; 8152 info.stride = stride; 8153 info.strmout_buffer = NULL; 8154 info.instance_count = 0; 8155 8156 if (!radv_before_draw(cmd_buffer, &info, 1)) 8157 return; 8158 radv_emit_indirect_draw_packets(cmd_buffer, &info); 8159 radv_after_draw(cmd_buffer); 8160} 8161 8162VKAPI_ATTR void VKAPI_CALL 8163radv_CmdDrawMeshTasksNV(VkCommandBuffer commandBuffer, uint32_t taskCount, uint32_t firstTask) 8164{ 8165 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8166 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 8167 struct radv_draw_info info; 8168 8169 info.count = taskCount; 8170 info.instance_count = 1; 8171 info.first_instance = 0; 8172 info.stride = 0; 8173 info.indexed = false; 8174 info.strmout_buffer = NULL; 8175 info.count_buffer = NULL; 8176 info.indirect = NULL; 8177 8178 if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1)) 8179 return; 8180 8181 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) { 8182 radv_emit_direct_taskmesh_draw_packets(cmd_buffer, taskCount, 1, 1, firstTask); 8183 } else { 8184 radv_emit_direct_mesh_draw_packet(cmd_buffer, taskCount, 1, 1, firstTask); 8185 } 8186 8187 radv_after_draw(cmd_buffer); 8188} 8189 8190VKAPI_ATTR void VKAPI_CALL 8191radv_CmdDrawMeshTasksIndirectNV(VkCommandBuffer commandBuffer, VkBuffer _buffer, 8192 VkDeviceSize offset, uint32_t drawCount, uint32_t stride) 8193{ 8194 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8195 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 8196 8197 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 8198 struct radv_draw_info info; 8199 8200 info.indirect = buffer; 8201 info.indirect_offset = offset; 8202 info.stride = stride; 8203 info.count = drawCount; 8204 info.strmout_buffer = NULL; 8205 info.count_buffer = NULL; 8206 info.indexed = false; 8207 info.instance_count = 0; 8208 8209 if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount)) 8210 return; 8211 8212 /* Indirect draw with mesh shader only: 8213 * Use DRAW_INDIRECT / DRAW_INDIRECT_MULTI like normal indirect draws. 8214 * Needed because DISPATCH_MESH_INDIRECT_MULTI doesn't support firstTask. 8215 * 8216 * Indirect draw with task + mesh shaders: 8217 * Use DISPATCH_TASKMESH_INDIRECT_MULTI_ACE + DISPATCH_TASKMESH_GFX. 8218 * These packets don't support firstTask so we implement that by 8219 * reading the NV command's indirect buffer in the shader. 
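    *
    * For example (illustrative values): a VkDrawMeshTasksIndirectCommandNV of
    * { taskCount = T, firstTask = F } becomes
    * { vertexCount = T, instanceCount = 1, firstVertex = F, firstInstance = 0 }
    * on the mesh-only path, and { x = T, y = 1, z = 1 } on the task+mesh path
    * (firstTask is then read from the original NV buffer instead).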
8220 * 8221 * The indirect BO layout from the NV_mesh_shader API is incompatible 8222 * with AMD HW. To make it work, we allocate some space 8223 * in the upload buffer and copy the data to it. 8224 */ 8225 8226 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) { 8227 uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset; 8228 uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV)); 8229 struct radv_buffer buf = 8230 radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride); 8231 info.indirect = &buf; 8232 info.indirect_offset = 0; 8233 info.stride = sizeof(VkDispatchIndirectCommand); 8234 8235 radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride); 8236 } else { 8237 struct radv_buffer buf = 8238 radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride); 8239 info.indirect = &buf; 8240 info.indirect_offset = 0; 8241 info.stride = sizeof(VkDrawIndirectCommand); 8242 8243 radv_emit_indirect_draw_packets(cmd_buffer, &info); 8244 } 8245 8246 radv_after_draw(cmd_buffer); 8247} 8248 8249VKAPI_ATTR void VKAPI_CALL 8250radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _buffer, 8251 VkDeviceSize offset, VkBuffer _countBuffer, 8252 VkDeviceSize countBufferOffset, uint32_t maxDrawCount, 8253 uint32_t stride) 8254{ 8255 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8256 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 8257 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer); 8258 8259 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 8260 struct radv_draw_info info; 8261 8262 info.indirect = buffer; 8263 info.indirect_offset = offset; 8264 info.stride = stride; 8265 info.count = maxDrawCount; 8266 info.strmout_buffer = NULL; 8267 info.count_buffer = count_buffer; 8268 info.count_buffer_offset = countBufferOffset; 8269 info.indexed = false; 8270 info.instance_count = 0; 8271 8272 if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount)) 8273 return; 8274 8275 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) { 8276 uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset; 8277 uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV)); 8278 struct radv_buffer buf = 8279 radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride); 8280 info.indirect = &buf; 8281 info.indirect_offset = 0; 8282 info.stride = sizeof(VkDispatchIndirectCommand); 8283 8284 radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride); 8285 } else { 8286 struct radv_buffer buf = 8287 radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride); 8288 info.indirect = &buf; 8289 info.indirect_offset = 0; 8290 info.stride = sizeof(VkDrawIndirectCommand); 8291 8292 radv_emit_indirect_draw_packets(cmd_buffer, &info); 8293 } 8294 8295 radv_after_draw(cmd_buffer); 8296} 8297 8298void 8299radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed, 8300 const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) 8301{ 8302 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8303 VK_FROM_HANDLE(radv_indirect_command_layout, layout, 8304 pGeneratedCommandsInfo->indirectCommandsLayout); 8305 VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer); 8306 8307 /* The only actions that can be done are draws, so skip on other queues. 
*/ 8308 if (cmd_buffer->qf != RADV_QUEUE_GENERAL) 8309 return; 8310 8311 /* Secondary command buffers are needed for the full extension but can't use 8312 * PKT3_INDIRECT_BUFFER_CIK. 8313 */ 8314 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); 8315 8316 radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo); 8317 8318 struct radv_draw_info info; 8319 8320 info.count = pGeneratedCommandsInfo->sequencesCount; 8321 info.indirect = prep_buffer; /* We're not really going use it this way, but a good signal 8322 that this is not direct. */ 8323 info.indirect_offset = 0; 8324 info.stride = 0; 8325 info.strmout_buffer = NULL; 8326 info.count_buffer = NULL; 8327 info.indexed = layout->indexed; 8328 info.instance_count = 0; 8329 8330 if (!radv_before_draw(cmd_buffer, &info, 1)) 8331 return; 8332 8333 uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo); 8334 uint64_t va = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + 8335 pGeneratedCommandsInfo->preprocessOffset; 8336 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask; 8337 8338 radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating)); 8339 radeon_emit(cmd_buffer->cs, 0); 8340 8341 if (!view_mask) { 8342 radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); 8343 radeon_emit(cmd_buffer->cs, va); 8344 radeon_emit(cmd_buffer->cs, va >> 32); 8345 radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2); 8346 } else { 8347 u_foreach_bit (view, view_mask) { 8348 radv_emit_view_index(cmd_buffer, view); 8349 8350 radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); 8351 radeon_emit(cmd_buffer->cs, va); 8352 radeon_emit(cmd_buffer->cs, va >> 32); 8353 radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2); 8354 } 8355 } 8356 8357 if (layout->binds_index_buffer) { 8358 cmd_buffer->state.last_index_type = -1; 8359 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER; 8360 } 8361 8362 if (layout->bind_vbo_mask) 8363 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER; 8364 8365 if (layout->binds_state) 8366 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE; 8367 8368 cmd_buffer->push_constant_stages |= ~0; 8369 8370 cmd_buffer->state.last_index_type = -1; 8371 cmd_buffer->state.last_num_instances = -1; 8372 cmd_buffer->state.last_vertex_offset = -1; 8373 cmd_buffer->state.last_first_instance = -1; 8374 cmd_buffer->state.last_drawid = -1; 8375 8376 radv_after_draw(cmd_buffer); 8377} 8378 8379struct radv_dispatch_info { 8380 /** 8381 * Determine the layout of the grid (in block units) to be used. 8382 */ 8383 uint32_t blocks[3]; 8384 8385 /** 8386 * A starting offset for the grid. If unaligned is set, the offset 8387 * must still be aligned. 8388 */ 8389 uint32_t offsets[3]; 8390 /** 8391 * Whether it's an unaligned compute dispatch. 8392 */ 8393 bool unaligned; 8394 8395 /** 8396 * Indirect compute parameters resource. 
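    * (When va is non-zero, the dispatch size is read at dispatch time from
    * three consecutive 32-bit values at that address instead of from blocks[].)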
8397 */ 8398 struct radeon_winsys_bo *indirect; 8399 uint64_t va; 8400}; 8401 8402static void 8403radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, 8404 struct radv_compute_pipeline *pipeline, 8405 const struct radv_dispatch_info *info) 8406{ 8407 struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE]; 8408 unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator; 8409 struct radeon_winsys *ws = cmd_buffer->device->ws; 8410 bool predicating = cmd_buffer->state.predicating; 8411 struct radeon_cmdbuf *cs = cmd_buffer->cs; 8412 struct radv_userdata_info *loc; 8413 8414 radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]); 8415 8416 loc = radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE); 8417 8418 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30); 8419 8420 if (compute_shader->info.wave_size == 32) { 8421 assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10); 8422 dispatch_initiator |= S_00B800_CS_W32_EN(1); 8423 } 8424 8425 if (info->va) { 8426 if (info->indirect) 8427 radv_cs_add_buffer(ws, cs, info->indirect); 8428 8429 if (info->unaligned) { 8430 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); 8431 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0])); 8432 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1])); 8433 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2])); 8434 8435 dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1); 8436 } 8437 8438 if (loc->sgpr_idx != -1) { 8439 unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4; 8440 8441 if (cmd_buffer->device->load_grid_size_from_user_sgpr) { 8442 assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3); 8443 radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0)); 8444 radeon_emit(cs, info->va); 8445 radeon_emit(cs, info->va >> 32); 8446 radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); 8447 radeon_emit(cs, 3); 8448 } else { 8449 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true); 8450 } 8451 } 8452 8453 if (radv_cmd_buffer_uses_mec(cmd_buffer)) { 8454 radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va, 8455 &cmd_buffer->mec_inv_pred_emitted, 8456 4 /* DISPATCH_INDIRECT size */); 8457 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1)); 8458 radeon_emit(cs, info->va); 8459 radeon_emit(cs, info->va >> 32); 8460 radeon_emit(cs, dispatch_initiator); 8461 } else { 8462 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); 8463 radeon_emit(cs, 1); 8464 radeon_emit(cs, info->va); 8465 radeon_emit(cs, info->va >> 32); 8466 8467 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1)); 8468 radeon_emit(cs, 0); 8469 radeon_emit(cs, dispatch_initiator); 8470 } 8471 } else { 8472 unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]}; 8473 unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]}; 8474 8475 if (info->unaligned) { 8476 unsigned *cs_block_size = compute_shader->info.cs.block_size; 8477 unsigned remainder[3]; 8478 8479 /* If aligned, these should be an entire block size, 8480 * not 0. 
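          * e.g. with blocks[0] = 100 threads and cs_block_size[0] = 64:
          * 100 + 64 - align_u32_npot(100, 64) = 100 + 64 - 128 = 36 partial threads,
          * while blocks[0] itself is rounded up to 2 workgroups below.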
8481 */ 8482 remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]); 8483 remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]); 8484 remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]); 8485 8486 blocks[0] = round_up_u32(blocks[0], cs_block_size[0]); 8487 blocks[1] = round_up_u32(blocks[1], cs_block_size[1]); 8488 blocks[2] = round_up_u32(blocks[2], cs_block_size[2]); 8489 8490 for (unsigned i = 0; i < 3; ++i) { 8491 assert(offsets[i] % cs_block_size[i] == 0); 8492 offsets[i] /= cs_block_size[i]; 8493 } 8494 8495 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); 8496 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) | 8497 S_00B81C_NUM_THREAD_PARTIAL(remainder[0])); 8498 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) | 8499 S_00B81C_NUM_THREAD_PARTIAL(remainder[1])); 8500 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) | 8501 S_00B81C_NUM_THREAD_PARTIAL(remainder[2])); 8502 8503 dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); 8504 } 8505 8506 if (loc->sgpr_idx != -1) { 8507 if (cmd_buffer->device->load_grid_size_from_user_sgpr) { 8508 assert(loc->num_sgprs == 3); 8509 8510 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); 8511 radeon_emit(cs, blocks[0]); 8512 radeon_emit(cs, blocks[1]); 8513 radeon_emit(cs, blocks[2]); 8514 } else { 8515 uint32_t offset; 8516 if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset)) 8517 return; 8518 8519 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; 8520 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, 8521 R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true); 8522 } 8523 } 8524 8525 if (offsets[0] || offsets[1] || offsets[2]) { 8526 radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3); 8527 radeon_emit(cs, offsets[0]); 8528 radeon_emit(cs, offsets[1]); 8529 radeon_emit(cs, offsets[2]); 8530 8531 /* The blocks in the packet are not counts but end values. */ 8532 for (unsigned i = 0; i < 3; ++i) 8533 blocks[i] += offsets[i]; 8534 } else { 8535 dispatch_initiator |= S_00B800_FORCE_START_AT_000(1); 8536 } 8537 8538 if (radv_cmd_buffer_uses_mec(cmd_buffer)) { 8539 radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va, 8540 &cmd_buffer->mec_inv_pred_emitted, 8541 5 /* DISPATCH_DIRECT size */); 8542 predicating = false; 8543 } 8544 8545 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1)); 8546 radeon_emit(cs, blocks[0]); 8547 radeon_emit(cs, blocks[1]); 8548 radeon_emit(cs, blocks[2]); 8549 radeon_emit(cs, dispatch_initiator); 8550 } 8551 8552 assert(cmd_buffer->cs->cdw <= cdw_max); 8553} 8554 8555static void 8556radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, 8557 struct radv_compute_pipeline *pipeline, 8558 VkPipelineBindPoint bind_point) 8559{ 8560 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, &pipeline->base, bind_point); 8561 radv_flush_constants(cmd_buffer, 8562 bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR 8563 ? 
RADV_RT_STAGE_BITS 8564 : VK_SHADER_STAGE_COMPUTE_BIT, 8565 &pipeline->base, bind_point); 8566} 8567 8568static void 8569radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info, 8570 struct radv_compute_pipeline *pipeline, VkPipelineBindPoint bind_point) 8571{ 8572 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7; 8573 bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline; 8574 8575 if (pipeline->cs_regalloc_hang_bug) 8576 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | 8577 RADV_CMD_FLAG_CS_PARTIAL_FLUSH; 8578 8579 if (cmd_buffer->state.flush_bits & 8580 (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB | 8581 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) { 8582 /* If we have to wait for idle, set all states first, so that 8583 * all SET packets are processed in parallel with previous draw 8584 * calls. Then upload descriptors, set shader pointers, and 8585 * dispatch, and prefetch at the end. This ensures that the 8586 * time the CUs are idle is very short. (there are only SET_SH 8587 * packets between the wait and the draw) 8588 */ 8589 radv_emit_compute_pipeline(cmd_buffer, pipeline); 8590 si_emit_cache_flush(cmd_buffer); 8591 /* <-- CUs are idle here --> */ 8592 8593 radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point); 8594 8595 radv_emit_dispatch_packets(cmd_buffer, pipeline, info); 8596 /* <-- CUs are busy here --> */ 8597 8598 /* Start prefetches after the dispatch has been started. Both 8599 * will run in parallel, but starting the dispatch first is 8600 * more important. 8601 */ 8602 if (has_prefetch && pipeline_is_dirty) { 8603 radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]); 8604 } 8605 } else { 8606 /* If we don't wait for idle, start prefetches first, then set 8607 * states, and dispatch at the end. 8608 */ 8609 si_emit_cache_flush(cmd_buffer); 8610 8611 if (has_prefetch && pipeline_is_dirty) { 8612 radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]); 8613 } 8614 8615 radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point); 8616 8617 radv_emit_compute_pipeline(cmd_buffer, pipeline); 8618 radv_emit_dispatch_packets(cmd_buffer, pipeline, info); 8619 } 8620 8621 if (pipeline_is_dirty) { 8622 /* Raytracing uses compute shaders but has separate bind points and pipelines. 8623 * So if we set compute userdata & shader registers we should dirty the raytracing 8624 * ones and the other way around. 8625 * 8626 * We only need to do this when the pipeline is dirty because when we switch between 8627 * the two we always need to switch pipelines. 8628 */ 8629 radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE 8630 ? 
VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR 8631 : VK_PIPELINE_BIND_POINT_COMPUTE); 8632 } 8633 8634 if (pipeline->cs_regalloc_hang_bug) 8635 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; 8636 8637 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH); 8638} 8639 8640static void 8641radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info) 8642{ 8643 radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, 8644 VK_PIPELINE_BIND_POINT_COMPUTE); 8645} 8646 8647VKAPI_ATTR void VKAPI_CALL 8648radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y, 8649 uint32_t base_z, uint32_t x, uint32_t y, uint32_t z) 8650{ 8651 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8652 struct radv_dispatch_info info = {0}; 8653 8654 info.blocks[0] = x; 8655 info.blocks[1] = y; 8656 info.blocks[2] = z; 8657 8658 info.offsets[0] = base_x; 8659 info.offsets[1] = base_y; 8660 info.offsets[2] = base_z; 8661 radv_compute_dispatch(cmd_buffer, &info); 8662} 8663 8664VKAPI_ATTR void VKAPI_CALL 8665radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) 8666{ 8667 radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z); 8668} 8669 8670VKAPI_ATTR void VKAPI_CALL 8671radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset) 8672{ 8673 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8674 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); 8675 struct radv_dispatch_info info = {0}; 8676 8677 info.indirect = buffer->bo; 8678 info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset; 8679 8680 radv_compute_dispatch(cmd_buffer, &info); 8681} 8682 8683void 8684radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z) 8685{ 8686 struct radv_dispatch_info info = {0}; 8687 8688 info.blocks[0] = x; 8689 info.blocks[1] = y; 8690 info.blocks[2] = z; 8691 info.unaligned = 1; 8692 8693 radv_compute_dispatch(cmd_buffer, &info); 8694} 8695 8696void 8697radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va) 8698{ 8699 struct radv_dispatch_info info = {0}; 8700 8701 info.indirect = bo; 8702 info.va = va; 8703 8704 radv_compute_dispatch(cmd_buffer, &info); 8705} 8706 8707enum radv_rt_mode { 8708 radv_rt_mode_direct, 8709 radv_rt_mode_indirect, 8710 radv_rt_mode_indirect2, 8711}; 8712 8713static void 8714radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *tables, 8715 uint64_t indirect_va, enum radv_rt_mode mode) 8716{ 8717 struct radv_compute_pipeline *pipeline = cmd_buffer->state.rt_pipeline; 8718 uint32_t base_reg = pipeline->base.user_data_0[MESA_SHADER_COMPUTE]; 8719 8720 struct radv_dispatch_info info = {0}; 8721 info.unaligned = true; 8722 8723 uint64_t launch_size_va; 8724 uint64_t sbt_va; 8725 8726 if (mode != radv_rt_mode_indirect2) { 8727 uint32_t upload_size = mode == radv_rt_mode_direct 8728 ? sizeof(VkTraceRaysIndirectCommand2KHR) 8729 : offsetof(VkTraceRaysIndirectCommand2KHR, width); 8730 8731 uint32_t offset; 8732 if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset)) 8733 return; 8734 8735 uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; 8736 8737 launch_size_va = (mode == radv_rt_mode_direct) 8738 ? 
upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width) 8739 : indirect_va; 8740 sbt_va = upload_va; 8741 } else { 8742 launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width); 8743 sbt_va = indirect_va; 8744 } 8745 8746 if (mode == radv_rt_mode_direct) { 8747 info.blocks[0] = tables->width; 8748 info.blocks[1] = tables->height; 8749 info.blocks[2] = tables->depth; 8750 } else 8751 info.va = launch_size_va; 8752 8753 struct radv_userdata_info *desc_loc = 8754 radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS); 8755 if (desc_loc->sgpr_idx != -1) { 8756 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, 8757 base_reg + desc_loc->sgpr_idx * 4, sbt_va, true); 8758 } 8759 8760 struct radv_userdata_info *size_loc = 8761 radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR); 8762 if (size_loc->sgpr_idx != -1) { 8763 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, 8764 base_reg + size_loc->sgpr_idx * 4, launch_size_va, true); 8765 } 8766 8767 radv_dispatch(cmd_buffer, &info, pipeline, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); 8768} 8769 8770VKAPI_ATTR void VKAPI_CALL 8771radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer, 8772 const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable, 8773 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable, 8774 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable, 8775 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, 8776 uint32_t width, uint32_t height, uint32_t depth) 8777{ 8778 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8779 8780 VkTraceRaysIndirectCommand2KHR tables = { 8781 .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress, 8782 .raygenShaderRecordSize = pRaygenShaderBindingTable->size, 8783 .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress, 8784 .missShaderBindingTableSize = pMissShaderBindingTable->size, 8785 .missShaderBindingTableStride = pMissShaderBindingTable->stride, 8786 .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress, 8787 .hitShaderBindingTableSize = pHitShaderBindingTable->size, 8788 .hitShaderBindingTableStride = pHitShaderBindingTable->stride, 8789 .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress, 8790 .callableShaderBindingTableSize = pCallableShaderBindingTable->size, 8791 .callableShaderBindingTableStride = pCallableShaderBindingTable->stride, 8792 .width = width, 8793 .height = height, 8794 .depth = depth, 8795 }; 8796 8797 radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct); 8798} 8799 8800VKAPI_ATTR void VKAPI_CALL 8801radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer, 8802 const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable, 8803 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable, 8804 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable, 8805 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, 8806 VkDeviceAddress indirectDeviceAddress) 8807{ 8808 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8809 8810 assert(cmd_buffer->device->use_global_bo_list); 8811 8812 VkTraceRaysIndirectCommand2KHR tables = { 8813 .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress, 8814 .raygenShaderRecordSize = pRaygenShaderBindingTable->size, 8815 .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress, 8816 .missShaderBindingTableSize = 
pMissShaderBindingTable->size, 8817 .missShaderBindingTableStride = pMissShaderBindingTable->stride, 8818 .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress, 8819 .hitShaderBindingTableSize = pHitShaderBindingTable->size, 8820 .hitShaderBindingTableStride = pHitShaderBindingTable->stride, 8821 .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress, 8822 .callableShaderBindingTableSize = pCallableShaderBindingTable->size, 8823 .callableShaderBindingTableStride = pCallableShaderBindingTable->stride, 8824 }; 8825 8826 radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect); 8827} 8828 8829VKAPI_ATTR void VKAPI_CALL 8830radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress) 8831{ 8832 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8833 8834 assert(cmd_buffer->device->use_global_bo_list); 8835 8836 radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2); 8837} 8838 8839static void 8840radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size) 8841{ 8842 unsigned wave_size = 0; 8843 unsigned scratch_bytes_per_wave = 0; 8844 8845 if (cmd_buffer->state.rt_pipeline) { 8846 scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->base.scratch_bytes_per_wave; 8847 wave_size = cmd_buffer->state.rt_pipeline->base.shaders[MESA_SHADER_COMPUTE]->info.wave_size; 8848 } 8849 8850 /* The hardware register is specified as a multiple of 256 DWORDS. */ 8851 scratch_bytes_per_wave += align(size * wave_size, 1024); 8852 8853 cmd_buffer->compute_scratch_size_per_wave_needed = 8854 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave); 8855} 8856 8857VKAPI_ATTR void VKAPI_CALL 8858radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size) 8859{ 8860 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8861 8862 radv_set_rt_stack_size(cmd_buffer, size); 8863 cmd_buffer->state.rt_stack_size = size; 8864} 8865 8866VKAPI_ATTR void VKAPI_CALL 8867radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo) 8868{ 8869 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8870 8871 radv_mark_noncoherent_rb(cmd_buffer); 8872 8873 radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier); 8874 8875 radv_cmd_buffer_end_subpass(cmd_buffer); 8876 8877 vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments); 8878 vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs); 8879 8880 cmd_buffer->state.pass = NULL; 8881 cmd_buffer->state.subpass = NULL; 8882 cmd_buffer->state.attachments = NULL; 8883 cmd_buffer->state.framebuffer = NULL; 8884 cmd_buffer->state.subpass_sample_locs = NULL; 8885} 8886 8887VKAPI_ATTR void VKAPI_CALL 8888radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo) 8889{ 8890 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 8891 const VkRenderingFragmentShadingRateAttachmentInfoKHR *vrs_info = vk_find_struct_const( 8892 pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR); 8893 VkResult result; 8894 /* (normal + resolve) for color attachments and ds and a VRS attachment */ 8895 VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3]; 8896 VkAttachmentDescriptionStencilLayout ds_stencil_att, ds_stencil_resolve_att; 8897 VkImageView iviews[MAX_RTS * 2 + 3]; 8898 VkAttachmentReference2 color_refs[MAX_RTS], 
color_resolve_refs[MAX_RTS]; 8899 VkAttachmentReference2 ds_ref, ds_resolve_ref, vrs_ref; 8900 VkAttachmentReferenceStencilLayout ds_stencil_ref, ds_stencil_resolve_ref; 8901 VkSubpassDescriptionDepthStencilResolve ds_resolve_info; 8902 VkFragmentShadingRateAttachmentInfoKHR vrs_subpass_info; 8903 VkClearValue clear_values[MAX_RTS * 2 + 3]; 8904 unsigned att_count = 0; 8905 8906 VkSubpassDescription2 subpass = { 8907 .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2, 8908 .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, 8909 .viewMask = pRenderingInfo->viewMask, 8910 .colorAttachmentCount = pRenderingInfo->colorAttachmentCount, 8911 .pColorAttachments = color_refs, 8912 .pResolveAttachments = color_resolve_refs, 8913 }; 8914 8915 for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; ++i) { 8916 color_refs[i] = (VkAttachmentReference2){ 8917 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 8918 .attachment = VK_ATTACHMENT_UNUSED, 8919 }; 8920 color_resolve_refs[i] = (VkAttachmentReference2){ 8921 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 8922 .attachment = VK_ATTACHMENT_UNUSED, 8923 }; 8924 8925 if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE) 8926 continue; 8927 8928 const VkRenderingAttachmentInfo *info = &pRenderingInfo->pColorAttachments[i]; 8929 RADV_FROM_HANDLE(radv_image_view, iview, info->imageView); 8930 color_refs[i] = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 8931 .attachment = att_count, 8932 .layout = info->imageLayout, 8933 .aspectMask = iview->vk.aspects}; 8934 8935 iviews[att_count] = info->imageView; 8936 clear_values[att_count] = info->clearValue; 8937 VkAttachmentDescription2 *att = att_desc + att_count++; 8938 8939 memset(att, 0, sizeof(*att)); 8940 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2; 8941 att->format = iview->vk.format; 8942 att->samples = iview->image->info.samples; 8943 att->loadOp = info->loadOp; 8944 att->storeOp = info->storeOp; 8945 att->initialLayout = info->imageLayout; 8946 att->finalLayout = info->imageLayout; 8947 8948 if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT) 8949 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; 8950 8951 if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT) 8952 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE; 8953 8954 if (info->resolveMode != VK_RESOLVE_MODE_NONE && 8955 !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) { 8956 RADV_FROM_HANDLE(radv_image_view, resolve_iview, info->resolveImageView); 8957 color_resolve_refs[i] = 8958 (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 8959 .attachment = att_count, 8960 .layout = info->resolveImageLayout, 8961 .aspectMask = resolve_iview->vk.aspects}; 8962 8963 iviews[att_count] = info->resolveImageView; 8964 att = att_desc + att_count++; 8965 8966 memset(att, 0, sizeof(*att)); 8967 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2; 8968 att->format = resolve_iview->vk.format; 8969 att->samples = resolve_iview->image->info.samples; 8970 att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; 8971 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE; 8972 att->initialLayout = info->resolveImageLayout; 8973 att->finalLayout = info->resolveImageLayout; 8974 } 8975 } 8976 8977 if (pRenderingInfo->pDepthAttachment || pRenderingInfo->pStencilAttachment) { 8978 const VkRenderingAttachmentInfo *common_info = pRenderingInfo->pDepthAttachment 8979 ? 
pRenderingInfo->pDepthAttachment 8980 : pRenderingInfo->pStencilAttachment; 8981 RADV_FROM_HANDLE(radv_image_view, iview, common_info->imageView); 8982 8983 if (common_info->imageView != VK_NULL_HANDLE) { 8984 ds_ref = (VkAttachmentReference2){ 8985 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 8986 .attachment = att_count, 8987 .layout = common_info->imageLayout, 8988 .aspectMask = (pRenderingInfo->pDepthAttachment ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) | 8989 (pRenderingInfo->pStencilAttachment ? VK_IMAGE_ASPECT_STENCIL_BIT : 0)}; 8990 subpass.pDepthStencilAttachment = &ds_ref; 8991 8992 iviews[att_count] = common_info->imageView; 8993 if (pRenderingInfo->pDepthAttachment) 8994 clear_values[att_count].depthStencil.depth = 8995 pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth; 8996 if (pRenderingInfo->pStencilAttachment) 8997 clear_values[att_count].depthStencil.stencil = 8998 pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil; 8999 VkAttachmentDescription2 *att = att_desc + att_count++; 9000 9001 memset(att, 0, sizeof(*att)); 9002 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2; 9003 att->format = iview->vk.format; 9004 att->samples = iview->image->info.samples; 9005 9006 if (pRenderingInfo->pDepthAttachment) { 9007 att->loadOp = pRenderingInfo->pDepthAttachment->loadOp; 9008 att->storeOp = pRenderingInfo->pDepthAttachment->storeOp; 9009 } else { 9010 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; 9011 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE; 9012 } 9013 9014 if (pRenderingInfo->pStencilAttachment) { 9015 att->stencilLoadOp = pRenderingInfo->pStencilAttachment->loadOp; 9016 att->stencilStoreOp = pRenderingInfo->pStencilAttachment->storeOp; 9017 } else { 9018 att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; 9019 att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; 9020 } 9021 9022 if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT) { 9023 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; 9024 att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; 9025 } 9026 9027 if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT) { 9028 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE; 9029 att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; 9030 } 9031 9032 att->initialLayout = common_info->imageLayout; 9033 att->finalLayout = common_info->imageLayout; 9034 9035 if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment) { 9036 ds_ref.pNext = &ds_stencil_ref; 9037 ds_stencil_ref = (VkAttachmentReferenceStencilLayout){ 9038 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT, 9039 .stencilLayout = pRenderingInfo->pStencilAttachment->imageLayout}; 9040 9041 att->pNext = &ds_stencil_att; 9042 ds_stencil_att = (VkAttachmentDescriptionStencilLayout){ 9043 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT, 9044 .stencilInitialLayout = pRenderingInfo->pStencilAttachment->imageLayout, 9045 .stencilFinalLayout = pRenderingInfo->pStencilAttachment->imageLayout, 9046 }; 9047 } 9048 9049 if (((pRenderingInfo->pDepthAttachment && 9050 pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE) || 9051 (pRenderingInfo->pStencilAttachment && 9052 pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)) && 9053 !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) { 9054 RADV_FROM_HANDLE(radv_image_view, resolve_iview, common_info->resolveImageView); 9055 ds_resolve_ref = 9056 (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 9057 .attachment = att_count, 9058 .layout = 
common_info->resolveImageLayout, 9059 .aspectMask = resolve_iview->vk.aspects}; 9060 9061 iviews[att_count] = common_info->resolveImageView; 9062 att = att_desc + att_count++; 9063 9064 memset(att, 0, sizeof(*att)); 9065 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2; 9066 att->format = resolve_iview->vk.format; 9067 att->samples = resolve_iview->image->info.samples; 9068 att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; 9069 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE; 9070 att->initialLayout = common_info->resolveImageLayout; 9071 att->finalLayout = common_info->resolveImageLayout; 9072 9073 ds_resolve_info = (VkSubpassDescriptionDepthStencilResolve){ 9074 .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE, 9075 .pNext = subpass.pNext, 9076 .depthResolveMode = 9077 (pRenderingInfo->pDepthAttachment && 9078 pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE) 9079 ? pRenderingInfo->pDepthAttachment->resolveMode 9080 : VK_RESOLVE_MODE_NONE, 9081 .stencilResolveMode = 9082 (pRenderingInfo->pStencilAttachment && 9083 pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE) 9084 ? pRenderingInfo->pStencilAttachment->resolveMode 9085 : VK_RESOLVE_MODE_NONE, 9086 .pDepthStencilResolveAttachment = &ds_resolve_ref}; 9087 subpass.pNext = &ds_resolve_info; 9088 9089 if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment && 9090 pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE && 9091 pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE) { 9092 ds_resolve_ref.pNext = &ds_stencil_resolve_ref; 9093 ds_stencil_resolve_ref = (VkAttachmentReferenceStencilLayout){ 9094 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT, 9095 .stencilLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout}; 9096 9097 att->pNext = &ds_stencil_resolve_att; 9098 ds_stencil_resolve_att = (VkAttachmentDescriptionStencilLayout){ 9099 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT, 9100 .stencilInitialLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout, 9101 .stencilFinalLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout, 9102 }; 9103 } 9104 } 9105 } 9106 } 9107 9108 if (vrs_info && vrs_info->imageView) { 9109 RADV_FROM_HANDLE(radv_image_view, iview, vrs_info->imageView); 9110 vrs_ref = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2, 9111 .attachment = att_count, 9112 .layout = vrs_info->imageLayout, 9113 .aspectMask = iview->vk.aspects}; 9114 9115 iviews[att_count] = vrs_info->imageView; 9116 VkAttachmentDescription2 *att = att_desc + att_count++; 9117 9118 memset(att, 0, sizeof(*att)); 9119 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2; 9120 att->format = iview->vk.format; 9121 att->samples = iview->image->info.samples; 9122 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; 9123 att->storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; 9124 att->initialLayout = vrs_info->imageLayout; 9125 att->finalLayout = vrs_info->imageLayout; 9126 9127 vrs_subpass_info = (VkFragmentShadingRateAttachmentInfoKHR){ 9128 .sType = VK_STRUCTURE_TYPE_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR, 9129 .pNext = subpass.pNext, 9130 .pFragmentShadingRateAttachment = &vrs_ref, 9131 .shadingRateAttachmentTexelSize = vrs_info->shadingRateAttachmentTexelSize, 9132 }; 9133 subpass.pNext = &vrs_subpass_info; 9134 } 9135 9136 VkRenderPassCreateInfo2 rp_create_info = { 9137 .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2, 9138 .attachmentCount = att_count, 
9139 .pAttachments = att_desc, 9140 .subpassCount = 1, 9141 .pSubpasses = &subpass, 9142 }; 9143 9144 VkRenderPass rp; 9145 result = 9146 radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp); 9147 if (result != VK_SUCCESS) { 9148 cmd_buffer->record_result = result; 9149 return; 9150 } 9151 9152 unsigned w = pRenderingInfo->renderArea.offset.x + pRenderingInfo->renderArea.extent.width; 9153 unsigned h = pRenderingInfo->renderArea.offset.y + pRenderingInfo->renderArea.extent.height; 9154 for (unsigned i = 0; i < att_count; ++i) { 9155 RADV_FROM_HANDLE(radv_image_view, iview, iviews[i]); 9156 9157 if (vrs_info && vrs_info->imageView == iviews[i]) 9158 continue; 9159 9160 w = MIN2(w, iview->extent.width); 9161 h = MIN2(h, iview->extent.height); 9162 } 9163 VkFramebufferCreateInfo fb_create_info = { 9164 .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, 9165 .renderPass = rp, 9166 .attachmentCount = att_count, 9167 .pAttachments = iviews, 9168 .width = w, 9169 .height = h, 9170 .layers = pRenderingInfo->layerCount, 9171 }; 9172 9173 VkFramebuffer fb; 9174 result = 9175 vk_common_CreateFramebuffer(radv_device_to_handle(cmd_buffer->device), &fb_create_info, NULL, &fb); 9176 if (result != VK_SUCCESS) { 9177 radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device), rp, NULL); 9178 cmd_buffer->record_result = result; 9179 return; 9180 } 9181 9182 VkRenderPassBeginInfo begin_info = {.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, 9183 .renderPass = rp, 9184 .framebuffer = fb, 9185 .renderArea = pRenderingInfo->renderArea, 9186 .clearValueCount = att_count, 9187 .pClearValues = clear_values}; 9188 9189 const VkSubpassBeginInfo pass_begin_info = { 9190 .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO, 9191 .contents = (pRenderingInfo->flags & VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT) 9192 ? VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS 9193 : VK_SUBPASS_CONTENTS_INLINE, 9194 }; 9195 9196 radv_CmdBeginRenderPass2(commandBuffer, &begin_info, &pass_begin_info); 9197} 9198 9199VKAPI_ATTR void VKAPI_CALL 9200radv_CmdEndRendering(VkCommandBuffer commandBuffer) 9201{ 9202 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 9203 struct radv_render_pass *pass = cmd_buffer->state.pass; 9204 struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer; 9205 9206 radv_CmdEndRenderPass2(commandBuffer, NULL); 9207 9208 vk_common_DestroyFramebuffer(radv_device_to_handle(cmd_buffer->device), 9209 vk_framebuffer_to_handle(framebuffer), NULL); 9210 radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device), 9211 radv_render_pass_to_handle(pass), NULL); 9212} 9213 9214/* 9215 * For HTILE we have the following interesting clear words: 9216 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE 9217 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE. 
 * 0xfffffff0: Clear depth to 1.0
 * 0x00000000: Clear depth to 0.0
 */
static void
radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                      const VkImageSubresourceRange *range)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
   VkClearDepthStencilValue value = {0};
   struct radv_barrier_data barrier = {0};

   barrier.layout_transitions.init_mask_ram = 1;
   radv_describe_layout_transition(cmd_buffer, &barrier);

   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is consistent
    * in considering previous rendering work for WAW hazards. */
   state->flush_bits |=
      radv_src_access_flush(cmd_buffer, VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);

   if (image->planes[0].surface.has_stencil &&
       !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
      /* Flush caches before performing a separate aspect initialization because it's a
       * read-modify-write operation.
       */
      state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT, image);
   }

   state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);

   radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);

   if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      /* Initialize the TC-compat metadata value to 0 because by
       * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
       * have to conditionally update its value when performing
       * a fast depth clear.
       */
      radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
   }
}

static void
radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                                   VkImageLayout src_layout, bool src_render_loop,
                                   VkImageLayout dst_layout, bool dst_render_loop,
                                   unsigned src_queue_mask, unsigned dst_queue_mask,
                                   const VkImageSubresourceRange *range,
                                   struct radv_sample_locations_state *sample_locs)
{
   struct radv_device *device = cmd_buffer->device;

   if (!radv_htile_enabled(image, range->baseMipLevel))
      return;

   if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
      radv_initialize_htile(cmd_buffer, image, range);
   } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
                                               src_queue_mask) &&
              radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
                                              dst_queue_mask)) {
      radv_initialize_htile(cmd_buffer, image, range);
   } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
                                              src_queue_mask) &&
              !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
                                               dst_queue_mask)) {
      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;

      radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);

      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
   }
}

static uint32_t
radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                const VkImageSubresourceRange *range, uint32_t value)
{
   struct radv_barrier_data barrier = {0};

   barrier.layout_transitions.init_mask_ram = 1;
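   /* Report the mask-RAM initialization to the barrier/layout-transition instrumentation. */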
radv_describe_layout_transition(cmd_buffer, &barrier); 9302 9303 return radv_clear_cmask(cmd_buffer, image, range, value); 9304} 9305 9306uint32_t 9307radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 9308 const VkImageSubresourceRange *range) 9309{ 9310 static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210}; 9311 uint32_t log2_samples = util_logbase2(image->info.samples); 9312 uint32_t value = fmask_clear_values[log2_samples]; 9313 struct radv_barrier_data barrier = {0}; 9314 9315 barrier.layout_transitions.init_mask_ram = 1; 9316 radv_describe_layout_transition(cmd_buffer, &barrier); 9317 9318 return radv_clear_fmask(cmd_buffer, image, range, value); 9319} 9320 9321uint32_t 9322radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 9323 const VkImageSubresourceRange *range, uint32_t value) 9324{ 9325 struct radv_barrier_data barrier = {0}; 9326 uint32_t flush_bits = 0; 9327 unsigned size = 0; 9328 9329 barrier.layout_transitions.init_mask_ram = 1; 9330 radv_describe_layout_transition(cmd_buffer, &barrier); 9331 9332 flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value); 9333 9334 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX8) { 9335 /* When DCC is enabled with mipmaps, some levels might not 9336 * support fast clears and we have to initialize them as "fully 9337 * expanded". 9338 */ 9339 /* Compute the size of all fast clearable DCC levels. */ 9340 for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) { 9341 struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i]; 9342 unsigned dcc_fast_clear_size = 9343 dcc_level->dcc_slice_fast_clear_size * image->info.array_size; 9344 9345 if (!dcc_fast_clear_size) 9346 break; 9347 9348 size = dcc_level->dcc_offset + dcc_fast_clear_size; 9349 } 9350 9351 /* Initialize the mipmap levels without DCC. */ 9352 if (size != image->planes[0].surface.meta_size) { 9353 flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo, 9354 radv_buffer_get_va(image->bindings[0].bo) + 9355 image->bindings[0].offset + 9356 image->planes[0].surface.meta_offset + size, 9357 image->planes[0].surface.meta_size - size, 0xffffffff); 9358 } 9359 } 9360 9361 return flush_bits; 9362} 9363 9364/** 9365 * Initialize DCC/FMASK/CMASK metadata for a color image. 9366 */ 9367static void 9368radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 9369 VkImageLayout src_layout, bool src_render_loop, 9370 VkImageLayout dst_layout, bool dst_render_loop, 9371 unsigned src_queue_mask, unsigned dst_queue_mask, 9372 const VkImageSubresourceRange *range) 9373{ 9374 uint32_t flush_bits = 0; 9375 9376 /* Transitioning from LAYOUT_UNDEFINED layout not everyone is 9377 * consistent in considering previous rendering work for WAW hazards. 9378 */ 9379 cmd_buffer->state.flush_bits |= 9380 radv_src_access_flush(cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image); 9381 9382 if (radv_image_has_cmask(image)) { 9383 uint32_t value; 9384 9385 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) { 9386 /* TODO: Fix clearing CMASK layers on GFX9. 
*/ 9387 if (radv_image_is_tc_compat_cmask(image) || 9388 (radv_image_has_fmask(image) && 9389 radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout, 9390 dst_render_loop, dst_queue_mask))) { 9391 value = 0xccccccccu; 9392 } else { 9393 value = 0xffffffffu; 9394 } 9395 } else { 9396 static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff}; 9397 uint32_t log2_samples = util_logbase2(image->info.samples); 9398 9399 value = cmask_clear_values[log2_samples]; 9400 } 9401 9402 flush_bits |= radv_init_cmask(cmd_buffer, image, range, value); 9403 } 9404 9405 if (radv_image_has_fmask(image)) { 9406 flush_bits |= radv_init_fmask(cmd_buffer, image, range); 9407 } 9408 9409 if (radv_dcc_enabled(image, range->baseMipLevel)) { 9410 uint32_t value = 0xffffffffu; /* Fully expanded mode. */ 9411 9412 if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 9413 dst_layout, dst_render_loop, dst_queue_mask)) { 9414 value = 0u; 9415 } 9416 9417 flush_bits |= radv_init_dcc(cmd_buffer, image, range, value); 9418 } 9419 9420 if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) { 9421 radv_update_fce_metadata(cmd_buffer, image, range, false); 9422 9423 uint32_t color_values[2] = {0}; 9424 radv_set_color_clear_metadata(cmd_buffer, image, range, color_values); 9425 } 9426 9427 cmd_buffer->state.flush_bits |= flush_bits; 9428} 9429 9430static void 9431radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 9432 VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask) 9433{ 9434 /* If the image is read-only, we don't have to retile DCC because it can't change. */ 9435 if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS)) 9436 return; 9437 9438 if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR && 9439 (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR || 9440 (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN)))) 9441 radv_retile_dcc(cmd_buffer, image); 9442} 9443 9444static bool 9445radv_image_need_retile(const struct radv_image *image) 9446{ 9447 return image->planes[0].surface.display_dcc_offset && 9448 image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset; 9449} 9450 9451/** 9452 * Handle color image transitions for DCC/FMASK/CMASK. 
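 * Initializes the metadata when transitioning from UNDEFINED, and decompresses DCC or flushes
 * fast clears when the destination layout/queue can no longer keep them compressed.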
9453 */ 9454static void 9455radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 9456 VkImageLayout src_layout, bool src_render_loop, 9457 VkImageLayout dst_layout, bool dst_render_loop, 9458 unsigned src_queue_mask, unsigned dst_queue_mask, 9459 const VkImageSubresourceRange *range) 9460{ 9461 bool dcc_decompressed = false, fast_clear_flushed = false; 9462 9463 if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) && 9464 !radv_dcc_enabled(image, range->baseMipLevel)) 9465 return; 9466 9467 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { 9468 radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 9469 dst_render_loop, src_queue_mask, dst_queue_mask, range); 9470 9471 if (radv_image_need_retile(image)) 9472 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask); 9473 return; 9474 } 9475 9476 if (radv_dcc_enabled(image, range->baseMipLevel)) { 9477 if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) { 9478 cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu); 9479 } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 9480 src_layout, src_render_loop, src_queue_mask) && 9481 !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, 9482 dst_layout, dst_render_loop, dst_queue_mask)) { 9483 radv_decompress_dcc(cmd_buffer, image, range); 9484 dcc_decompressed = true; 9485 } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 9486 src_layout, src_render_loop, src_queue_mask) && 9487 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 9488 dst_layout, dst_render_loop, dst_queue_mask)) { 9489 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 9490 fast_clear_flushed = true; 9491 } 9492 9493 if (radv_image_need_retile(image)) 9494 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask); 9495 } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) { 9496 if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 9497 src_layout, src_render_loop, src_queue_mask) && 9498 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, 9499 dst_layout, dst_render_loop, dst_queue_mask)) { 9500 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 9501 fast_clear_flushed = true; 9502 } 9503 } 9504 9505 /* MSAA color decompress. */ 9506 if (radv_image_has_fmask(image) && 9507 (image->vk.usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) && 9508 radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) && 9509 !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) { 9510 if (radv_dcc_enabled(image, range->baseMipLevel) && 9511 !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) { 9512 /* A DCC decompress is required before expanding FMASK 9513 * when DCC stores aren't supported to avoid being in 9514 * a state where DCC is compressed and the main 9515 * surface is uncompressed. 9516 */ 9517 radv_decompress_dcc(cmd_buffer, image, range); 9518 } else if (!fast_clear_flushed) { 9519 /* A FMASK decompress is required before expanding 9520 * FMASK. 
9521 */ 9522 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range); 9523 } 9524 9525 struct radv_barrier_data barrier = {0}; 9526 barrier.layout_transitions.fmask_color_expand = 1; 9527 radv_describe_layout_transition(cmd_buffer, &barrier); 9528 9529 radv_expand_fmask_image_inplace(cmd_buffer, image, range); 9530 } 9531} 9532 9533static void 9534radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, 9535 VkImageLayout src_layout, bool src_render_loop, 9536 VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family_index, 9537 uint32_t dst_family_index, const VkImageSubresourceRange *range, 9538 struct radv_sample_locations_state *sample_locs) 9539{ 9540 enum radv_queue_family src_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, src_family_index); 9541 enum radv_queue_family dst_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, dst_family_index); 9542 if (image->exclusive && src_family_index != dst_family_index) { 9543 /* This is an acquire or a release operation and there will be 9544 * a corresponding release/acquire. Do the transition in the 9545 * most flexible queue. */ 9546 9547 assert(src_qf == cmd_buffer->qf || 9548 dst_qf == cmd_buffer->qf); 9549 9550 if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT) 9551 return; 9552 9553 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) 9554 return; 9555 9556 if (cmd_buffer->qf == RADV_QUEUE_COMPUTE && 9557 (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL)) 9558 return; 9559 } 9560 9561 unsigned src_queue_mask = 9562 radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf); 9563 unsigned dst_queue_mask = 9564 radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf); 9565 9566 if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask) 9567 return; 9568 9569 if (vk_format_has_depth(image->vk.format)) { 9570 radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 9571 dst_render_loop, src_queue_mask, dst_queue_mask, range, 9572 sample_locs); 9573 } else { 9574 radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout, 9575 dst_render_loop, src_queue_mask, dst_queue_mask, range); 9576 } 9577} 9578 9579static void 9580radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask) 9581{ 9582 /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a 9583 * buffer (or a MSAA image using FMASK). Note that updating a buffer is considered a clear 9584 * operation but it might also use a CP DMA copy in some rare situations. Other operations using 9585 * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC). 
9586 */ 9587 if (stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | 9588 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | 9589 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) 9590 si_cp_dma_wait_for_idle(cmd_buffer); 9591} 9592 9593static void 9594radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_info, 9595 enum rgp_barrier_reason reason) 9596{ 9597 enum radv_cmd_flush_bits src_flush_bits = 0; 9598 enum radv_cmd_flush_bits dst_flush_bits = 0; 9599 VkPipelineStageFlags2 src_stage_mask = 0; 9600 VkPipelineStageFlags2 dst_stage_mask = 0; 9601 9602 if (cmd_buffer->state.subpass) 9603 radv_mark_noncoherent_rb(cmd_buffer); 9604 9605 radv_describe_barrier_start(cmd_buffer, reason); 9606 9607 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) { 9608 src_stage_mask |= dep_info->pMemoryBarriers[i].srcStageMask; 9609 src_flush_bits |= 9610 radv_src_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].srcAccessMask, NULL); 9611 dst_stage_mask |= dep_info->pMemoryBarriers[i].dstStageMask; 9612 dst_flush_bits |= 9613 radv_dst_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].dstAccessMask, NULL); 9614 } 9615 9616 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) { 9617 src_stage_mask |= dep_info->pBufferMemoryBarriers[i].srcStageMask; 9618 src_flush_bits |= 9619 radv_src_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].srcAccessMask, NULL); 9620 dst_stage_mask |= dep_info->pBufferMemoryBarriers[i].dstStageMask; 9621 dst_flush_bits |= 9622 radv_dst_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].dstAccessMask, NULL); 9623 } 9624 9625 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) { 9626 RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image); 9627 9628 src_stage_mask |= dep_info->pImageMemoryBarriers[i].srcStageMask; 9629 src_flush_bits |= 9630 radv_src_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].srcAccessMask, image); 9631 dst_stage_mask |= dep_info->pImageMemoryBarriers[i].dstStageMask; 9632 dst_flush_bits |= 9633 radv_dst_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].dstAccessMask, image); 9634 } 9635 9636 /* The Vulkan spec 1.1.98 says: 9637 * 9638 * "An execution dependency with only 9639 * VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask 9640 * will only prevent that stage from executing in subsequently 9641 * submitted commands. As this stage does not perform any actual 9642 * execution, this is not observable - in effect, it does not delay 9643 * processing of subsequent commands. Similarly an execution dependency 9644 * with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask 9645 * will effectively not wait for any prior commands to complete." 
9646 */ 9647 if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT) 9648 radv_stage_flush(cmd_buffer, src_stage_mask); 9649 cmd_buffer->state.flush_bits |= src_flush_bits; 9650 9651 radv_ace_internal_barrier(cmd_buffer, src_stage_mask, 0); 9652 9653 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) { 9654 RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image); 9655 9656 const struct VkSampleLocationsInfoEXT *sample_locs_info = 9657 vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT); 9658 struct radv_sample_locations_state sample_locations; 9659 9660 if (sample_locs_info) { 9661 assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT); 9662 sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel; 9663 sample_locations.grid_size = sample_locs_info->sampleLocationGridSize; 9664 sample_locations.count = sample_locs_info->sampleLocationsCount; 9665 typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations, 9666 sample_locs_info->sampleLocationsCount); 9667 } 9668 9669 radv_handle_image_transition( 9670 cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout, 9671 false, /* Outside of a renderpass we are never in a renderloop */ 9672 dep_info->pImageMemoryBarriers[i].newLayout, 9673 false, /* Outside of a renderpass we are never in a renderloop */ 9674 dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex, 9675 dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex, 9676 &dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL); 9677 } 9678 9679 radv_ace_internal_barrier(cmd_buffer, 0, dst_stage_mask); 9680 radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask); 9681 9682 cmd_buffer->state.flush_bits |= dst_flush_bits; 9683 9684 radv_describe_barrier_end(cmd_buffer); 9685} 9686 9687VKAPI_ATTR void VKAPI_CALL 9688radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, 9689 const VkDependencyInfo *pDependencyInfo) 9690{ 9691 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 9692 9693 radv_barrier(cmd_buffer, pDependencyInfo, RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER); 9694} 9695 9696static void 9697write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event, 9698 VkPipelineStageFlags2 stageMask, unsigned value) 9699{ 9700 struct radeon_cmdbuf *cs = cmd_buffer->cs; 9701 uint64_t va = radv_buffer_get_va(event->bo); 9702 9703 si_emit_cache_flush(cmd_buffer); 9704 9705 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo); 9706 9707 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28); 9708 9709 if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT | 9710 VK_PIPELINE_STAGE_2_RESOLVE_BIT | 9711 VK_PIPELINE_STAGE_2_BLIT_BIT | 9712 VK_PIPELINE_STAGE_2_CLEAR_BIT)) { 9713 /* Be conservative for now. */ 9714 stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT; 9715 } 9716 9717 /* Flags that only require a top-of-pipe event. */ 9718 VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT; 9719 9720 /* Flags that only require a post-index-fetch event. */ 9721 VkPipelineStageFlags2 post_index_fetch_flags = 9722 top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT; 9723 9724 /* Flags that only require signaling post PS. 
*/ 9725 VkPipelineStageFlags2 post_ps_flags = 9726 post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | 9727 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | 9728 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | 9729 VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV | 9730 VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT | 9731 VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT | 9732 VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR | 9733 VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT; 9734 9735 /* Flags that only require signaling post CS. */ 9736 VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; 9737 9738 radv_cp_dma_wait_for_stages(cmd_buffer, stageMask); 9739 9740 if (!(stageMask & ~top_of_pipe_flags)) { 9741 /* Just need to sync the PFP engine. */ 9742 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 9743 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); 9744 radeon_emit(cs, va); 9745 radeon_emit(cs, va >> 32); 9746 radeon_emit(cs, value); 9747 } else if (!(stageMask & ~post_index_fetch_flags)) { 9748 /* Sync ME because PFP reads index and indirect buffers. */ 9749 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 9750 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); 9751 radeon_emit(cs, va); 9752 radeon_emit(cs, va >> 32); 9753 radeon_emit(cs, value); 9754 } else { 9755 unsigned event_type; 9756 9757 if (!(stageMask & ~post_ps_flags)) { 9758 /* Sync previous fragment shaders. */ 9759 event_type = V_028A90_PS_DONE; 9760 } else if (!(stageMask & ~post_cs_flags)) { 9761 /* Sync previous compute shaders. */ 9762 event_type = V_028A90_CS_DONE; 9763 } else { 9764 /* Otherwise, sync all prior GPU work. 
*/ 9765 event_type = V_028A90_BOTTOM_OF_PIPE_TS; 9766 } 9767 9768 si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, 9769 radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0, 9770 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value, 9771 cmd_buffer->gfx9_eop_bug_va); 9772 } 9773 9774 assert(cmd_buffer->cs->cdw <= cdw_max); 9775} 9776 9777VKAPI_ATTR void VKAPI_CALL 9778radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, 9779 const VkDependencyInfo* pDependencyInfo) 9780{ 9781 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 9782 RADV_FROM_HANDLE(radv_event, event, _event); 9783 VkPipelineStageFlags2 src_stage_mask = 0; 9784 9785 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) 9786 src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; 9787 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) 9788 src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; 9789 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) 9790 src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; 9791 9792 write_event(cmd_buffer, event, src_stage_mask, 1); 9793} 9794 9795VKAPI_ATTR void VKAPI_CALL 9796radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, 9797 VkPipelineStageFlags2 stageMask) 9798{ 9799 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 9800 RADV_FROM_HANDLE(radv_event, event, _event); 9801 9802 write_event(cmd_buffer, event, stageMask, 0); 9803} 9804 9805VKAPI_ATTR void VKAPI_CALL 9806radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents, 9807 const VkDependencyInfo* pDependencyInfos) 9808{ 9809 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 9810 struct radeon_cmdbuf *cs = cmd_buffer->cs; 9811 9812 for (unsigned i = 0; i < eventCount; ++i) { 9813 RADV_FROM_HANDLE(radv_event, event, pEvents[i]); 9814 uint64_t va = radv_buffer_get_va(event->bo); 9815 9816 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo); 9817 9818 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7); 9819 9820 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff); 9821 assert(cmd_buffer->cs->cdw <= cdw_max); 9822 } 9823 9824 radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS); 9825} 9826 9827VKAPI_ATTR void VKAPI_CALL 9828radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask) 9829{ 9830 /* No-op */ 9831} 9832 9833/* VK_EXT_conditional_rendering */ 9834VKAPI_ATTR void VKAPI_CALL 9835radv_CmdBeginConditionalRenderingEXT( 9836 VkCommandBuffer commandBuffer, 9837 const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin) 9838{ 9839 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 9840 RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer); 9841 struct radeon_cmdbuf *cs = cmd_buffer->cs; 9842 unsigned pred_op = PREDICATION_OP_BOOL32; 9843 bool draw_visible = true; 9844 uint64_t va; 9845 9846 va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset; 9847 9848 /* By default, if the 32-bit value at offset in buffer memory is zero, 9849 * then the rendering commands are discarded, otherwise they are 9850 * executed as normal. If the inverted flag is set, all commands are 9851 * discarded if the value is non zero. 
    */
   if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
      draw_visible = false;
   }

   si_emit_cache_flush(cmd_buffer);

   if (cmd_buffer->qf == RADV_QUEUE_GENERAL &&
       !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
      uint64_t pred_value = 0, pred_va;
      unsigned pred_offset;

      /* From the Vulkan spec 1.1.107:
       *
       * "If the 32-bit value at offset in buffer memory is zero,
       * then the rendering commands are discarded, otherwise they
       * are executed as normal. If the value of the predicate in
       * buffer memory changes while conditional rendering is
       * active, the rendering commands may be discarded in an
       * implementation-dependent way. Some implementations may
       * latch the value of the predicate upon beginning conditional
       * rendering while others may read it before every rendering
       * command."
       *
       * But the AMD hardware treats the predicate as a 64-bit
       * value, which means we need a workaround in the driver.
       * Luckily, we are not required to support the case where the
       * value changes while predication is active.
       *
       * The workaround is as follows:
       * 1) allocate a 64-bit value in the upload BO and initialize
       *    it to 0
       * 2) copy the 32-bit predicate value to the upload BO
       * 3) use the newly allocated VA for predication
       *
       * Based on the conditionalrender demo, it's faster to do the
       * COPY_DATA in ME (+ sync PFP) instead of PFP.
       */
      radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);

      pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);

      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);

      va = pred_va;
      pred_op = PREDICATION_OP_BOOL64;
   }

   /* MEC doesn't support predication, so we emulate it elsewhere. */
   if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
      si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
   }

   /* Store conditional rendering user info. */
   cmd_buffer->state.predicating = true;
   cmd_buffer->state.predication_type = draw_visible;
   cmd_buffer->state.predication_op = pred_op;
   cmd_buffer->state.predication_va = va;
   cmd_buffer->mec_inv_pred_emitted = false;
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   /* MEC doesn't support predication, no need to emit anything here. */
   if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
      si_emit_set_predication_state(cmd_buffer, false, 0, 0);
   }

   /* Reset conditional rendering user info.
*/ 9933 cmd_buffer->state.predicating = false; 9934 cmd_buffer->state.predication_type = -1; 9935 cmd_buffer->state.predication_op = 0; 9936 cmd_buffer->state.predication_va = 0; 9937 cmd_buffer->mec_inv_pred_emitted = false; 9938} 9939 9940/* VK_EXT_transform_feedback */ 9941VKAPI_ATTR void VKAPI_CALL 9942radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, 9943 uint32_t bindingCount, const VkBuffer *pBuffers, 9944 const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes) 9945{ 9946 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 9947 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 9948 uint8_t enabled_mask = 0; 9949 9950 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS); 9951 for (uint32_t i = 0; i < bindingCount; i++) { 9952 uint32_t idx = firstBinding + i; 9953 9954 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]); 9955 sb[idx].offset = pOffsets[i]; 9956 9957 if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) { 9958 sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset; 9959 } else { 9960 sb[idx].size = pSizes[i]; 9961 } 9962 9963 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo); 9964 9965 enabled_mask |= 1 << idx; 9966 } 9967 9968 cmd_buffer->state.streamout.enabled_mask |= enabled_mask; 9969 9970 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER; 9971} 9972 9973bool 9974radv_is_streamout_enabled(struct radv_cmd_buffer *cmd_buffer) 9975{ 9976 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 9977 9978 /* Streamout must be enabled for the PRIMITIVES_GENERATED query to work. */ 9979 return (so->streamout_enabled || cmd_buffer->state.prims_gen_query_enabled) && 9980 !cmd_buffer->state.suspend_streamout; 9981} 9982 9983void 9984radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer) 9985{ 9986 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 9987 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 9988 bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer); 9989 struct radeon_cmdbuf *cs = cmd_buffer->cs; 9990 uint32_t enabled_stream_buffers_mask = 0; 9991 9992 if (pipeline && pipeline->streamout_shader) { 9993 enabled_stream_buffers_mask = pipeline->streamout_shader->info.so.enabled_stream_buffers_mask; 9994 } 9995 9996 radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2); 9997 radeon_emit(cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) | 9998 S_028B94_STREAMOUT_1_EN(streamout_enabled) | 9999 S_028B94_STREAMOUT_2_EN(streamout_enabled) | 10000 S_028B94_STREAMOUT_3_EN(streamout_enabled)); 10001 radeon_emit(cs, so->hw_enabled_mask & enabled_stream_buffers_mask); 10002 10003 cmd_buffer->state.context_roll_without_scissor_emitted = true; 10004} 10005 10006static void 10007radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable) 10008{ 10009 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 10010 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer); 10011 uint32_t old_hw_enabled_mask = so->hw_enabled_mask; 10012 10013 so->streamout_enabled = enable; 10014 10015 so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) | 10016 (so->enabled_mask << 12); 10017 10018 if (!cmd_buffer->device->physical_device->use_ngg_streamout && 10019 ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) || 10020 (old_hw_enabled_mask != so->hw_enabled_mask))) 10021 radv_emit_streamout_enable(cmd_buffer); 
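   /* NGG streamout uses GDS for its counters, so flag that this command buffer needs GDS and
    * GDS OA space. */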
10022 10023 if (cmd_buffer->device->physical_device->use_ngg_streamout) { 10024 cmd_buffer->gds_needed = true; 10025 cmd_buffer->gds_oa_needed = true; 10026 } 10027} 10028 10029static void 10030radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer) 10031{ 10032 struct radeon_cmdbuf *cs = cmd_buffer->cs; 10033 unsigned reg_strmout_cntl; 10034 10035 /* The register is at different places on different ASICs. */ 10036 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) { 10037 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; 10038 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); 10039 radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME)); 10040 radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2); 10041 radeon_emit(cs, 0); 10042 radeon_emit(cs, 0); 10043 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) { 10044 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; 10045 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); 10046 } else { 10047 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; 10048 radeon_set_config_reg(cs, reg_strmout_cntl, 0); 10049 } 10050 10051 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 10052 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); 10053 10054 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); 10055 radeon_emit(cs, 10056 WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ 10057 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ 10058 radeon_emit(cs, 0); 10059 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ 10060 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ 10061 radeon_emit(cs, 4); /* poll interval */ 10062} 10063 10064static void 10065radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer, 10066 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 10067 const VkDeviceSize *pCounterBufferOffsets) 10068 10069{ 10070 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; 10071 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 10072 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; 10073 struct radv_shader_info *info = &pipeline->streamout_shader->info; 10074 struct radeon_cmdbuf *cs = cmd_buffer->cs; 10075 10076 radv_flush_vgt_streamout(cmd_buffer); 10077 10078 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 10079 u_foreach_bit(i, so->enabled_mask) 10080 { 10081 int32_t counter_buffer_idx = i - firstCounterBuffer; 10082 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 10083 counter_buffer_idx = -1; 10084 10085 /* AMD GCN binds streamout buffers as shader resources. 10086 * VGT only counts primitives and tells the shader through 10087 * SGPRs what to do. 10088 */ 10089 radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); 10090 radeon_emit(cs, sb[i].size >> 2); /* BUFFER_SIZE (in DW) */ 10091 radeon_emit(cs, info->so.strides[i]); /* VTX_STRIDE (in DW) */ 10092 10093 cmd_buffer->state.context_roll_without_scissor_emitted = true; 10094 10095 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { 10096 /* The array of counter buffers is optional. 
*/ 10097 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); 10098 uint64_t va = radv_buffer_get_va(buffer->bo); 10099 uint64_t counter_buffer_offset = 0; 10100 10101 if (pCounterBufferOffsets) 10102 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx]; 10103 10104 va += buffer->offset + counter_buffer_offset; 10105 10106 /* Append */ 10107 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 10108 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */ 10109 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ 10110 radeon_emit(cs, 0); /* unused */ 10111 radeon_emit(cs, 0); /* unused */ 10112 radeon_emit(cs, va); /* src address lo */ 10113 radeon_emit(cs, va >> 32); /* src address hi */ 10114 10115 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); 10116 } else { 10117 /* Start from the beginning. */ 10118 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); 10119 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */ 10120 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ 10121 radeon_emit(cs, 0); /* unused */ 10122 radeon_emit(cs, 0); /* unused */ 10123 radeon_emit(cs, 0); /* unused */ 10124 radeon_emit(cs, 0); /* unused */ 10125 } 10126 } 10127 10128 radv_set_streamout_enable(cmd_buffer, true); 10129} 10130 10131static void 10132gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer, 10133 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 10134 const VkDeviceSize *pCounterBufferOffsets) 10135{ 10136 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 10137 unsigned last_target = util_last_bit(so->enabled_mask) - 1; 10138 struct radeon_cmdbuf *cs = cmd_buffer->cs; 10139 10140 assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10); 10141 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 10142 10143 /* Sync because the next streamout operation will overwrite GDS and we 10144 * have to make sure it's idle. 10145 * TODO: Improve by tracking if there is a streamout operation in 10146 * flight. 10147 */ 10148 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH; 10149 si_emit_cache_flush(cmd_buffer); 10150 10151 u_foreach_bit(i, so->enabled_mask) 10152 { 10153 int32_t counter_buffer_idx = i - firstCounterBuffer; 10154 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 10155 counter_buffer_idx = -1; 10156 10157 bool append = 10158 counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]; 10159 uint64_t va = 0; 10160 10161 if (append) { 10162 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); 10163 uint64_t counter_buffer_offset = 0; 10164 10165 if (pCounterBufferOffsets) 10166 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx]; 10167 10168 va += radv_buffer_get_va(buffer->bo); 10169 va += buffer->offset + counter_buffer_offset; 10170 10171 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); 10172 } 10173 10174 radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); 10175 radeon_emit(cs, S_411_SRC_SEL(append ? 
                        V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                        S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   radv_set_streamout_enable(cmd_buffer, true);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                  const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
                                 pCounterBuffers, pCounterBufferOffsets);
   } else {
      radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                                pCounterBufferOffsets);
   }
}

static void
radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                        uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                        const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(cs, va);       /* dst address lo */
         radeon_emit(cs, va >> 32); /* dst address hi */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      /* Deactivate transform feedback by zeroing the buffer size.
       * The counters (primitives generated, primitives emitted) may
       * be enabled even if there is no buffer bound. This ensures
       * that the primitives-emitted query won't increment.
10247 */ 10248 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); 10249 10250 cmd_buffer->state.context_roll_without_scissor_emitted = true; 10251 } 10252 10253 radv_set_streamout_enable(cmd_buffer, false); 10254} 10255 10256static void 10257gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer, 10258 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 10259 const VkDeviceSize *pCounterBufferOffsets) 10260{ 10261 struct radv_streamout_state *so = &cmd_buffer->state.streamout; 10262 struct radeon_cmdbuf *cs = cmd_buffer->cs; 10263 10264 assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10); 10265 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); 10266 10267 u_foreach_bit(i, so->enabled_mask) 10268 { 10269 int32_t counter_buffer_idx = i - firstCounterBuffer; 10270 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) 10271 counter_buffer_idx = -1; 10272 10273 if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { 10274 /* The array of counters buffer is optional. */ 10275 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); 10276 uint64_t va = radv_buffer_get_va(buffer->bo); 10277 uint64_t counter_buffer_offset = 0; 10278 10279 if (pCounterBufferOffsets) 10280 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx]; 10281 10282 va += buffer->offset + counter_buffer_offset; 10283 10284 si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, 10285 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0, 10286 EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0); 10287 10288 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo); 10289 } 10290 } 10291 10292 radv_set_streamout_enable(cmd_buffer, false); 10293} 10294 10295VKAPI_ATTR void VKAPI_CALL 10296radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, 10297 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, 10298 const VkDeviceSize *pCounterBufferOffsets) 10299{ 10300 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 10301 10302 if (cmd_buffer->device->physical_device->use_ngg_streamout) { 10303 gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers, 10304 pCounterBufferOffsets); 10305 } else { 10306 radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers, 10307 pCounterBufferOffsets); 10308 } 10309} 10310 10311VKAPI_ATTR void VKAPI_CALL 10312radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount, 10313 uint32_t firstInstance, VkBuffer _counterBuffer, 10314 VkDeviceSize counterBufferOffset, uint32_t counterOffset, 10315 uint32_t vertexStride) 10316{ 10317 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 10318 RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer); 10319 struct radv_draw_info info; 10320 10321 info.count = 0; 10322 info.instance_count = instanceCount; 10323 info.first_instance = firstInstance; 10324 info.strmout_buffer = counterBuffer; 10325 info.strmout_buffer_offset = counterBufferOffset; 10326 info.stride = vertexStride; 10327 info.indexed = false; 10328 info.indirect = NULL; 10329 10330 if (!radv_before_draw(cmd_buffer, &info, 1)) 10331 return; 10332 struct VkMultiDrawInfoEXT minfo = { 0, 0 }; 10333 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0); 10334 
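   /* The packet above uses USE_OPAQUE, i.e. the vertex count is taken from the streamout
    * buffer-filled size instead of an explicit count. */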
radv_after_draw(cmd_buffer); 10335} 10336 10337/* VK_AMD_buffer_marker */ 10338VKAPI_ATTR void VKAPI_CALL 10339radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, 10340 VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker) 10341{ 10342 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 10343 RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer); 10344 struct radeon_cmdbuf *cs = cmd_buffer->cs; 10345 uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset; 10346 10347 si_emit_cache_flush(cmd_buffer); 10348 10349 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12); 10350 10351 if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) { 10352 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 10353 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | 10354 COPY_DATA_WR_CONFIRM); 10355 radeon_emit(cs, marker); 10356 radeon_emit(cs, 0); 10357 radeon_emit(cs, va); 10358 radeon_emit(cs, va >> 32); 10359 } else { 10360 si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, 10361 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 10362 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker, 10363 cmd_buffer->gfx9_eop_bug_va); 10364 } 10365 10366 assert(cmd_buffer->cs->cdw <= cdw_max); 10367} 10368 10369void 10370radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer, 10371 VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline, 10372 uint32_t groupIndex) 10373{ 10374 fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n"); 10375 abort(); 10376}