/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "util/u_pack_color.h"
#include "vk_util.h"

void
v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
{
   if (!bo)
      return;

   if (job->bo_handle_mask & bo->handle_bit) {
      if (_mesa_set_search(job->bos, bo))
         return;
   }

   _mesa_set_add(job->bos, bo);
   job->bo_count++;
   job->bo_handle_mask |= bo->handle_bit;
}

void
v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
{
   assert(bo);
   _mesa_set_add(job->bos, bo);
   job->bo_count++;
   job->bo_handle_mask |= bo->handle_bit;
}
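
/* Shared initialization of the driver-owned command buffer state. This is
 * used both when a command buffer is first created and when it is reset,
 * which is why it must leave the vk_command_buffer base object alone (see
 * the note below).
 */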
static void
cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
                struct v3dv_device *device)
{
   /* Do not reset the base object! If we are calling this from a command
    * buffer reset that would reset the loader's dispatch table for the
    * command buffer, as well as any other relevant info from vk_object_base.
    */
   const uint32_t base_size = sizeof(struct vk_command_buffer);
   uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
   memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);

   cmd_buffer->device = device;

   list_inithead(&cmd_buffer->private_objs);
   list_inithead(&cmd_buffer->jobs);
   list_inithead(&cmd_buffer->list_link);

   cmd_buffer->state.subpass_idx = -1;
   cmd_buffer->state.meta.subpass_idx = -1;

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
}

static void cmd_buffer_destroy(struct vk_command_buffer *cmd_buffer);

static VkResult
cmd_buffer_create(struct v3dv_device *device,
                  struct vk_command_pool *pool,
                  VkCommandBufferLevel level,
                  VkCommandBuffer *pCommandBuffer)
{
   struct v3dv_cmd_buffer *cmd_buffer;
   cmd_buffer = vk_zalloc(&pool->alloc,
                          sizeof(*cmd_buffer),
                          8,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result;
   result = vk_command_buffer_init(&cmd_buffer->vk, pool, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->vk.destroy = cmd_buffer_destroy;
   cmd_buffer_init(cmd_buffer, device);

   *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

static void
job_destroy_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
          job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);

   v3dv_cl_destroy(&job->bcl);
   v3dv_cl_destroy(&job->rcl);
   v3dv_cl_destroy(&job->indirect);

   /* Since we don't ref BOs when we add them to the command buffer, don't
    * unref them here either. BOs will be freed when their corresponding API
    * objects are destroyed.
    */
   _mesa_set_destroy(job->bos, NULL);

   v3dv_bo_free(job->device, job->tile_alloc);
   v3dv_bo_free(job->device, job->tile_state);
}
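
/* Cloned jobs only own the shallow copies of the v3dv_bo structs linked in
 * their CL BO lists (see v3dv_job_clone_in_cmd_buffer), so that is all we
 * free here.
 */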
static void
job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL);

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }
}

static void
job_destroy_gpu_csd_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
   assert(job->cmd_buffer);

   v3dv_cl_destroy(&job->indirect);

   _mesa_set_destroy(job->bos, NULL);

   if (job->csd.shared_memory)
      v3dv_bo_free(job->device, job->csd.shared_memory);
}

static void
job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   assert(job->cmd_buffer);
   vk_free(&job->cmd_buffer->device->vk.alloc, job->cpu.event_wait.events);
}

void
v3dv_job_destroy(struct v3dv_job *job)
{
   assert(job);

   list_del(&job->list_link);

   /* Cloned jobs don't make deep copies of the original jobs, so they don't
    * own any of their resources. However, they do allocate clones of BO
    * structs, so make sure we free those.
    */
   if (!job->is_clone) {
      switch (job->type) {
      case V3DV_JOB_TYPE_GPU_CL:
      case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
         job_destroy_gpu_cl_resources(job);
         break;
      case V3DV_JOB_TYPE_GPU_CSD:
         job_destroy_gpu_csd_resources(job);
         break;
      case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
         job_destroy_cpu_wait_events_resources(job);
         break;
      default:
         break;
      }
   } else {
      /* Cloned jobs */
      if (job->type == V3DV_JOB_TYPE_GPU_CL)
         job_destroy_cloned_gpu_cl_resources(job);
   }

   vk_free(&job->device->vk.alloc, job);
}

void
v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                                uint64_t obj,
                                v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
{
   struct v3dv_cmd_buffer_private_obj *pobj =
      vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(*pobj), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!pobj) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return;
   }

   pobj->obj = obj;
   pobj->destroy_cb = destroy_cb;

   list_addtail(&pobj->list_link, &cmd_buffer->private_objs);
}

static void
cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_cmd_buffer_private_obj *pobj)
{
   assert(pobj && pobj->obj && pobj->destroy_cb);
   pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
                    pobj->obj,
                    &cmd_buffer->device->vk.alloc);
   list_del(&pobj->list_link);
   vk_free(&cmd_buffer->device->vk.alloc, pobj);
}
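
/* Frees everything the command buffer owns: recorded jobs, attachment state,
 * pending query state, the push constant BO and any private objects. Used
 * both on reset and on destruction.
 */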
static void
cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
{
   list_for_each_entry_safe(struct v3dv_job, job,
                            &cmd_buffer->jobs, list_link) {
      v3dv_job_destroy(job);
   }

   if (cmd_buffer->state.job)
      v3dv_job_destroy(cmd_buffer->state.job);

   if (cmd_buffer->state.attachments)
      vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);

   if (cmd_buffer->state.query.end.alloc_count > 0)
      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);

   if (cmd_buffer->push_constants_resource.bo)
      v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);

   list_for_each_entry_safe(struct v3dv_cmd_buffer_private_obj, pobj,
                            &cmd_buffer->private_objs, list_link) {
      cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
   }

   if (cmd_buffer->state.meta.attachments) {
      assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
   }
}

static void
cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
   struct v3dv_cmd_buffer *cmd_buffer =
      container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);

   cmd_buffer_free_resources(cmd_buffer);
   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}

static bool
cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
                             uint32_t subpass_idx)
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->pass);

   const struct v3dv_physical_device *physical_device =
      &cmd_buffer->device->instance->physicalDevice;

   if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      return false;

   if (!cmd_buffer->state.job)
      return false;

   if (cmd_buffer->state.job->always_flush)
      return false;

   if (!physical_device->options.merge_jobs)
      return false;

   /* Each render pass starts a new job */
   if (subpass_idx == 0)
      return false;

   /* Two subpasses can be merged in the same job if we can emit a single RCL
    * for them (since the RCL includes the END_OF_RENDERING command that
    * triggers the "render job finished" interrupt). We can do this so long
    * as both subpasses render against the same attachments.
    */
   assert(state->subpass_idx == subpass_idx - 1);
   struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
   struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];

   if (subpass->ds_attachment.attachment !=
       prev_subpass->ds_attachment.attachment)
      return false;

   if (subpass->color_count != prev_subpass->color_count)
      return false;

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      if (subpass->color_attachments[i].attachment !=
          prev_subpass->color_attachments[i].attachment) {
         return false;
      }
   }

   /* Don't merge if the subpasses have different view masks, since in that
    * case the framebuffer setup is different and we need to emit different
    * RCLs.
    */
   if (subpass->view_mask != prev_subpass->view_mask)
      return false;

   /* FIXME: Since some attachment formats can't be resolved using the TLB we
    * need to emit separate resolve jobs for them and that would not be
    * compatible with subpass merges. We could fix that by testing if any of
    * the attachments to resolve doesn't support TLB resolves.
    */
   if (prev_subpass->resolve_attachments || subpass->resolve_attachments ||
       prev_subpass->resolve_depth || prev_subpass->resolve_stencil ||
       subpass->resolve_depth || subpass->resolve_stencil) {
      return false;
   }

   return true;
}

/**
 * Computes and sets the job frame tiling information required to set up
 * frame binning and rendering.
 */
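/* Rough illustration of the computation below (actual tile dimensions come
 * from v3d_choose_tile_size and depend on the hardware configuration, so the
 * numbers here are only an example): for a 1920x1080 frame with a single
 * 32bpp render target and no MSAA, a 64x64 tile size would give 30x17 tiles.
 * Supertiles then grow from 1x1 tiles until the frame fits in fewer than 256
 * supertiles: 1x1 (510), 1x2 (270), 2x2 (135), so the frame would be covered
 * by 15x9 supertiles of 2x2 tiles each.
 */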
static struct v3dv_frame_tiling *
job_compute_frame_tiling(struct v3dv_job *job,
                         uint32_t width,
                         uint32_t height,
                         uint32_t layers,
                         uint32_t render_target_count,
                         uint8_t max_internal_bpp,
                         bool msaa)
{
   assert(job);
   struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   tiling->width = width;
   tiling->height = height;
   tiling->layers = layers;
   tiling->render_target_count = render_target_count;
   tiling->msaa = msaa;
   tiling->internal_bpp = max_internal_bpp;

   /* We can use double-buffer when MSAA is disabled to reduce tile store
    * overhead.
    *
    * FIXME: if we are emitting any tile loads the hardware will serialize
    * loads and stores across tiles, effectively disabling double buffering,
    * so we would want to check for that and not enable it in that case to
    * avoid reducing the tile size.
    */
   tiling->double_buffer =
      unlikely(V3D_DEBUG & V3D_DEBUG_DOUBLE_BUFFER) && !msaa;

   assert(!tiling->msaa || !tiling->double_buffer);

   v3d_choose_tile_size(render_target_count, max_internal_bpp,
                        tiling->msaa, tiling->double_buffer,
                        &tiling->tile_width, &tiling->tile_height);

   tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
   tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);

   /* Size up our supertiles until we get under the limit */
   const uint32_t max_supertiles = 256;
   tiling->supertile_width = 1;
   tiling->supertile_height = 1;
   for (;;) {
      tiling->frame_width_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_x, tiling->supertile_width);
      tiling->frame_height_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_y, tiling->supertile_height);
      const uint32_t num_supertiles = tiling->frame_width_in_supertiles *
                                      tiling->frame_height_in_supertiles;
      if (num_supertiles < max_supertiles)
         break;

      if (tiling->supertile_width < tiling->supertile_height)
         tiling->supertile_width++;
      else
         tiling->supertile_height++;
   }

   return tiling;
}

void
v3dv_job_start_frame(struct v3dv_job *job,
                     uint32_t width,
                     uint32_t height,
                     uint32_t layers,
                     bool allocate_tile_state_for_all_layers,
                     uint32_t render_target_count,
                     uint8_t max_internal_bpp,
                     bool msaa)
{
   assert(job);

   /* Start by computing frame tiling spec for this job */
   const struct v3dv_frame_tiling *tiling =
      job_compute_frame_tiling(job,
                               width, height, layers,
                               render_target_count, max_internal_bpp, msaa);

   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
   v3dv_return_if_oom(NULL, job);

   /* We only need to allocate tile state for all layers if the binner
    * writes primitives to layers other than the first. This can only be
    * done using layered rendering (writing gl_Layer from a geometry shader),
    * so for other cases of multilayered framebuffers (typically with
    * meta copy/clear operations) that won't use layered rendering, we only
    * need one layer worth of tile state for the binner.
    */
   if (!allocate_tile_state_for_all_layers)
      layers = 1;

   /* The PTB will request the tile alloc initial size per tile at start
    * of tile binning.
    */
   uint32_t tile_alloc_size = 64 * tiling->layers *
                              tiling->draw_tiles_x *
                              tiling->draw_tiles_y;

   /* The PTB allocates in aligned 4k chunks after the initial setup. */
   tile_alloc_size = align(tile_alloc_size, 4096);

   /* Include the first two chunk allocations that the PTB does so that
    * we definitely clear the OOM condition before triggering one (the HW
    * won't trigger OOM during the first allocations).
    */
   tile_alloc_size += 8192;

   /* For performance, allocate some extra initial memory after the PTB's
    * minimal allocations, so that we hopefully don't have to block the
    * GPU on the kernel handling an OOM signal.
    */
   tile_alloc_size += 512 * 1024;

   job->tile_alloc = v3dv_bo_alloc(job->device, tile_alloc_size,
                                   "tile_alloc", true);
   if (!job->tile_alloc) {
      v3dv_flag_oom(NULL, job);
      return;
   }

   v3dv_job_add_bo_unchecked(job, job->tile_alloc);

   const uint32_t tsda_per_tile_size = 256;
   const uint32_t tile_state_size = tiling->layers *
                                    tiling->draw_tiles_x *
                                    tiling->draw_tiles_y *
                                    tsda_per_tile_size;
   job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
   if (!job->tile_state) {
      v3dv_flag_oom(NULL, job);
      return;
   }

   v3dv_job_add_bo_unchecked(job, job->tile_state);

   v3dv_X(job->device, job_emit_binning_prolog)(job, tiling, layers);

   job->ez_state = V3D_EZ_UNDECIDED;
   job->first_ez_state = V3D_EZ_UNDECIDED;
}

static void
cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.job);

   /* Typically, we have a single job for each subpass and we emit the job's
    * RCL here when we are ending the frame for the subpass. However, some
    * commands such as vkCmdClearAttachments need to run in their own separate
    * job and they emit their own RCL even if they execute inside a subpass.
    * In this scenario, we don't want to emit the subpass RCL when we end the
    * frame for those jobs, so we only emit the subpass RCL if the job has not
    * recorded any RCL commands of its own.
    */
   if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0)
      v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer);

   v3dv_X(cmd_buffer->device, job_emit_binning_flush)(cmd_buffer->state.job);
}
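
/* Creates a job for work that the driver executes on the CPU at submit time
 * (for example, event waits or end-of-query processing).
 */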
struct v3dv_job *
v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
                               enum v3dv_job_type type,
                               struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
{
   struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, device, cmd_buffer, subpass_idx);
   return job;
}

static void
cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   if (state->query.end.used_count > 0) {
      const uint32_t query_count = state->query.end.used_count;
      for (uint32_t i = 0; i < query_count; i++) {
         assert(i < state->query.end.used_count);
         struct v3dv_job *job =
            v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                           V3DV_JOB_TYPE_CPU_END_QUERY,
                                           cmd_buffer, -1);
         v3dv_return_if_oom(cmd_buffer, NULL);

         job->cpu.query_end = state->query.end.states[i];
         list_addtail(&job->list_link, &cmd_buffer->jobs);
      }
   }
}

void
v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   if (!job)
      return;

   /* Always clear BCL state after a job has been finished if we don't have
    * a pending graphics barrier that could consume it (BCL barriers only
    * apply to graphics jobs). This can happen if the application recorded
    * a barrier involving geometry stages but none of the draw calls in the
    * job actually required a binning sync.
    */
   if (!(cmd_buffer->state.barrier.dst_mask & V3DV_BARRIER_GRAPHICS_BIT)) {
      cmd_buffer->state.barrier.bcl_buffer_access = 0;
      cmd_buffer->state.barrier.bcl_image_access = 0;
   }

   if (cmd_buffer->state.oom) {
      v3dv_job_destroy(job);
      cmd_buffer->state.job = NULL;
      return;
   }

   /* If we have created a job for a command buffer then we should have
    * recorded something into it: if the job was started in a render pass, it
    * should at least have the start frame commands, otherwise, it should have
    * a transfer command. The only exception is secondary command buffers
    * inside a render pass.
    */
   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
          v3dv_cl_offset(&job->bcl) > 0);

   /* When we merge multiple subpasses into the same job we must only emit one
    * RCL, so we do that here, when we decide that we need to finish the job.
    * Any rendering that happens outside a render pass is never merged, so
    * the RCL should have been emitted by the time we got here.
    */
   assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);

   /* If we are finishing a job inside a render pass we have two scenarios:
    *
    * 1. It is a regular CL, in which case we will submit the job to the GPU,
    *    so we may need to generate an RCL and add a binning flush.
    *
    * 2. It is a partial CL recorded in a secondary command buffer, in which
    *    case we are not submitting it directly to the GPU but rather branch
    *    to it from a primary command buffer.
    *    In this case we just want to end the BCL with a RETURN_FROM_SUB_LIST,
    *    and the RCL and binning flush will be emitted by the primary job that
    *    branches to this CL.
    */
   if (cmd_buffer->state.pass) {
      if (job->type == V3DV_JOB_TYPE_GPU_CL) {
         cmd_buffer_end_render_pass_frame(cmd_buffer);
      } else {
         assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
         v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
      }
   }

   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;

   /* If we have recorded any state with this last GPU job that requires us to
    * emit CPU jobs after the job is completed, add them now. The only
    * exception is secondary command buffers inside a render pass, because in
    * that case we want to defer this until we finish recording the primary
    * job into which we execute the secondary.
    */
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
       !cmd_buffer->state.pass) {
      cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
   }
}

bool
v3dv_job_type_is_gpu(struct v3dv_job *job)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
   case V3DV_JOB_TYPE_GPU_TFU:
   case V3DV_JOB_TYPE_GPU_CSD:
      return true;
   default:
      return false;
   }
}

static void
cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_job *job)
{
   assert(cmd_buffer && job);

   /* Serialization only affects GPU jobs, CPU jobs are always automatically
    * serialized.
    */
   if (!v3dv_job_type_is_gpu(job))
      return;

   uint8_t barrier_mask = cmd_buffer->state.barrier.dst_mask;
   if (barrier_mask == 0)
      return;

   uint8_t bit = 0;
   uint8_t *src_mask;
   if (job->type == V3DV_JOB_TYPE_GPU_CSD) {
      assert(!job->is_transfer);
      bit = V3DV_BARRIER_COMPUTE_BIT;
      src_mask = &cmd_buffer->state.barrier.src_mask_compute;
   } else if (job->is_transfer) {
      assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
             job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
             job->type == V3DV_JOB_TYPE_GPU_TFU);
      bit = V3DV_BARRIER_TRANSFER_BIT;
      src_mask = &cmd_buffer->state.barrier.src_mask_transfer;
   } else {
      assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
             job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
      bit = V3DV_BARRIER_GRAPHICS_BIT;
      src_mask = &cmd_buffer->state.barrier.src_mask_graphics;
   }

   if (barrier_mask & bit) {
      job->serialize = *src_mask;
      *src_mask = 0;
      cmd_buffer->state.barrier.dst_mask &= ~bit;
   }
}

void
v3dv_job_init(struct v3dv_job *job,
              enum v3dv_job_type type,
              struct v3dv_device *device,
              struct v3dv_cmd_buffer *cmd_buffer,
              int32_t subpass_idx)
{
   assert(job);

   /* Make sure we haven't made this new job current before calling here */
   assert(!cmd_buffer || cmd_buffer->state.job != job);

   job->type = type;

   job->device = device;
   job->cmd_buffer = cmd_buffer;

   list_inithead(&job->list_link);

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
       type == V3DV_JOB_TYPE_GPU_CSD) {
      job->bos =
         _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
      job->bo_count = 0;

      v3dv_cl_init(job, &job->indirect);

      if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH))
         job->always_flush = true;
   }

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
      v3dv_cl_init(job, &job->bcl);
      v3dv_cl_init(job, &job->rcl);
   }

   if (cmd_buffer) {
      /* Flag all state as dirty. Generally, we need to re-emit state for each
       * new job.
       *
       * FIXME: there may be some exceptions, in which case we could skip some
       * bits.
       */
      cmd_buffer->state.dirty = ~0;
      cmd_buffer->state.dirty_descriptor_stages = ~0;

      /* Honor inheritance of occlusion queries in secondaries if requested */
      if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
          cmd_buffer->state.inheritance.occlusion_query_enable) {
         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
      }

      /* Keep track of the first subpass that we are recording in this new job.
       * We will use this when we emit the RCL to decide how to emit our loads
       * and stores.
       */
      if (cmd_buffer->state.pass)
         job->first_subpass = subpass_idx;

      job->is_transfer = cmd_buffer->state.is_transfer;

      cmd_buffer_serialize_job_if_needed(cmd_buffer, job);

      job->perf = cmd_buffer->state.query.active_query.perf;
   }
}

struct v3dv_job *
v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
                          int32_t subpass_idx,
                          enum v3dv_job_type type)
{
   /* Don't create a new job if we can merge the current subpass into
    * the current job.
    */
   if (cmd_buffer->state.pass &&
       subpass_idx != -1 &&
       cmd_buffer_can_merge_subpass(cmd_buffer, subpass_idx)) {
      cmd_buffer->state.job->is_subpass_finish = false;
      return cmd_buffer->state.job;
   }

   /* Ensure we are not starting a new job without finishing a previous one */
   if (cmd_buffer->state.job != NULL)
      v3dv_cmd_buffer_finish_job(cmd_buffer);

   assert(cmd_buffer->state.job == NULL);
   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!job) {
      fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
   cmd_buffer->state.job = job;

   return job;
}
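
/* Backs both vkResetCommandBuffer and the implicit reset performed by
 * vkBeginCommandBuffer.
 */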
static VkResult
cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
                 VkCommandBufferResetFlags flags)
{
   vk_command_buffer_reset(&cmd_buffer->vk);
   if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
      struct v3dv_device *device = cmd_buffer->device;

      /* FIXME: For now we always free all resources as if
       * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
       */
      if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
         cmd_buffer_free_resources(cmd_buffer);

      cmd_buffer_init(cmd_buffer, device);
   }

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AllocateCommandBuffers(VkDevice _device,
                            const VkCommandBufferAllocateInfo *pAllocateInfo,
                            VkCommandBuffer *pCommandBuffers)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
      result = cmd_buffer_create(device, pool, pAllocateInfo->level,
                                 &pCommandBuffers[i]);
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      while (i--) {
         VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
         cmd_buffer_destroy(cmd_buffer);
      }
      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
         pCommandBuffers[i] = VK_NULL_HANDLE;
   }

   return result;
}

static void
cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
   const struct v3dv_subpass *subpass =
      &pass->subpasses[cmd_buffer->state.subpass_idx];

   if (!subpass->resolve_attachments)
      return;

   /* At this point we have already ended the current subpass and now we are
    * about to emit vkCmdResolveImage calls to get the resolves we can't
    * handle in the subpass RCL.
    *
    * vkCmdResolveImage is not supposed to be called inside a render pass so
    * before we call that we need to make sure our command buffer state
    * reflects that we are no longer in a subpass by finishing the current job
    * and resetting the framebuffer and render pass state temporarily and then
    * restoring it after we are done with the resolves.
    */
   if (cmd_buffer->state.job)
      v3dv_cmd_buffer_finish_job(cmd_buffer);
   struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
   struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
   uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass_idx = -1;

   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const uint32_t src_attachment_idx =
         subpass->color_attachments[i].attachment;
      if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      /* Skip if this attachment doesn't have a resolve or if it was already
       * implemented as a TLB resolve.
       */
      if (!cmd_buffer->state.attachments[src_attachment_idx].has_resolve ||
          cmd_buffer->state.attachments[src_attachment_idx].use_tlb_resolve) {
         continue;
      }

      const uint32_t dst_attachment_idx =
         subpass->resolve_attachments[i].attachment;
      assert(dst_attachment_idx != VK_ATTACHMENT_UNUSED);

      struct v3dv_image_view *src_iview =
         cmd_buffer->state.attachments[src_attachment_idx].image_view;
      struct v3dv_image_view *dst_iview =
         cmd_buffer->state.attachments[dst_attachment_idx].image_view;

      VkImageResolve2 region = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2,
         .srcSubresource = {
            VK_IMAGE_ASPECT_COLOR_BIT,
            src_iview->vk.base_mip_level,
            src_iview->vk.base_array_layer,
            src_iview->vk.layer_count,
         },
         .srcOffset = { 0, 0, 0 },
         .dstSubresource = {
            VK_IMAGE_ASPECT_COLOR_BIT,
            dst_iview->vk.base_mip_level,
            dst_iview->vk.base_array_layer,
            dst_iview->vk.layer_count,
         },
         .dstOffset = { 0, 0, 0 },
         .extent = src_iview->vk.image->extent,
      };

      struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
      struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
      VkResolveImageInfo2 resolve_info = {
         .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2,
         .srcImage = v3dv_image_to_handle(src_image),
         .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
         .dstImage = v3dv_image_to_handle(dst_image),
         .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
         .regionCount = 1,
         .pRegions = &region,
      };
      v3dv_CmdResolveImage2KHR(cmd_buffer_handle, &resolve_info);
   }

   cmd_buffer->state.framebuffer = restore_fb;
   cmd_buffer->state.pass = restore_pass;
   cmd_buffer->state.subpass_idx = restore_subpass_idx;
}

static VkResult
cmd_buffer_begin_render_pass_secondary(
   struct v3dv_cmd_buffer *cmd_buffer,
   const VkCommandBufferInheritanceInfo *inheritance_info)
{
   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
   assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
   assert(inheritance_info);

   cmd_buffer->state.pass =
      v3dv_render_pass_from_handle(inheritance_info->renderPass);
   assert(cmd_buffer->state.pass);

   cmd_buffer->state.framebuffer =
      v3dv_framebuffer_from_handle(inheritance_info->framebuffer);

   assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
   cmd_buffer->state.subpass_idx = inheritance_info->subpass;

   cmd_buffer->state.inheritance.occlusion_query_enable =
      inheritance_info->occlusionQueryEnable;

   /* Secondaries that execute inside a render pass won't start subpasses
    * so we want to create a job for them here.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass,
                                V3DV_JOB_TYPE_GPU_CL_SECONDARY);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* Secondary command buffers don't know about the render area, but our
    * scissor setup accounts for it, so let's make sure we make it large
    * enough that it doesn't actually constrain any rendering. This should
    * be fine, since the Vulkan spec states:
    *
    *    "The application must ensure (using scissor if necessary) that all
    *     rendering is contained within the render area."
    */
   const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   cmd_buffer->state.render_area.offset.x = 0;
   cmd_buffer->state.render_area.offset.y = 0;
   cmd_buffer->state.render_area.extent.width =
      framebuffer ? framebuffer->width : V3D_MAX_IMAGE_DIMENSION;
   cmd_buffer->state.render_area.extent.height =
      framebuffer ? framebuffer->height : V3D_MAX_IMAGE_DIMENSION;

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* If this is the first vkBeginCommandBuffer, we must initialize the
    * command buffer's state. Otherwise, we must reset its state. In both
    * cases we reset it.
    */
   VkResult result = cmd_buffer_reset(cmd_buffer, 0);
   if (result != VK_SUCCESS)
      return result;

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);

   cmd_buffer->usage_flags = pBeginInfo->flags;

   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         result =
            cmd_buffer_begin_render_pass_secondary(cmd_buffer,
                                                   pBeginInfo->pInheritanceInfo);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                        VkCommandBufferResetFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   return cmd_buffer_reset(cmd_buffer, flags);
}

static void
cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* Render areas and scissor/viewport are only relevant inside render passes,
    * otherwise we are dealing with transfer operations where these elements
    * don't apply.
    */
   assert(cmd_buffer->state.pass);
   const VkRect2D *rect = &cmd_buffer->state.render_area;

   /* We should only call this at the beginning of a subpass so we should
    * always have framebuffer information available.
    */
   assert(cmd_buffer->state.framebuffer);
   cmd_buffer->state.tile_aligned_render_area =
      v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect,
                                        cmd_buffer->state.framebuffer,
                                        cmd_buffer->state.pass,
                                        cmd_buffer->state.subpass_idx);

   if (!cmd_buffer->state.tile_aligned_render_area) {
      perf_debug("Render area for subpass %d of render pass %p doesn't "
                 "match render pass granularity.\n",
                 cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
   }
}
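
/* Decides, for each attachment used by the current subpass, whether it has a
 * resolve and whether that resolve can be done from the TLB or needs to fall
 * back to a vkCmdResolveImage call when the subpass ends (see
 * cmd_buffer_subpass_handle_pending_resolves).
 */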
static void
cmd_buffer_update_attachment_resolve_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* NOTE: This should be called after cmd_buffer_update_tile_alignment()
    * since it relies on up-to-date information about subpass tile alignment.
    */
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const uint32_t attachment_idx = subpass->color_attachments[i].attachment;
      if (attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      state->attachments[attachment_idx].has_resolve =
         subpass->resolve_attachments &&
         subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED;

      state->attachments[attachment_idx].use_tlb_resolve =
         state->attachments[attachment_idx].has_resolve &&
         state->tile_aligned_render_area &&
         pass->attachments[attachment_idx].try_tlb_resolve;
   }

   uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
      uint32_t ds_resolve_attachment_idx =
         subpass->ds_resolve_attachment.attachment;
      state->attachments[ds_attachment_idx].has_resolve =
         ds_resolve_attachment_idx != VK_ATTACHMENT_UNUSED;

      assert(!state->attachments[ds_attachment_idx].has_resolve ||
             (subpass->resolve_depth || subpass->resolve_stencil));

      state->attachments[ds_attachment_idx].use_tlb_resolve =
         state->attachments[ds_attachment_idx].has_resolve &&
         state->tile_aligned_render_area &&
         pass->attachments[ds_attachment_idx].try_tlb_resolve;
   }
}

static void
cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
                                            uint32_t attachment_idx,
                                            const VkClearColorValue *color)
{
   assert(attachment_idx < cmd_buffer->state.pass->attachment_count);

   const struct v3dv_render_pass_attachment *attachment =
      &cmd_buffer->state.pass->attachments[attachment_idx];

   uint32_t internal_type, internal_bpp;
   const struct v3dv_format *format =
      v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format);

   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format)
      (format->rt_type, &internal_type, &internal_bpp);

   uint32_t internal_size = 4 << internal_bpp;

   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   v3dv_X(cmd_buffer->device, get_hw_clear_color)
      (color, internal_type, internal_size, &attachment_state->clear_value.color[0]);

   attachment_state->vk_clear_value.color = *color;
}

static void
cmd_buffer_state_set_attachment_clear_depth_stencil(
   struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t attachment_idx,
   bool clear_depth, bool clear_stencil,
   const VkClearDepthStencilValue *ds)
{
   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   if (clear_depth)
      attachment_state->clear_value.z = ds->depth;

   if (clear_stencil)
      attachment_state->clear_value.s = ds->stencil;

   attachment_state->vk_clear_value.depthStencil = *ds;
}

static void
cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
                                  uint32_t count, const VkClearValue *values)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   /* There could be fewer clear values than attachments in the render pass,
    * in which case we only want to process as many as we have, or there could
    * be more, in which case we want to ignore those for which we don't have a
    * corresponding attachment.
    */
   count = MIN2(count, pass->attachment_count);
   for (uint32_t i = 0; i < count; i++) {
      const struct v3dv_render_pass_attachment *attachment =
         &pass->attachments[i];

      if (attachment->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
         continue;

      VkImageAspectFlags aspects = vk_format_aspects(attachment->desc.format);
      if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
         cmd_buffer_state_set_attachment_clear_color(cmd_buffer, i,
                                                     &values[i].color);
      } else if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
                            VK_IMAGE_ASPECT_STENCIL_BIT)) {
         cmd_buffer_state_set_attachment_clear_depth_stencil(
            cmd_buffer, i,
            aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
            aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
            &values[i].depthStencil);
      }
   }
}

static void
cmd_buffer_state_set_attachments(struct v3dv_cmd_buffer *cmd_buffer,
                                 const VkRenderPassBeginInfo *pRenderPassBegin)
{
   V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
   V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   const VkRenderPassAttachmentBeginInfo *attach_begin =
      vk_find_struct_const(pRenderPassBegin, RENDER_PASS_ATTACHMENT_BEGIN_INFO);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   for (uint32_t i = 0; i < pass->attachment_count; i++) {
      if (attach_begin && attach_begin->attachmentCount != 0) {
         state->attachments[i].image_view =
            v3dv_image_view_from_handle(attach_begin->pAttachments[i]);
      } else if (framebuffer) {
         state->attachments[i].image_view = framebuffer->attachments[i];
      } else {
         assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
         state->attachments[i].image_view = NULL;
      }
   }
}

static void
cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
                                             const VkRenderPassBeginInfo *pRenderPassBegin)
{
   cmd_buffer_state_set_clear_values(cmd_buffer,
                                     pRenderPassBegin->clearValueCount,
                                     pRenderPassBegin->pClearValues);

   cmd_buffer_state_set_attachments(cmd_buffer, pRenderPassBegin);
}

static void
cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   if (state->attachment_alloc_count < pass->attachment_count) {
      if (state->attachments) {
         assert(state->attachment_alloc_count > 0);
         vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
      }

      uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
                      pass->attachment_count;
      state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, size, 8,
                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!state->attachments) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return;
      }
      state->attachment_alloc_count = pass->attachment_count;
   }

   assert(state->attachment_alloc_count >= pass->attachment_count);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                         const VkRenderPassBeginInfo *pRenderPassBegin,
                         const VkSubpassBeginInfo *pSubpassBeginInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
   V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   state->pass = pass;
   state->framebuffer = framebuffer;

   cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
   v3dv_return_if_oom(cmd_buffer, NULL);

   cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);

   state->render_area = pRenderPassBegin->renderArea;

   /* If our render area is smaller than the current clip window we will have
    * to emit a new clip window to constrain it to the render area.
    */
   uint32_t min_render_x = state->render_area.offset.x;
   uint32_t min_render_y = state->render_area.offset.y;
   uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
   uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
   uint32_t min_clip_x = state->clip_window.offset.x;
   uint32_t min_clip_y = state->clip_window.offset.y;
   uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
   uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
   if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
       max_render_x < max_clip_x || max_render_y < max_clip_y) {
      state->dirty |= V3DV_CMD_DIRTY_SCISSOR;
   }

   /* Setup for first subpass */
   v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdNextSubpass2(VkCommandBuffer commandBuffer,
                     const VkSubpassBeginInfo *pSubpassBeginInfo,
                     const VkSubpassEndInfo *pSubpassEndInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx < state->pass->subpass_count - 1);

   /* Finish the previous subpass */
   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

   /* Start the next subpass */
   v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
}

static void
cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   assert(cmd_buffer->state.pass);
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];

   /* We only need to emit subpass clears as draw calls when the render
    * area is not aligned to tile boundaries or for GFXH-1461.
    */
   if (cmd_buffer->state.tile_aligned_render_area &&
       !subpass->do_depth_clear_with_draw &&
       !subpass->do_stencil_clear_with_draw) {
      return;
   }

   uint32_t att_count = 0;
   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */

   /* We only need to emit subpass clears as draw calls for color attachments
    * if the render area is not aligned to tile boundaries.
    */
   if (!cmd_buffer->state.tile_aligned_render_area) {
      for (uint32_t i = 0; i < subpass->color_count; i++) {
         const uint32_t att_idx = subpass->color_attachments[i].attachment;
         if (att_idx == VK_ATTACHMENT_UNUSED)
            continue;

         struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
            continue;

         if (state->subpass_idx != att->first_subpass)
            continue;

         atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
         atts[att_count].colorAttachment = i;
         atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
         att_count++;
      }
   }

   /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
   const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
   if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
      struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
      if (state->subpass_idx == att->first_subpass) {
         VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
         if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
             (cmd_buffer->state.tile_aligned_render_area &&
              !subpass->do_depth_clear_with_draw)) {
            aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
         }
         if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
             (cmd_buffer->state.tile_aligned_render_area &&
              !subpass->do_stencil_clear_with_draw)) {
            aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
         }
         if (aspects) {
            atts[att_count].aspectMask = aspects;
            atts[att_count].colorAttachment = 0; /* Ignored */
            atts[att_count].clearValue =
               state->attachments[ds_att_idx].vk_clear_value;
            att_count++;
         }
      }
   }

   if (att_count == 0)
      return;

   if (!cmd_buffer->state.tile_aligned_render_area) {
      perf_debug("Render area doesn't match render pass granularity, falling "
                 "back to vkCmdClearAttachments for "
                 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
   } else if (subpass->do_depth_clear_with_draw ||
              subpass->do_stencil_clear_with_draw) {
      perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
                 "falling back to vkCmdClearAttachments for "
                 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
   }

   /* From the Vulkan 1.0 spec:
    *
    *   "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
    *    render area will be cleared to a uniform value, which is specified
    *    when a render pass instance is begun."
    *
    * So the clear is only constrained by the render area and not by pipeline
    * state such as scissor or viewport; these are the semantics of
    * vkCmdClearAttachments as well.
    */
   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
   VkClearRect rect = {
      .rect = state->render_area,
      .baseArrayLayer = 0,
      .layerCount = 1,
   };
   v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
}
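
/* Starts (or merges into) the CL job that will record the given subpass.
 * When the job is a V3DV_JOB_TYPE_GPU_CL job and this is the first subpass
 * recorded into it, this also computes the frame tiling and emits the
 * binning prolog through v3dv_job_start_frame.
 */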
static struct v3dv_job *
cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
                              uint32_t subpass_idx,
                              enum v3dv_job_type type)
{
   assert(type == V3DV_JOB_TYPE_GPU_CL ||
          type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(subpass_idx < state->pass->subpass_count);

   /* Starting a new job can trigger a finish of the current one, so don't
    * change the command buffer state for the new job until we are done
    * creating the new job.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
   if (!job)
      return NULL;

   state->subpass_idx = subpass_idx;

   /* If we are starting a new job we need to set up binning. We only do this
    * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
    * jobs are not submitted to the GPU directly, and are instead meant to be
    * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
    */
   if (type == V3DV_JOB_TYPE_GPU_CL &&
       job->first_subpass == state->subpass_idx) {
      const struct v3dv_subpass *subpass =
         &state->pass->subpasses[state->subpass_idx];

      const struct v3dv_framebuffer *framebuffer = state->framebuffer;

      uint8_t internal_bpp;
      bool msaa;
      v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
         (framebuffer, state->attachments, subpass, &internal_bpp, &msaa);

      /* From the Vulkan spec:
       *
       *    "If the render pass uses multiview, then layers must be one and
       *     each attachment requires a number of layers that is greater than
       *     the maximum bit index set in the view mask in the subpasses in
       *     which it is used."
       *
       * So when multiview is enabled, we take the number of layers from the
       * last bit set in the view mask.
       */
      uint32_t layers = framebuffer->layers;
      if (subpass->view_mask != 0) {
         assert(framebuffer->layers == 1);
         layers = util_last_bit(subpass->view_mask);
      }

      v3dv_job_start_frame(job,
                           framebuffer->width,
                           framebuffer->height,
                           layers,
                           true,
                           subpass->color_count,
                           internal_bpp,
                           msaa);
   }

   return job;
}

struct v3dv_job *
v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
                              uint32_t subpass_idx)
{
   assert(cmd_buffer->state.pass);
   assert(subpass_idx < cmd_buffer->state.pass->subpass_count);

   struct v3dv_job *job =
      cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                    V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return NULL;

   /* Check if our render area is aligned to tile boundaries. We have to do
    * this in each subpass because the subset of attachments used can change
    * and with that the tile size selected by the hardware can change too.
    */
   cmd_buffer_update_tile_alignment(cmd_buffer);

   cmd_buffer_update_attachment_resolve_state(cmd_buffer);

   /* If we can't use TLB clears then we need to emit draw clears for any
    * LOAD_OP_CLEAR attachments in this subpass now. We might also need to
    * emit Depth/Stencil clears if we hit GFXH-1461.
    *
    * Secondary command buffers don't start subpasses (and may not even have
    * framebuffer state), so we only care about this in primaries. The only
    * exception could be a secondary running inside a subpass that needs to
    * record a meta operation (with its own render pass) that relies on
    * attachment load clears, but we don't have any instances of that right
    * now.
    */
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      cmd_buffer_emit_subpass_clears(cmd_buffer);

   return job;
}

struct v3dv_job *
v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
{
   assert(cmd_buffer->state.pass);
   assert(subpass_idx < cmd_buffer->state.pass->subpass_count);

   struct v3dv_job *job;
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                          V3DV_JOB_TYPE_GPU_CL);
   } else {
      assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
      job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
                                          V3DV_JOB_TYPE_GPU_CL_SECONDARY);
   }

   if (!job)
      return NULL;

   job->is_subpass_continue = true;

   return job;
}

void
v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* We can end up here without a job if the last command recorded into the
    * subpass already finished the job (for example a pipeline barrier). In
    * that case we never get to set the is_subpass_finish flag, but that is
    * not required for proper behavior.
    */
   struct v3dv_job *job = cmd_buffer->state.job;
   if (job)
      job->is_subpass_finish = true;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
                       const VkSubpassEndInfo *pSubpassEndInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* Finalize last subpass */
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx == state->pass->subpass_count - 1);
   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
   v3dv_cmd_buffer_finish_job(cmd_buffer);

   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

   /* We are no longer inside a render pass */
   state->framebuffer = NULL;
   state->pass = NULL;
   state->subpass_idx = -1;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->state.oom)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   /* Primaries should have ended any recording jobs by the time they hit
    * vkEndRenderPass (if we are inside a render pass). Commands outside
    * a render pass instance (for both primaries and secondaries) spawn
    * complete jobs too. So the only case where we can get here without
    * finishing a recording job is when we are recording a secondary
    * inside a render pass.
    */
   if (cmd_buffer->state.job) {
      assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
             cmd_buffer->state.pass);
      v3dv_cmd_buffer_finish_job(cmd_buffer);
   }

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;

   return VK_SUCCESS;
}
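
/* Makes shallow copies of the v3dv_bo structs in a CL's BO list so that the
 * cloned job gets list links of its own (see v3dv_job_clone_in_cmd_buffer
 * below).
 */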
static void
clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
              struct list_head *dst,
              struct list_head *src)
{
   assert(cmd_buffer);

   list_inithead(dst);
   list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
      struct v3dv_bo *clone_bo =
         vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8,
                  VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!clone_bo) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return;
      }

      *clone_bo = *bo;
      list_addtail(&clone_bo->list_link, dst);
   }
}

/* Clones a job for inclusion in the given command buffer. Note that this
 * doesn't make a deep copy, so the cloned job doesn't own any resources.
 * Useful when we need to have a job in more than one list, which happens
 * for jobs recorded in secondary command buffers when we want to execute
 * them in primaries.
 */
struct v3dv_job *
v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
                             struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc,
                                         sizeof(struct v3dv_job), 8,
                                         VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!clone_job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   /* Cloned jobs don't duplicate resources! */
   *clone_job = *job;
   clone_job->is_clone = true;
   clone_job->cmd_buffer = cmd_buffer;
   list_addtail(&clone_job->list_link, &cmd_buffer->jobs);

   /* We need to regenerate the BO lists so that they point to the BO list in
    * the cloned job. Otherwise functions like list_length() will loop
    * forever.
    */
   if (job->type == V3DV_JOB_TYPE_GPU_CL) {
      clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list);
      clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list);
      clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list,
                    &job->indirect.bo_list);
   }

   return clone_job;
}

void
v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst,
                                    struct v3dv_barrier_state *src)
{
   dst->dst_mask |= src->dst_mask;

   dst->src_mask_graphics |= src->src_mask_graphics;
   dst->src_mask_compute |= src->src_mask_compute;
   dst->src_mask_transfer |= src->src_mask_transfer;

   dst->bcl_buffer_access |= src->bcl_buffer_access;
   dst->bcl_image_access |= src->bcl_image_access;
}

static void
cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
                                uint32_t cmd_buffer_count,
                                const VkCommandBuffer *cmd_buffers)
{
   struct v3dv_barrier_state pending_barrier = { 0 };
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);

      assert(!(secondary->usage_flags &
               VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));

      /* Secondary command buffers that execute outside a render pass create
       * complete jobs with an RCL and tile setup, so we simply want to merge
       * their job list into the primary's.
However, because they may be
1680 * executed in multiple primaries at the same time and we only have a
1681 * single list_link in each job, we can't just add them to the primary's
1682 * job list and we instead have to clone them first.
1683 *
1684 * Alternatively, we could create an "execute secondary" CPU job that
1685 * when executed in a queue, would submit all the jobs in the referenced
1686 * secondary command buffer. However, this would raise some challenges
1687 * to make it work with the implementation of wait threads in the queue
1688 * which we use for event waits, for example.
1689 */
1690 list_for_each_entry(struct v3dv_job, secondary_job,
1691 &secondary->jobs, list_link) {
1692 /* These can only happen inside a render pass */
1693 assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1694 struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
1695 if (!job)
1696 return;
1697
1698 if (pending_barrier.dst_mask) {
1699 /* FIXME: do the same we do for primaries and only choose the
1700 * relevant src masks.
1701 */
1702 job->serialize = pending_barrier.src_mask_graphics |
1703 pending_barrier.src_mask_transfer |
1704 pending_barrier.src_mask_compute;
1705 if (pending_barrier.bcl_buffer_access ||
1706 pending_barrier.bcl_image_access) {
1707 job->needs_bcl_sync = true;
1708 }
1709 memset(&pending_barrier, 0, sizeof(pending_barrier));
1710 }
1711 }
1712
1713 /* If this secondary had any pending barrier state we will need that
1714 * barrier state consumed with whatever comes after it (first job in
1715 * the next secondary or the primary, if this was the last secondary).
1716 */
1717 assert(secondary->state.barrier.dst_mask ||
1718 (!secondary->state.barrier.bcl_buffer_access &&
1719 !secondary->state.barrier.bcl_image_access));
1720 pending_barrier = secondary->state.barrier;
1721 }
1722
1723 if (pending_barrier.dst_mask) {
1724 v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
1725 &pending_barrier);
1726 }
1727}
1728
1729VKAPI_ATTR void VKAPI_CALL
1730v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
1731 uint32_t commandBufferCount,
1732 const VkCommandBuffer *pCommandBuffers)
1733{
1734 V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);
1735
1736 if (primary->state.pass != NULL) {
1737 v3dv_X(primary->device, cmd_buffer_execute_inside_pass)
1738 (primary, commandBufferCount, pCommandBuffers);
1739 } else {
1740 cmd_buffer_execute_outside_pass(primary,
1741 commandBufferCount, pCommandBuffers);
1742 }
1743}
1744
1745/* This goes through the list of possible dynamic states in the pipeline and,
1746 * for those that are not configured as dynamic, copies relevant state into
1747 * the command buffer.
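 *
 * For example (mirroring the checks below): if a pipeline was created
 * without VK_DYNAMIC_STATE_LINE_WIDTH, its dynamic mask won't include
 * V3DV_DYNAMIC_LINE_WIDTH, so the pipeline's baked line width is copied
 * into cmd_buffer->state.dynamic and V3DV_CMD_DIRTY_LINE_WIDTH is flagged
 * if the value actually changed.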
1748 */ 1749static void 1750cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, 1751 const struct v3dv_dynamic_state *src) 1752{ 1753 struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic; 1754 uint32_t dynamic_mask = src->mask; 1755 uint32_t dirty = 0; 1756 1757 if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) { 1758 dest->viewport.count = src->viewport.count; 1759 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports, 1760 src->viewport.count * sizeof(VkViewport))) { 1761 typed_memcpy(dest->viewport.viewports, 1762 src->viewport.viewports, 1763 src->viewport.count); 1764 typed_memcpy(dest->viewport.scale, src->viewport.scale, 1765 src->viewport.count); 1766 typed_memcpy(dest->viewport.translate, src->viewport.translate, 1767 src->viewport.count); 1768 dirty |= V3DV_CMD_DIRTY_VIEWPORT; 1769 } 1770 } 1771 1772 if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) { 1773 dest->scissor.count = src->scissor.count; 1774 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors, 1775 src->scissor.count * sizeof(VkRect2D))) { 1776 typed_memcpy(dest->scissor.scissors, 1777 src->scissor.scissors, src->scissor.count); 1778 dirty |= V3DV_CMD_DIRTY_SCISSOR; 1779 } 1780 } 1781 1782 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) { 1783 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask, 1784 sizeof(src->stencil_compare_mask))) { 1785 dest->stencil_compare_mask = src->stencil_compare_mask; 1786 dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK; 1787 } 1788 } 1789 1790 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) { 1791 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask, 1792 sizeof(src->stencil_write_mask))) { 1793 dest->stencil_write_mask = src->stencil_write_mask; 1794 dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK; 1795 } 1796 } 1797 1798 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) { 1799 if (memcmp(&dest->stencil_reference, &src->stencil_reference, 1800 sizeof(src->stencil_reference))) { 1801 dest->stencil_reference = src->stencil_reference; 1802 dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE; 1803 } 1804 } 1805 1806 if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) { 1807 if (memcmp(dest->blend_constants, src->blend_constants, 1808 sizeof(src->blend_constants))) { 1809 memcpy(dest->blend_constants, src->blend_constants, 1810 sizeof(src->blend_constants)); 1811 dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; 1812 } 1813 } 1814 1815 if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) { 1816 if (memcmp(&dest->depth_bias, &src->depth_bias, 1817 sizeof(src->depth_bias))) { 1818 memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias)); 1819 dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS; 1820 } 1821 } 1822 1823 if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) { 1824 if (dest->line_width != src->line_width) { 1825 dest->line_width = src->line_width; 1826 dirty |= V3DV_CMD_DIRTY_LINE_WIDTH; 1827 } 1828 } 1829 1830 if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) { 1831 if (dest->color_write_enable != src->color_write_enable) { 1832 dest->color_write_enable = src->color_write_enable; 1833 dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; 1834 } 1835 } 1836 1837 cmd_buffer->state.dynamic.mask = dynamic_mask; 1838 cmd_buffer->state.dirty |= dirty; 1839} 1840 1841static void 1842bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer, 1843 struct v3dv_pipeline *pipeline) 1844{ 1845 assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT)); 1846 if (cmd_buffer->state.gfx.pipeline == pipeline) 1847 return; 1848 1849 
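   /* Binding a new graphics pipeline re-applies any state the pipeline did
    * not declare as dynamic and flags V3DV_CMD_DIRTY_PIPELINE so the next
    * draw re-emits shader and configuration state.
    */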
cmd_buffer->state.gfx.pipeline = pipeline;
1850
1851 cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state);
1852
1853 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
1854}
1855
1856static void
1857bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
1858 struct v3dv_pipeline *pipeline)
1859{
1860 assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
1861
1862 if (cmd_buffer->state.compute.pipeline == pipeline)
1863 return;
1864
1865 cmd_buffer->state.compute.pipeline = pipeline;
1866 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE;
1867}
1868
1869VKAPI_ATTR void VKAPI_CALL
1870v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
1871 VkPipelineBindPoint pipelineBindPoint,
1872 VkPipeline _pipeline)
1873{
1874 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1875 V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);
1876
1877 switch (pipelineBindPoint) {
1878 case VK_PIPELINE_BIND_POINT_COMPUTE:
1879 bind_compute_pipeline(cmd_buffer, pipeline);
1880 break;
1881
1882 case VK_PIPELINE_BIND_POINT_GRAPHICS:
1883 bind_graphics_pipeline(cmd_buffer, pipeline);
1884 break;
1885
1886 default:
1887 assert(!"invalid bind point");
1888 break;
1889 }
1890}
1891
1892/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
1893void
1894v3dv_viewport_compute_xform(const VkViewport *viewport,
1895 float scale[3],
1896 float translate[3])
1897{
1898 float x = viewport->x;
1899 float y = viewport->y;
1900 float half_width = 0.5f * viewport->width;
1901 float half_height = 0.5f * viewport->height;
1902 double n = viewport->minDepth;
1903 double f = viewport->maxDepth;
1904
1905 scale[0] = half_width;
1906 translate[0] = half_width + x;
1907 scale[1] = half_height;
1908 translate[1] = half_height + y;
1909
1910 scale[2] = (f - n);
1911 translate[2] = n;
1912
1913 /* It seems that if the scale is small enough the hardware won't clip
1914 * correctly so we work around this by choosing the smallest scale that
1915 * seems to work.
1916 *
1917 * This case is exercised by CTS:
1918 * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
1919 */
1920 const float min_abs_scale = 0.000009f;
1921 if (fabs(scale[2]) < min_abs_scale)
1922 scale[2] = min_abs_scale * (scale[2] < 0 ?
-1.0f : 1.0f);
1923}
1924
1925VKAPI_ATTR void VKAPI_CALL
1926v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
1927 uint32_t firstViewport,
1928 uint32_t viewportCount,
1929 const VkViewport *pViewports)
1930{
1931 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1932 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1933 const uint32_t total_count = firstViewport + viewportCount;
1934
1935 assert(firstViewport < MAX_VIEWPORTS);
1936 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
1937
1938 if (state->dynamic.viewport.count < total_count)
1939 state->dynamic.viewport.count = total_count;
1940
1941 if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
1942 pViewports, viewportCount * sizeof(*pViewports))) {
1943 return;
1944 }
1945
1946 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
1947 viewportCount * sizeof(*pViewports));
1948
1949 for (uint32_t i = firstViewport; i < total_count; i++) {
1950 v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
1951 state->dynamic.viewport.scale[i],
1952 state->dynamic.viewport.translate[i]);
1953 }
1954
1955 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
1956}
1957
1958VKAPI_ATTR void VKAPI_CALL
1959v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,
1960 uint32_t firstScissor,
1961 uint32_t scissorCount,
1962 const VkRect2D *pScissors)
1963{
1964 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1965 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1966
1967 assert(firstScissor < MAX_SCISSORS);
1968 assert(firstScissor + scissorCount >= 1 &&
1969 firstScissor + scissorCount <= MAX_SCISSORS);
1970
1971 if (state->dynamic.scissor.count < firstScissor + scissorCount)
1972 state->dynamic.scissor.count = firstScissor + scissorCount;
1973
1974 if (!memcmp(state->dynamic.scissor.scissors + firstScissor,
1975 pScissors, scissorCount * sizeof(*pScissors))) {
1976 return;
1977 }
1978
1979 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
1980 scissorCount * sizeof(*pScissors));
1981
1982 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR;
1983}
1984
1985static void
1986emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
1987{
1988 if (cmd_buffer->state.dynamic.viewport.count == 0)
1989 return;
1990
1991 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1992
1993 /* FIXME: right now we only support one viewport. viewports[0] would work
1994 * now, but would need to change if we allow multiple viewports.
1995 */
1996 float *vptranslate = dynamic->viewport.translate[0];
1997 float *vpscale = dynamic->viewport.scale[0];
1998
1999 float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
2000 float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
2001 float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
2002 float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
2003
2004 /* Quoting from v3dx_emit:
2005 * "Clip to the scissor if it's enabled, but still clip to the
2006 * drawable regardless since that controls where the binner
2007 * tries to put things.
2008 *
2009 * Additionally, always clip the rendering to the viewport,
2010 * since the hardware does guardband clipping, meaning
2011 * primitives would rasterize outside of the view volume."
2012 */
2013 uint32_t minx, miny, maxx, maxy;
2014
2015 /* From the Vulkan spec:
2016 *
2017 * "The application must ensure (using scissor if necessary) that all
2018 * rendering is contained within the render area. The render area must be
2019 * contained within the framebuffer dimensions."
2020 * 2021 * So it is the application's responsibility to ensure this. Still, we can 2022 * help by automatically restricting the scissor rect to the render area. 2023 */ 2024 minx = MAX2(vp_minx, cmd_buffer->state.render_area.offset.x); 2025 miny = MAX2(vp_miny, cmd_buffer->state.render_area.offset.y); 2026 maxx = MIN2(vp_maxx, cmd_buffer->state.render_area.offset.x + 2027 cmd_buffer->state.render_area.extent.width); 2028 maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y + 2029 cmd_buffer->state.render_area.extent.height); 2030 2031 minx = vp_minx; 2032 miny = vp_miny; 2033 maxx = vp_maxx; 2034 maxy = vp_maxy; 2035 2036 /* Clip against user provided scissor if needed. 2037 * 2038 * FIXME: right now we only allow one scissor. Below would need to be 2039 * updated if we support more 2040 */ 2041 if (dynamic->scissor.count > 0) { 2042 VkRect2D *scissor = &dynamic->scissor.scissors[0]; 2043 minx = MAX2(minx, scissor->offset.x); 2044 miny = MAX2(miny, scissor->offset.y); 2045 maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width); 2046 maxy = MIN2(maxy, scissor->offset.y + scissor->extent.height); 2047 } 2048 2049 /* If the scissor is outside the viewport area we end up with 2050 * min{x,y} > max{x,y}. 2051 */ 2052 if (minx > maxx) 2053 maxx = minx; 2054 if (miny > maxy) 2055 maxy = miny; 2056 2057 cmd_buffer->state.clip_window.offset.x = minx; 2058 cmd_buffer->state.clip_window.offset.y = miny; 2059 cmd_buffer->state.clip_window.extent.width = maxx - minx; 2060 cmd_buffer->state.clip_window.extent.height = maxy - miny; 2061 2062 v3dv_X(cmd_buffer->device, job_emit_clip_window) 2063 (cmd_buffer->state.job, &cmd_buffer->state.clip_window); 2064 2065 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR; 2066} 2067 2068static void 2069update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer, 2070 uint32_t dirty_uniform_state) 2071{ 2072 /* We need to update uniform streams if any piece of state that is passed 2073 * to the shader as a uniform may have changed. 2074 * 2075 * If only descriptor sets are dirty then we can safely ignore updates 2076 * for shader stages that don't access descriptors. 
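 *
 * For example, if a draw only bound a new descriptor set that is visible to
 * the fragment shader, dirty_descriptor_stages would contain just
 * VK_SHADER_STAGE_FRAGMENT_BIT and only the FS uniform stream is rewritten
 * below, leaving the vertex and geometry streams untouched.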
2077 */ 2078 2079 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 2080 assert(pipeline); 2081 2082 const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE; 2083 const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT; 2084 const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS; 2085 const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS; 2086 const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX; 2087 2088 /* VK_SHADER_STAGE_FRAGMENT_BIT */ 2089 const bool has_new_descriptors_fs = 2090 has_new_descriptors && 2091 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT); 2092 2093 const bool has_new_push_constants_fs = 2094 has_new_push_constants && 2095 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT); 2096 2097 const bool needs_fs_update = has_new_pipeline || 2098 has_new_view_index || 2099 has_new_push_constants_fs || 2100 has_new_descriptors_fs; 2101 2102 if (needs_fs_update) { 2103 struct v3dv_shader_variant *fs_variant = 2104 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]; 2105 2106 cmd_buffer->state.uniforms.fs = 2107 v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant); 2108 } 2109 2110 /* VK_SHADER_STAGE_GEOMETRY_BIT */ 2111 if (pipeline->has_gs) { 2112 const bool has_new_descriptors_gs = 2113 has_new_descriptors && 2114 (cmd_buffer->state.dirty_descriptor_stages & 2115 VK_SHADER_STAGE_GEOMETRY_BIT); 2116 2117 const bool has_new_push_constants_gs = 2118 has_new_push_constants && 2119 (cmd_buffer->state.dirty_push_constants_stages & 2120 VK_SHADER_STAGE_GEOMETRY_BIT); 2121 2122 const bool needs_gs_update = has_new_viewport || 2123 has_new_view_index || 2124 has_new_pipeline || 2125 has_new_push_constants_gs || 2126 has_new_descriptors_gs; 2127 2128 if (needs_gs_update) { 2129 struct v3dv_shader_variant *gs_variant = 2130 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]; 2131 2132 struct v3dv_shader_variant *gs_bin_variant = 2133 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]; 2134 2135 cmd_buffer->state.uniforms.gs = 2136 v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant); 2137 2138 cmd_buffer->state.uniforms.gs_bin = 2139 v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant); 2140 } 2141 } 2142 2143 /* VK_SHADER_STAGE_VERTEX_BIT */ 2144 const bool has_new_descriptors_vs = 2145 has_new_descriptors && 2146 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT); 2147 2148 const bool has_new_push_constants_vs = 2149 has_new_push_constants && 2150 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT); 2151 2152 const bool needs_vs_update = has_new_viewport || 2153 has_new_view_index || 2154 has_new_pipeline || 2155 has_new_push_constants_vs || 2156 has_new_descriptors_vs; 2157 2158 if (needs_vs_update) { 2159 struct v3dv_shader_variant *vs_variant = 2160 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]; 2161 2162 struct v3dv_shader_variant *vs_bin_variant = 2163 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]; 2164 2165 cmd_buffer->state.uniforms.vs = 2166 v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant); 2167 2168 cmd_buffer->state.uniforms.vs_bin = 2169 v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant); 2170 } 2171 2172 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX; 2173} 2174 2175/* This stores command buffer state that we might be about to stomp for 2176 * a meta 
operation. 2177 */ 2178void 2179v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer, 2180 bool push_descriptor_state) 2181{ 2182 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 2183 2184 if (state->subpass_idx != -1) { 2185 state->meta.subpass_idx = state->subpass_idx; 2186 state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer); 2187 state->meta.pass = v3dv_render_pass_to_handle(state->pass); 2188 2189 const uint32_t attachment_state_item_size = 2190 sizeof(struct v3dv_cmd_buffer_attachment_state); 2191 const uint32_t attachment_state_total_size = 2192 attachment_state_item_size * state->attachment_alloc_count; 2193 if (state->meta.attachment_alloc_count < state->attachment_alloc_count) { 2194 if (state->meta.attachment_alloc_count > 0) 2195 vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments); 2196 2197 state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, 2198 attachment_state_total_size, 8, 2199 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 2200 if (!state->meta.attachments) { 2201 v3dv_flag_oom(cmd_buffer, NULL); 2202 return; 2203 } 2204 state->meta.attachment_alloc_count = state->attachment_alloc_count; 2205 } 2206 state->meta.attachment_count = state->attachment_alloc_count; 2207 memcpy(state->meta.attachments, state->attachments, 2208 attachment_state_total_size); 2209 2210 state->meta.tile_aligned_render_area = state->tile_aligned_render_area; 2211 memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D)); 2212 } 2213 2214 /* We expect that meta operations are graphics-only, so we only take into 2215 * account the graphics pipeline, and the graphics state 2216 */ 2217 state->meta.gfx.pipeline = state->gfx.pipeline; 2218 memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic)); 2219 2220 struct v3dv_descriptor_state *gfx_descriptor_state = 2221 &cmd_buffer->state.gfx.descriptor_state; 2222 2223 if (push_descriptor_state) { 2224 if (gfx_descriptor_state->valid != 0) { 2225 memcpy(&state->meta.gfx.descriptor_state, gfx_descriptor_state, 2226 sizeof(state->gfx.descriptor_state)); 2227 } 2228 state->meta.has_descriptor_state = true; 2229 } else { 2230 state->meta.has_descriptor_state = false; 2231 } 2232 2233 if (cmd_buffer->state.push_constants_size > 0) { 2234 state->meta.push_constants_size = cmd_buffer->state.push_constants_size; 2235 memcpy(state->meta.push_constants, cmd_buffer->state.push_constants_data, 2236 cmd_buffer->state.push_constants_size); 2237 cmd_buffer->state.push_constants_size = 0; 2238 } 2239} 2240 2241/* This restores command buffer state after a meta operation 2242 */ 2243void 2244v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer, 2245 uint32_t dirty_dynamic_state, 2246 bool needs_subpass_resume) 2247{ 2248 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 2249 2250 if (state->meta.subpass_idx != -1) { 2251 state->pass = v3dv_render_pass_from_handle(state->meta.pass); 2252 state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer); 2253 2254 assert(state->meta.attachment_count <= state->attachment_alloc_count); 2255 const uint32_t attachment_state_item_size = 2256 sizeof(struct v3dv_cmd_buffer_attachment_state); 2257 const uint32_t attachment_state_total_size = 2258 attachment_state_item_size * state->meta.attachment_count; 2259 memcpy(state->attachments, state->meta.attachments, 2260 attachment_state_total_size); 2261 2262 state->tile_aligned_render_area = state->meta.tile_aligned_render_area; 2263 memcpy(&state->render_area, 
&state->meta.render_area, sizeof(VkRect2D));
2264
2265 /* If needs_subpass_resume is true it means that we emitted the meta
2266 * operation in its own job (possibly with an RT config that is
2267 * incompatible with the current subpass), so resuming subpass execution
2268 * after it requires that we create a new job with the subpass RT setup.
2269 */
2270 if (needs_subpass_resume)
2271 v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->meta.subpass_idx);
2272 } else {
2273 state->subpass_idx = -1;
2274 }
2275
2276 if (state->meta.gfx.pipeline != NULL) {
2277 struct v3dv_pipeline *pipeline = state->meta.gfx.pipeline;
2278 VkPipelineBindPoint pipeline_binding =
2279 v3dv_pipeline_get_binding_point(pipeline);
2280 v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer),
2281 pipeline_binding,
2282 v3dv_pipeline_to_handle(state->meta.gfx.pipeline));
2283 } else {
2284 state->gfx.pipeline = NULL;
2285 }
2286
2287 if (dirty_dynamic_state) {
2288 memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
2289 state->dirty |= dirty_dynamic_state;
2290 }
2291
2292 if (state->meta.has_descriptor_state) {
2293 if (state->meta.gfx.descriptor_state.valid != 0) {
2294 memcpy(&state->gfx.descriptor_state, &state->meta.gfx.descriptor_state,
2295 sizeof(state->gfx.descriptor_state));
2296 } else {
2297 state->gfx.descriptor_state.valid = 0;
2298 }
2299 }
2300
2301 /* We only need to restore push constant data if we had any data in the
2302 * original command buffer and the meta operation wrote new push constant
2303 * data.
2304 */
2305 if (state->meta.push_constants_size > 0 &&
2306 cmd_buffer->state.push_constants_size > 0) {
2307 memcpy(cmd_buffer->state.push_constants_data, state->meta.push_constants,
2308 state->meta.push_constants_size);
2309 }
2310 cmd_buffer->state.push_constants_size = state->meta.push_constants_size;
2311
2312 state->meta.gfx.pipeline = NULL;
2313 state->meta.framebuffer = VK_NULL_HANDLE;
2314 state->meta.pass = VK_NULL_HANDLE;
2315 state->meta.subpass_idx = -1;
2316 state->meta.has_descriptor_state = false;
2317 state->meta.push_constants_size = 0;
2318}
2319
2320static struct v3dv_job *
2321cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
2322{
2323 struct v3dv_job *job = cmd_buffer->state.job;
2324 assert(job);
2325
2326 /* If the job has been flagged with 'always_flush' and it has already
2327 * recorded any draw calls then we need to start a new job for it.
2328 */
2329 if (job->always_flush && job->draw_count > 0) {
2330 assert(cmd_buffer->state.pass);
2331 /* First, flag the current job as not being the last in the
2332 * current subpass
2333 */
2334 job->is_subpass_finish = false;
2335
2336 /* Now start a new job in the same subpass and flag it as continuing
2337 * the current subpass.
2338 */
2339 job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2340 cmd_buffer->state.subpass_idx);
2341 assert(job->draw_count == 0);
2342
2343 /* Inherit the 'always flush' behavior */
2344 job->always_flush = true;
2345 }
2346
2347 assert(job->draw_count == 0 || !job->always_flush);
2348 return job;
2349}
2350
2351/**
2352 * The Vulkan spec states:
2353 *
2354 * "It is legal for a subpass to use no color or depth/stencil
2355 * attachments (...) This kind of subpass can use shader side effects such
2356 * as image stores and atomics to produce an output.
In this case, the
2357 * subpass continues to use the width, height, and layers of the framebuffer
2358 * to define the dimensions of the rendering area, and the
2359 * rasterizationSamples from each pipeline’s
2360 * VkPipelineMultisampleStateCreateInfo to define the number of samples used
2361 * in rasterization."
2362 *
2363 * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we
2364 * emit when we start a new frame at the beginning of a subpass. At that point,
2365 * if the framebuffer doesn't have any attachments we won't enable MSAA and
2366 * the job won't be valid in the scenario described by the spec.
2367 *
2368 * This function is intended to be called before a draw call and will test if
2369 * we are in that scenario, in which case, it will restart the current job
2370 * with MSAA enabled.
2371 */
2372static void
2373cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
2374{
2375 assert(cmd_buffer->state.job);
2376
2377 /* We don't support variableMultisampleRate so we know that all pipelines
2378 * bound in the same subpass must have a matching number of samples, so we
2379 * can do this check only on the first draw call.
2380 */
2381 if (cmd_buffer->state.job->draw_count > 0)
2382 return;
2383
2384 /* We only need to restart the frame if the pipeline requires MSAA but
2385 * our frame tiling didn't enable it.
2386 */
2387 if (!cmd_buffer->state.gfx.pipeline->msaa ||
2388 cmd_buffer->state.job->frame_tiling.msaa) {
2389 return;
2390 }
2391
2392 /* FIXME: Secondary command buffers don't start frames. Instead, they are
2393 * recorded into primary jobs that start them. For secondaries, we should
2394 * still handle this scenario, but we should do that when we record them
2395 * into primaries by testing if any of the secondaries has multisampled
2396 * draw calls in them, and then using that info to decide if we need to
2397 * restart the primary job into which they are being recorded.
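 *
 * A rough sketch of that idea (hypothetical field and helper, not
 * implemented here):
 *
 *    bool needs_msaa = false;
 *    for (uint32_t i = 0; i < secondary_count; i++)
 *       needs_msaa |= secondaries[i]->state.has_msaa_draws;
 *    if (needs_msaa && !primary_job->frame_tiling.msaa)
 *       restart_primary_job_with_msaa(primary);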
2398 */ 2399 if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY) 2400 return; 2401 2402 /* Drop the current job and restart it with MSAA enabled */ 2403 struct v3dv_job *old_job = cmd_buffer->state.job; 2404 cmd_buffer->state.job = NULL; 2405 2406 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc, 2407 sizeof(struct v3dv_job), 8, 2408 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 2409 if (!job) { 2410 v3dv_flag_oom(cmd_buffer, NULL); 2411 return; 2412 } 2413 2414 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer, 2415 cmd_buffer->state.subpass_idx); 2416 cmd_buffer->state.job = job; 2417 2418 v3dv_job_start_frame(job, 2419 old_job->frame_tiling.width, 2420 old_job->frame_tiling.height, 2421 old_job->frame_tiling.layers, 2422 true, 2423 old_job->frame_tiling.render_target_count, 2424 old_job->frame_tiling.internal_bpp, 2425 true /* msaa */); 2426 2427 v3dv_job_destroy(old_job); 2428} 2429 2430static bool 2431cmd_buffer_binning_sync_required(struct v3dv_cmd_buffer *cmd_buffer, 2432 struct v3dv_pipeline *pipeline, 2433 bool indexed, bool indirect) 2434{ 2435 const struct v3dv_descriptor_maps *vs_bin_maps = 2436 pipeline->shared_data->maps[BROADCOM_SHADER_VERTEX_BIN]; 2437 2438 const struct v3dv_descriptor_maps *gs_bin_maps = 2439 pipeline->shared_data->maps[BROADCOM_SHADER_GEOMETRY_BIN]; 2440 2441 VkAccessFlags buffer_access = 2442 cmd_buffer->state.barrier.bcl_buffer_access; 2443 if (buffer_access) { 2444 /* Index buffer read */ 2445 if (indexed && (buffer_access & VK_ACCESS_INDEX_READ_BIT)) 2446 return true; 2447 2448 /* Indirect buffer read */ 2449 if (indirect && (buffer_access & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)) 2450 return true; 2451 2452 /* Attribute read */ 2453 if (buffer_access & VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT) { 2454 const struct v3d_vs_prog_data *prog_data = 2455 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs; 2456 2457 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { 2458 if (prog_data->vattr_sizes[i] > 0) 2459 return true; 2460 } 2461 } 2462 2463 /* UBO / SSBO read */ 2464 if (buffer_access & (VK_ACCESS_UNIFORM_READ_BIT | 2465 VK_ACCESS_SHADER_READ_BIT | 2466 VK_ACCESS_MEMORY_READ_BIT)) { 2467 2468 if (vs_bin_maps->ubo_map.num_desc > 0 || 2469 vs_bin_maps->ssbo_map.num_desc > 0) { 2470 return true; 2471 } 2472 2473 if (gs_bin_maps && (gs_bin_maps->ubo_map.num_desc > 0 || 2474 gs_bin_maps->ssbo_map.num_desc > 0)) { 2475 return true; 2476 } 2477 } 2478 2479 /* SSBO write */ 2480 if (buffer_access & (VK_ACCESS_SHADER_WRITE_BIT | 2481 VK_ACCESS_MEMORY_WRITE_BIT)) { 2482 if (vs_bin_maps->ssbo_map.num_desc > 0) 2483 return true; 2484 2485 if (gs_bin_maps && gs_bin_maps->ssbo_map.num_desc > 0) 2486 return true; 2487 } 2488 } 2489 2490 VkAccessFlags image_access = 2491 cmd_buffer->state.barrier.bcl_image_access; 2492 if (image_access) { 2493 /* Image load / store */ 2494 if (image_access & (VK_ACCESS_SHADER_READ_BIT | 2495 VK_ACCESS_SHADER_WRITE_BIT | 2496 VK_ACCESS_MEMORY_READ_BIT | 2497 VK_ACCESS_MEMORY_WRITE_BIT)) { 2498 if (vs_bin_maps->texture_map.num_desc > 0 || 2499 vs_bin_maps->sampler_map.num_desc > 0) { 2500 return true; 2501 } 2502 } 2503 } 2504 2505 return false; 2506} 2507 2508static void 2509consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_job *job) 2510{ 2511 job->needs_bcl_sync = true; 2512 cmd_buffer->state.barrier.bcl_buffer_access = 0; 2513 cmd_buffer->state.barrier.bcl_image_access = 0; 2514} 2515 2516void 2517v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer 
*cmd_buffer,
2518 bool indexed, bool indirect)
2519{
2520 assert(cmd_buffer->state.gfx.pipeline);
2521 assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
2522
2523 /* If we emitted a pipeline barrier right before this draw we won't have
2524 * an active job. In that case, create a new job continuing the current
2525 * subpass.
2526 */
2527 if (!cmd_buffer->state.job) {
2528 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2529 cmd_buffer->state.subpass_idx);
2530 }
2531
2532 /* Restart single sample job for MSAA pipeline if needed */
2533 cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer);
2534
2535 /* If the job is configured to flush on every draw call we need to create
2536 * a new job now.
2537 */
2538 struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
2539 job->draw_count++;
2540
2541 /* Track VK_KHR_buffer_device_address usage in the job */
2542 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2543 job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
2544
2545 /* If this job is serialized (has consumed a barrier) then check if we need
2546 * to sync at the binning stage by testing if the binning shaders involved
2547 * with the draw call require access to external resources.
2548 */
2549 if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access ||
2550 cmd_buffer->state.barrier.bcl_image_access)) {
2551 assert(!job->needs_bcl_sync);
2552 if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline,
2553 indexed, indirect)) {
2554 consume_bcl_sync(cmd_buffer, job);
2555 }
2556 }
2557
2558 /* GL shader state binds shaders, uniform and vertex attribute state. The
2559 * compiler injects uniforms to handle some descriptor types (such as
2560 * textures), so we need to regenerate it when descriptor state changes.
2561 *
2562 * We also need to emit new shader state if we have a dirty viewport since
2563 * that will require that we emit new uniform state for QUNIFORM_VIEWPORT_*.
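 *
 * As an illustration of the checks below: any of V3DV_CMD_DIRTY_PIPELINE,
 * V3DV_CMD_DIRTY_PUSH_CONSTANTS, V3DV_CMD_DIRTY_DESCRIPTOR_SETS,
 * V3DV_CMD_DIRTY_VIEWPORT or V3DV_CMD_DIRTY_VIEW_INDEX funnels into
 * dirty_uniform_state, which drives update_gfx_uniform_state(), and either
 * that or a dirty vertex buffer causes the GL shader state record to be
 * re-emitted.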
2564 */ 2565 uint32_t *dirty = &cmd_buffer->state.dirty; 2566 2567 const uint32_t dirty_uniform_state = 2568 *dirty & (V3DV_CMD_DIRTY_PIPELINE | 2569 V3DV_CMD_DIRTY_PUSH_CONSTANTS | 2570 V3DV_CMD_DIRTY_DESCRIPTOR_SETS | 2571 V3DV_CMD_DIRTY_VIEWPORT | 2572 V3DV_CMD_DIRTY_VIEW_INDEX); 2573 2574 if (dirty_uniform_state) 2575 update_gfx_uniform_state(cmd_buffer, dirty_uniform_state); 2576 2577 struct v3dv_device *device = cmd_buffer->device; 2578 2579 if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER)) 2580 v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer); 2581 2582 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) { 2583 v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer); 2584 v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer); 2585 } 2586 2587 if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) { 2588 emit_scissor(cmd_buffer); 2589 } 2590 2591 if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) { 2592 v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer); 2593 } 2594 2595 if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER) 2596 v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer); 2597 2598 const uint32_t dynamic_stencil_dirty_flags = 2599 V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | 2600 V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | 2601 V3DV_CMD_DIRTY_STENCIL_REFERENCE; 2602 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags)) 2603 v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer); 2604 2605 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) 2606 v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); 2607 2608 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) 2609 v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); 2610 2611 if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY) 2612 v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer); 2613 2614 if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH) 2615 v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer); 2616 2617 if (*dirty & V3DV_CMD_DIRTY_PIPELINE) 2618 v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer); 2619 2620 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE)) 2621 v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer); 2622 2623 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE; 2624} 2625 2626static inline void 2627cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer, 2628 uint32_t view_index) 2629{ 2630 cmd_buffer->state.view_index = view_index; 2631 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX; 2632} 2633 2634static void 2635cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer, 2636 struct v3dv_draw_info *info) 2637{ 2638 2639 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 2640 if (likely(!pass->multiview_enabled)) { 2641 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false); 2642 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); 2643 return; 2644 } 2645 2646 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; 2647 while (view_mask) { 2648 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); 2649 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false); 2650 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info); 2651 } 2652} 2653 2654VKAPI_ATTR void VKAPI_CALL 2655v3dv_CmdDraw(VkCommandBuffer commandBuffer, 2656 uint32_t vertexCount, 2657 uint32_t instanceCount, 2658 uint32_t firstVertex, 2659 uint32_t firstInstance) 2660{ 2661 if (vertexCount == 0 || instanceCount == 0) 2662 return; 2663 2664 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, 
commandBuffer); 2665 struct v3dv_draw_info info = {}; 2666 info.vertex_count = vertexCount; 2667 info.instance_count = instanceCount; 2668 info.first_instance = firstInstance; 2669 info.first_vertex = firstVertex; 2670 2671 cmd_buffer_draw(cmd_buffer, &info); 2672} 2673 2674VKAPI_ATTR void VKAPI_CALL 2675v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer, 2676 uint32_t indexCount, 2677 uint32_t instanceCount, 2678 uint32_t firstIndex, 2679 int32_t vertexOffset, 2680 uint32_t firstInstance) 2681{ 2682 if (indexCount == 0 || instanceCount == 0) 2683 return; 2684 2685 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2686 2687 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 2688 if (likely(!pass->multiview_enabled)) { 2689 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false); 2690 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) 2691 (cmd_buffer, indexCount, instanceCount, 2692 firstIndex, vertexOffset, firstInstance); 2693 return; 2694 } 2695 2696 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; 2697 while (view_mask) { 2698 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); 2699 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false); 2700 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed) 2701 (cmd_buffer, indexCount, instanceCount, 2702 firstIndex, vertexOffset, firstInstance); 2703 } 2704} 2705 2706VKAPI_ATTR void VKAPI_CALL 2707v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer, 2708 VkBuffer _buffer, 2709 VkDeviceSize offset, 2710 uint32_t drawCount, 2711 uint32_t stride) 2712{ 2713 /* drawCount is the number of draws to execute, and can be zero. */ 2714 if (drawCount == 0) 2715 return; 2716 2717 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2718 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); 2719 2720 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 2721 if (likely(!pass->multiview_enabled)) { 2722 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true); 2723 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) 2724 (cmd_buffer, buffer, offset, drawCount, stride); 2725 return; 2726 } 2727 2728 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; 2729 while (view_mask) { 2730 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); 2731 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true); 2732 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect) 2733 (cmd_buffer, buffer, offset, drawCount, stride); 2734 } 2735} 2736 2737VKAPI_ATTR void VKAPI_CALL 2738v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, 2739 VkBuffer _buffer, 2740 VkDeviceSize offset, 2741 uint32_t drawCount, 2742 uint32_t stride) 2743{ 2744 /* drawCount is the number of draws to execute, and can be zero. 
*/ 2745 if (drawCount == 0) 2746 return; 2747 2748 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2749 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); 2750 2751 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 2752 if (likely(!pass->multiview_enabled)) { 2753 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true); 2754 v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) 2755 (cmd_buffer, buffer, offset, drawCount, stride); 2756 return; 2757 } 2758 2759 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask; 2760 while (view_mask) { 2761 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask)); 2762 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true); 2763 v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect) 2764 (cmd_buffer, buffer, offset, drawCount, stride); 2765 } 2766} 2767 2768VKAPI_ATTR void VKAPI_CALL 2769v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, 2770 VkPipelineStageFlags srcStageMask, 2771 VkPipelineStageFlags dstStageMask, 2772 VkDependencyFlags dependencyFlags, 2773 uint32_t memoryBarrierCount, 2774 const VkMemoryBarrier *pMemoryBarriers, 2775 uint32_t bufferBarrierCount, 2776 const VkBufferMemoryBarrier *pBufferBarriers, 2777 uint32_t imageBarrierCount, 2778 const VkImageMemoryBarrier *pImageBarriers) 2779{ 2780 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2781 2782 /* We can safely skip barriers for image layout transitions from UNDEFINED 2783 * layout. 2784 */ 2785 if (imageBarrierCount > 0) { 2786 bool all_undefined = true; 2787 for (int i = 0; all_undefined && i < imageBarrierCount; i++) { 2788 if (pImageBarriers[i].oldLayout != VK_IMAGE_LAYOUT_UNDEFINED) 2789 all_undefined = false; 2790 } 2791 if (all_undefined) 2792 imageBarrierCount = 0; 2793 } 2794 2795 if (memoryBarrierCount + bufferBarrierCount + imageBarrierCount == 0) 2796 return; 2797 2798 /* We only care about barriers between GPU jobs */ 2799 if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT || 2800 dstStageMask == VK_PIPELINE_STAGE_HOST_BIT) { 2801 return; 2802 } 2803 2804 /* If we have a recording job, finish it here */ 2805 struct v3dv_job *job = cmd_buffer->state.job; 2806 if (job) 2807 v3dv_cmd_buffer_finish_job(cmd_buffer); 2808 2809 /* Track the source of the barrier */ 2810 uint8_t src_mask = 0; 2811 if (srcStageMask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | 2812 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 2813 src_mask |= V3DV_BARRIER_COMPUTE_BIT; 2814 } 2815 2816 if (srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | 2817 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 2818 src_mask |= V3DV_BARRIER_TRANSFER_BIT; 2819 } 2820 2821 if (srcStageMask & (~(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | 2822 VK_PIPELINE_STAGE_TRANSFER_BIT))) { 2823 src_mask |= V3DV_BARRIER_GRAPHICS_BIT; 2824 } 2825 2826 /* Track consumer of the barrier */ 2827 if (dstStageMask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | 2828 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 2829 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT; 2830 cmd_buffer->state.barrier.src_mask_compute |= src_mask; 2831 } 2832 2833 if (dstStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | 2834 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 2835 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_TRANSFER_BIT; 2836 cmd_buffer->state.barrier.src_mask_transfer |= src_mask; 2837 } 2838 2839 if (dstStageMask & (~(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | 2840 VK_PIPELINE_STAGE_TRANSFER_BIT))) { 2841 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT; 2842 
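      /* Example: a barrier from VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
       * to VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT lands in this branch: the
       * graphics bit is recorded as both destination and source, and since
       * the fragment stage reads nothing at binning time, no BCL access
       * flags are recorded below.
       */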
cmd_buffer->state.barrier.src_mask_graphics |= src_mask; 2843 2844 if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | 2845 VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | 2846 VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | 2847 VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | 2848 VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | 2849 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | 2850 VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | 2851 VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { 2852 for (int i = 0; i < memoryBarrierCount; i++) { 2853 cmd_buffer->state.barrier.bcl_buffer_access |= 2854 pMemoryBarriers[i].dstAccessMask; 2855 cmd_buffer->state.barrier.bcl_image_access |= 2856 pMemoryBarriers[i].dstAccessMask; 2857 } 2858 for (int i = 0; i < bufferBarrierCount; i++) { 2859 cmd_buffer->state.barrier.bcl_buffer_access |= 2860 pBufferBarriers[i].dstAccessMask; 2861 } 2862 for (int i = 0; i < imageBarrierCount; i++) { 2863 if (pImageBarriers[i].oldLayout != VK_IMAGE_LAYOUT_UNDEFINED) { 2864 cmd_buffer->state.barrier.bcl_image_access |= 2865 pImageBarriers[i].dstAccessMask; 2866 } 2867 } 2868 } 2869 } 2870} 2871 2872VKAPI_ATTR void VKAPI_CALL 2873v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, 2874 uint32_t firstBinding, 2875 uint32_t bindingCount, 2876 const VkBuffer *pBuffers, 2877 const VkDeviceSize *pOffsets) 2878{ 2879 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2880 struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; 2881 2882 /* We have to defer setting up vertex buffer since we need the buffer 2883 * stride from the pipeline. 2884 */ 2885 2886 assert(firstBinding + bindingCount <= MAX_VBS); 2887 bool vb_state_changed = false; 2888 for (uint32_t i = 0; i < bindingCount; i++) { 2889 if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) { 2890 vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]); 2891 vb_state_changed = true; 2892 } 2893 if (vb[firstBinding + i].offset != pOffsets[i]) { 2894 vb[firstBinding + i].offset = pOffsets[i]; 2895 vb_state_changed = true; 2896 } 2897 } 2898 2899 if (vb_state_changed) 2900 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER; 2901} 2902 2903static uint32_t 2904get_index_size(VkIndexType index_type) 2905{ 2906 switch (index_type) { 2907 case VK_INDEX_TYPE_UINT8_EXT: 2908 return 1; 2909 break; 2910 case VK_INDEX_TYPE_UINT16: 2911 return 2; 2912 break; 2913 case VK_INDEX_TYPE_UINT32: 2914 return 4; 2915 break; 2916 default: 2917 unreachable("Unsupported index type"); 2918 } 2919} 2920 2921VKAPI_ATTR void VKAPI_CALL 2922v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, 2923 VkBuffer buffer, 2924 VkDeviceSize offset, 2925 VkIndexType indexType) 2926{ 2927 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2928 2929 const uint32_t index_size = get_index_size(indexType); 2930 if (buffer == cmd_buffer->state.index_buffer.buffer && 2931 offset == cmd_buffer->state.index_buffer.offset && 2932 index_size == cmd_buffer->state.index_buffer.index_size) { 2933 return; 2934 } 2935 2936 cmd_buffer->state.index_buffer.buffer = buffer; 2937 cmd_buffer->state.index_buffer.offset = offset; 2938 cmd_buffer->state.index_buffer.index_size = index_size; 2939 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER; 2940} 2941 2942VKAPI_ATTR void VKAPI_CALL 2943v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, 2944 VkStencilFaceFlags faceMask, 2945 uint32_t compareMask) 2946{ 2947 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 2948 2949 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) 2950 
cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff;
2951 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2952 cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff;
2953
2954 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
2955}
2956
2957VKAPI_ATTR void VKAPI_CALL
2958v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2959 VkStencilFaceFlags faceMask,
2960 uint32_t writeMask)
2961{
2962 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2963
2964 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2965 cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff;
2966 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2967 cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff;
2968
2969 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
2970}
2971
2972VKAPI_ATTR void VKAPI_CALL
2973v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2974 VkStencilFaceFlags faceMask,
2975 uint32_t reference)
2976{
2977 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2978
2979 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2980 cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff;
2981 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2982 cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff;
2983
2984 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
2985}
2986
2987VKAPI_ATTR void VKAPI_CALL
2988v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2989 float depthBiasConstantFactor,
2990 float depthBiasClamp,
2991 float depthBiasSlopeFactor)
2992{
2993 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2994
2995 cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor;
2996 cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp;
2997 cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor;
2998 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
2999}
3000
3001VKAPI_ATTR void VKAPI_CALL
3002v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
3003 float minDepthBounds,
3004 float maxDepthBounds)
3005{
3006 /* We do not support depth bounds testing so we just ignore this. We are
3007 * already asserting that pipelines don't enable the feature anyway.
3008 */
3009}
3010
3011VKAPI_ATTR void VKAPI_CALL
3012v3dv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
3013 uint32_t lineStippleFactor,
3014 uint16_t lineStipplePattern)
3015{
3016 /* We do not support stippled line rasterization so we just ignore this. */
3017}
3018
3019VKAPI_ATTR void VKAPI_CALL
3020v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
3021 float lineWidth)
3022{
3023 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3024
3025 cmd_buffer->state.dynamic.line_width = lineWidth;
3026 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
3027}
3028
3029VKAPI_ATTR void VKAPI_CALL
3030v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
3031 VkPipelineBindPoint pipelineBindPoint,
3032 VkPipelineLayout _layout,
3033 uint32_t firstSet,
3034 uint32_t descriptorSetCount,
3035 const VkDescriptorSet *pDescriptorSets,
3036 uint32_t dynamicOffsetCount,
3037 const uint32_t *pDynamicOffsets)
3038{
3039 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3040 V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, _layout);
3041
3042 uint32_t dyn_index = 0;
3043
3044 assert(firstSet + descriptorSetCount <= MAX_SETS);
3045
3046 struct v3dv_descriptor_state *descriptor_state =
3047 pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE ?
3048 &cmd_buffer->state.compute.descriptor_state : 3049 &cmd_buffer->state.gfx.descriptor_state; 3050 3051 VkShaderStageFlags dirty_stages = 0; 3052 bool descriptor_state_changed = false; 3053 for (uint32_t i = 0; i < descriptorSetCount; i++) { 3054 V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]); 3055 uint32_t index = firstSet + i; 3056 3057 descriptor_state->valid |= (1u << index); 3058 if (descriptor_state->descriptor_sets[index] != set) { 3059 descriptor_state->descriptor_sets[index] = set; 3060 dirty_stages |= set->layout->shader_stages; 3061 descriptor_state_changed = true; 3062 } 3063 3064 for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) { 3065 uint32_t idx = j + layout->set[i + firstSet].dynamic_offset_start; 3066 3067 if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) { 3068 descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index]; 3069 dirty_stages |= set->layout->shader_stages; 3070 descriptor_state_changed = true; 3071 } 3072 } 3073 } 3074 3075 if (descriptor_state_changed) { 3076 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { 3077 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS; 3078 cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS; 3079 } else { 3080 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS; 3081 cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT; 3082 } 3083 } 3084} 3085 3086VKAPI_ATTR void VKAPI_CALL 3087v3dv_CmdPushConstants(VkCommandBuffer commandBuffer, 3088 VkPipelineLayout layout, 3089 VkShaderStageFlags stageFlags, 3090 uint32_t offset, 3091 uint32_t size, 3092 const void *pValues) 3093{ 3094 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3095 3096 if (!memcmp((uint8_t *) cmd_buffer->state.push_constants_data + offset, 3097 pValues, size)) { 3098 return; 3099 } 3100 3101 memcpy((uint8_t *) cmd_buffer->state.push_constants_data + offset, 3102 pValues, size); 3103 cmd_buffer->state.push_constants_size = 3104 MAX2(offset + size, cmd_buffer->state.push_constants_size); 3105 3106 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS | 3107 V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO; 3108 cmd_buffer->state.dirty_push_constants_stages |= stageFlags; 3109} 3110 3111VKAPI_ATTR void VKAPI_CALL 3112v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, 3113 const float blendConstants[4]) 3114{ 3115 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3116 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 3117 3118 if (!memcmp(state->dynamic.blend_constants, blendConstants, 3119 sizeof(state->dynamic.blend_constants))) { 3120 return; 3121 } 3122 3123 memcpy(state->dynamic.blend_constants, blendConstants, 3124 sizeof(state->dynamic.blend_constants)); 3125 3126 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS; 3127} 3128 3129VKAPI_ATTR void VKAPI_CALL 3130v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, 3131 uint32_t attachmentCount, 3132 const VkBool32 *pColorWriteEnables) 3133{ 3134 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3135 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 3136 uint32_t color_write_enable = 0; 3137 3138 for (uint32_t i = 0; i < attachmentCount; i++) 3139 color_write_enable |= pColorWriteEnables[i] ? 
(0xfu << (i * 4)) : 0; 3140 3141 if (state->dynamic.color_write_enable == color_write_enable) 3142 return; 3143 3144 state->dynamic.color_write_enable = color_write_enable; 3145 3146 state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; 3147} 3148 3149void 3150v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer, 3151 struct v3dv_query_pool *pool, 3152 uint32_t first, 3153 uint32_t count) 3154{ 3155 /* Resets can only happen outside a render pass instance so we should not 3156 * be in the middle of job recording. 3157 */ 3158 assert(cmd_buffer->state.pass == NULL); 3159 assert(cmd_buffer->state.job == NULL); 3160 3161 assert(first < pool->query_count); 3162 assert(first + count <= pool->query_count); 3163 3164 struct v3dv_job *job = 3165 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3166 V3DV_JOB_TYPE_CPU_RESET_QUERIES, 3167 cmd_buffer, -1); 3168 v3dv_return_if_oom(cmd_buffer, NULL); 3169 3170 job->cpu.query_reset.pool = pool; 3171 job->cpu.query_reset.first = first; 3172 job->cpu.query_reset.count = count; 3173 3174 list_addtail(&job->list_link, &cmd_buffer->jobs); 3175} 3176 3177void 3178v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer, 3179 uint32_t slot_size, 3180 uint32_t used_count, 3181 uint32_t *alloc_count, 3182 void **ptr) 3183{ 3184 if (used_count >= *alloc_count) { 3185 const uint32_t prev_slot_count = *alloc_count; 3186 void *old_buffer = *ptr; 3187 3188 const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4); 3189 const uint32_t bytes = new_slot_count * slot_size; 3190 *ptr = vk_alloc(&cmd_buffer->device->vk.alloc, bytes, 8, 3191 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 3192 if (*ptr == NULL) { 3193 fprintf(stderr, "Error: failed to allocate CPU buffer for query.\n"); 3194 v3dv_flag_oom(cmd_buffer, NULL); 3195 return; 3196 } 3197 3198 memcpy(*ptr, old_buffer, prev_slot_count * slot_size); 3199 *alloc_count = new_slot_count; 3200 } 3201 assert(used_count < *alloc_count); 3202} 3203 3204void 3205v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer, 3206 struct v3dv_query_pool *pool, 3207 uint32_t query, 3208 VkQueryControlFlags flags) 3209{ 3210 assert(query < pool->query_count); 3211 switch (pool->query_type) { 3212 case VK_QUERY_TYPE_OCCLUSION: 3213 /* FIXME: we only support one active occlusion query for now */ 3214 assert(cmd_buffer->state.query.active_query.bo == NULL); 3215 3216 cmd_buffer->state.query.active_query.bo = pool->queries[query].bo; 3217 cmd_buffer->state.query.active_query.offset = pool->queries[query].offset; 3218 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY; 3219 break; 3220 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { 3221 assert(cmd_buffer->state.query.active_query.perf == NULL); 3222 if (cmd_buffer->state.pass) 3223 v3dv_cmd_buffer_subpass_finish(cmd_buffer); 3224 3225 cmd_buffer->state.query.active_query.perf = 3226 &pool->queries[query].perf; 3227 3228 if (cmd_buffer->state.pass) { 3229 v3dv_cmd_buffer_subpass_resume(cmd_buffer, 3230 cmd_buffer->state.subpass_idx); 3231 } 3232 break; 3233 } 3234 default: 3235 unreachable("Unsupported query type"); 3236 } 3237} 3238 3239static void 3240v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer, 3241 struct v3dv_query_pool *pool, 3242 uint32_t query) 3243{ 3244 assert(query < pool->query_count); 3245 3246 if (cmd_buffer->state.pass && 3247 pool->query_type != VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { 3248 /* Queue the EndQuery in the command buffer state, we will create a CPU 3249 * job to flag all of these queries as possibly available right 
after the
3250 * render pass job in which they have been recorded.
3251 */
3252 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3253 v3dv_cmd_buffer_ensure_array_state(cmd_buffer,
3254 sizeof(struct v3dv_end_query_cpu_job_info),
3255 state->query.end.used_count,
3256 &state->query.end.alloc_count,
3257 (void **) &state->query.end.states);
3258 v3dv_return_if_oom(cmd_buffer, NULL);
3259
3260 struct v3dv_end_query_cpu_job_info *info =
3261 &state->query.end.states[state->query.end.used_count++];
3262
3263 info->pool = pool;
3264 info->query = query;
3265
3266 /* From the Vulkan spec:
3267 *
3268 * "If queries are used while executing a render pass instance that has
3269 * multiview enabled, the query uses N consecutive query indices in
3270 * the query pool (starting at query) where N is the number of bits set
3271 * in the view mask in the subpass the query is used in. How the
3272 * numerical results of the query are distributed among the queries is
3273 * implementation-dependent."
3274 *
3275 * In our case, only the first query is used but this means we still need
3276 * to flag the other queries as available so we don't emit errors when
3277 * applications attempt to retrieve values from them.
3278 */
3279 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3280 if (!pass->multiview_enabled) {
3281 info->count = 1;
3282 } else {
3283 struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
3284 info->count = util_bitcount(subpass->view_mask);
3285 }
3286 } else {
3287 /* Otherwise, schedule the CPU job immediately */
3288 struct v3dv_job *job =
3289 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
3290 V3DV_JOB_TYPE_CPU_END_QUERY,
3291 cmd_buffer, -1);
3292 v3dv_return_if_oom(cmd_buffer, NULL);
3293
3294 job->cpu.query_end.pool = pool;
3295 job->cpu.query_end.query = query;
3296
3297 /* Multiview queries cannot cross subpass boundaries */
3298 job->cpu.query_end.count = 1;
3299
3300 list_addtail(&job->list_link, &cmd_buffer->jobs);
3301 }
3302}
3303
3304static void
3305v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
3306 struct v3dv_query_pool *pool,
3307 uint32_t query)
3308{
3309 assert(query < pool->query_count);
3310 assert(cmd_buffer->state.query.active_query.bo != NULL);
3311
3312 v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
3313
3314 cmd_buffer->state.query.active_query.bo = NULL;
3315 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3316}
3317
3318static void
3319v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
3320 struct v3dv_query_pool *pool,
3321 uint32_t query)
3322{
3323 assert(query < pool->query_count);
3324 assert(cmd_buffer->state.query.active_query.perf != NULL);
3325
3326 if (cmd_buffer->state.pass)
3327 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
3328
3329 v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
3330
3331 cmd_buffer->state.query.active_query.perf = NULL;
3332
3333 if (cmd_buffer->state.pass)
3334 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
3335}
3336
3337void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
3338 struct v3dv_query_pool *pool,
3339 uint32_t query)
3340{
3341 switch (pool->query_type) {
3342 case VK_QUERY_TYPE_OCCLUSION:
3343 v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
3344 break;
3345 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
3346 v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
3347 break;
3348 default:
3349 unreachable("Unsupported query type");
3350 } 3351} 3352 3353void 3354v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, 3355 struct v3dv_query_pool *pool, 3356 uint32_t first, 3357 uint32_t count, 3358 struct v3dv_buffer *dst, 3359 uint32_t offset, 3360 uint32_t stride, 3361 VkQueryResultFlags flags) 3362{ 3363 /* Copies can only happen outside a render pass instance so we should not 3364 * be in the middle of job recording. 3365 */ 3366 assert(cmd_buffer->state.pass == NULL); 3367 assert(cmd_buffer->state.job == NULL); 3368 3369 assert(first < pool->query_count); 3370 assert(first + count <= pool->query_count); 3371 3372 struct v3dv_job *job = 3373 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3374 V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, 3375 cmd_buffer, -1); 3376 v3dv_return_if_oom(cmd_buffer, NULL); 3377 3378 job->cpu.query_copy_results.pool = pool; 3379 job->cpu.query_copy_results.first = first; 3380 job->cpu.query_copy_results.count = count; 3381 job->cpu.query_copy_results.dst = dst; 3382 job->cpu.query_copy_results.offset = offset; 3383 job->cpu.query_copy_results.stride = stride; 3384 job->cpu.query_copy_results.flags = flags; 3385 3386 list_addtail(&job->list_link, &cmd_buffer->jobs); 3387} 3388 3389void 3390v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, 3391 struct drm_v3d_submit_tfu *tfu) 3392{ 3393 struct v3dv_device *device = cmd_buffer->device; 3394 struct v3dv_job *job = vk_zalloc(&device->vk.alloc, 3395 sizeof(struct v3dv_job), 8, 3396 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 3397 if (!job) { 3398 v3dv_flag_oom(cmd_buffer, NULL); 3399 return; 3400 } 3401 3402 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1); 3403 job->tfu = *tfu; 3404 list_addtail(&job->list_link, &cmd_buffer->jobs); 3405} 3406 3407VKAPI_ATTR void VKAPI_CALL 3408v3dv_CmdSetEvent(VkCommandBuffer commandBuffer, 3409 VkEvent _event, 3410 VkPipelineStageFlags stageMask) 3411{ 3412 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3413 V3DV_FROM_HANDLE(v3dv_event, event, _event); 3414 3415 /* Event (re)sets can only happen outside a render pass instance so we 3416 * should not be in the middle of job recording. 3417 */ 3418 assert(cmd_buffer->state.pass == NULL); 3419 assert(cmd_buffer->state.job == NULL); 3420 3421 struct v3dv_job *job = 3422 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3423 V3DV_JOB_TYPE_CPU_SET_EVENT, 3424 cmd_buffer, -1); 3425 v3dv_return_if_oom(cmd_buffer, NULL); 3426 3427 job->cpu.event_set.event = event; 3428 job->cpu.event_set.state = 1; 3429 3430 list_addtail(&job->list_link, &cmd_buffer->jobs); 3431} 3432 3433VKAPI_ATTR void VKAPI_CALL 3434v3dv_CmdResetEvent(VkCommandBuffer commandBuffer, 3435 VkEvent _event, 3436 VkPipelineStageFlags stageMask) 3437{ 3438 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3439 V3DV_FROM_HANDLE(v3dv_event, event, _event); 3440 3441 /* Event (re)sets can only happen outside a render pass instance so we 3442 * should not be in the middle of job recording. 
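    *
    * For illustration, vkCmdSetEvent() and vkCmdResetEvent() are both
    * recorded as the same CPU job type (V3DV_JOB_TYPE_CPU_SET_EVENT); the
    * only difference is the state value the queue writes when it runs the
    * job:
    *
    *    vkCmdSetEvent()   -> job->cpu.event_set.state = 1
    *    vkCmdResetEvent() -> job->cpu.event_set.state = 0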
3443 */ 3444 assert(cmd_buffer->state.pass == NULL); 3445 assert(cmd_buffer->state.job == NULL); 3446 3447 struct v3dv_job *job = 3448 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3449 V3DV_JOB_TYPE_CPU_SET_EVENT, 3450 cmd_buffer, -1); 3451 v3dv_return_if_oom(cmd_buffer, NULL); 3452 3453 job->cpu.event_set.event = event; 3454 job->cpu.event_set.state = 0; 3455 3456 list_addtail(&job->list_link, &cmd_buffer->jobs); 3457} 3458 3459VKAPI_ATTR void VKAPI_CALL 3460v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer, 3461 uint32_t eventCount, 3462 const VkEvent *pEvents, 3463 VkPipelineStageFlags srcStageMask, 3464 VkPipelineStageFlags dstStageMask, 3465 uint32_t memoryBarrierCount, 3466 const VkMemoryBarrier *pMemoryBarriers, 3467 uint32_t bufferMemoryBarrierCount, 3468 const VkBufferMemoryBarrier *pBufferMemoryBarriers, 3469 uint32_t imageMemoryBarrierCount, 3470 const VkImageMemoryBarrier *pImageMemoryBarriers) 3471{ 3472 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3473 3474 assert(eventCount > 0); 3475 3476 struct v3dv_job *job = 3477 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3478 V3DV_JOB_TYPE_CPU_WAIT_EVENTS, 3479 cmd_buffer, -1); 3480 v3dv_return_if_oom(cmd_buffer, NULL); 3481 3482 const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount; 3483 3484 job->cpu.event_wait.events = 3485 vk_alloc(&cmd_buffer->device->vk.alloc, event_list_size, 8, 3486 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 3487 if (!job->cpu.event_wait.events) { 3488 v3dv_flag_oom(cmd_buffer, NULL); 3489 return; 3490 } 3491 job->cpu.event_wait.event_count = eventCount; 3492 3493 for (uint32_t i = 0; i < eventCount; i++) 3494 job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]); 3495 3496 /* vkCmdWaitEvents can be recorded inside a render pass, so we might have 3497 * an active job. 3498 * 3499 * If we are inside a render pass, because vkCmd(Re)SetEvent can't happen 3500 * inside a render pass, it is safe to move the wait job so it happens right 3501 * before the job we are currently recording for the subpass, if any 3502 * (it would actually be safe to move it all the way back to right before 3503 * the start of the render pass). 3504 * 3505 * If we are outside a render pass then we should not have any on-going job 3506 * and we are free to just add the wait job without restrictions. 3507 */ 3508 assert(cmd_buffer->state.pass || !cmd_buffer->state.job); 3509 list_addtail(&job->list_link, &cmd_buffer->jobs); 3510} 3511 3512VKAPI_ATTR void VKAPI_CALL 3513v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, 3514 VkPipelineStageFlagBits pipelineStage, 3515 VkQueryPool queryPool, 3516 uint32_t query) 3517{ 3518 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3519 V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool); 3520 3521 /* If this is called inside a render pass we need to finish the current 3522 * job here...
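    *
    * For illustration (hypothetical recording), a timestamp written between
    * two draws of the same subpass splits the recorded work into three jobs:
    *
    *    vkCmdDraw(...)           -> GPU CL job (finished below)
    *    vkCmdWriteTimestamp(...) -> V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY CPU job
    *    vkCmdDraw(...)           -> new GPU CL job once the subpass resumes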
3523 */ 3524 struct v3dv_render_pass *pass = cmd_buffer->state.pass; 3525 if (pass) 3526 v3dv_cmd_buffer_finish_job(cmd_buffer); 3527 3528 struct v3dv_job *job = 3529 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3530 V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY, 3531 cmd_buffer, -1); 3532 v3dv_return_if_oom(cmd_buffer, NULL); 3533 3534 job->cpu.query_timestamp.pool = query_pool; 3535 job->cpu.query_timestamp.query = query; 3536 3537 if (!pass || !pass->multiview_enabled) { 3538 job->cpu.query_timestamp.count = 1; 3539 } else { 3540 struct v3dv_subpass *subpass = 3541 &pass->subpasses[cmd_buffer->state.subpass_idx]; 3542 job->cpu.query_timestamp.count = util_bitcount(subpass->view_mask); 3543 } 3544 3545 list_addtail(&job->list_link, &cmd_buffer->jobs); 3546 cmd_buffer->state.job = NULL; 3547 3548 /* ...and resume the subpass after the timestamp */ 3549 if (cmd_buffer->state.pass) 3550 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx); 3551} 3552 3553static void 3554cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) 3555{ 3556 assert(cmd_buffer->state.compute.pipeline); 3557 assert(cmd_buffer->state.compute.pipeline->active_stages == 3558 VK_SHADER_STAGE_COMPUTE_BIT); 3559 3560 cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE | 3561 V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS); 3562 cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; 3563 cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; 3564} 3565 3566#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 3567#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 3568/* Allow this dispatch to start while the last one is still running. */ 3569#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) 3570/* Maximum supergroup ID. 6 bits. */ 3571#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 3572/* Batches per supergroup minus 1. 8 bits. */ 3573#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12 3574/* Workgroups per supergroup, 0 means 16 */ 3575#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8 3576#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0 3577 3578#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2) 3579#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1) 3580#define V3D_CSD_CFG5_THREADING (1 << 0) 3581 3582void 3583v3dv_cmd_buffer_rewrite_indirect_csd_job( 3584 struct v3dv_csd_indirect_cpu_job_info *info, 3585 const uint32_t *wg_counts) 3586{ 3587 assert(info->csd_job); 3588 struct v3dv_job *job = info->csd_job; 3589 3590 assert(job->type == V3DV_JOB_TYPE_GPU_CSD); 3591 assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0); 3592 3593 struct drm_v3d_submit_csd *submit = &job->csd.submit; 3594 3595 job->csd.wg_count[0] = wg_counts[0]; 3596 job->csd.wg_count[1] = wg_counts[1]; 3597 job->csd.wg_count[2] = wg_counts[2]; 3598 3599 submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3600 submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3601 submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3602 3603 submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) * 3604 (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; 3605 assert(submit->cfg[4] != ~0); 3606 3607 if (info->needs_wg_uniform_rewrite) { 3608 /* Make sure the GPU is not currently accessing the indirect CL for this 3609 * job, since we are about to overwrite some of the uniform data. 
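    *
    * For illustration, with hypothetical indirect counts of (4, 2, 1) and
    * info->wg_size == 64, the assignments above produce:
    *
    *    cfg[0] = 4 << 16,  cfg[1] = 2 << 16,  cfg[2] = 1 << 16
    *    cfg[4] = DIV_ROUND_UP(64, 16) * (4 * 2 * 1) - 1 = 31
    *
    * while the loop below patches any workgroup-count uniforms the shader
    * reads with those same counts.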
3610 */ 3611 v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE); 3612 3613 for (uint32_t i = 0; i < 3; i++) { 3614 if (info->wg_uniform_offsets[i]) { 3615 /* Sanity check that our uniform pointers are within the allocated 3616 * BO space for our indirect CL. 3617 */ 3618 assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base); 3619 assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next); 3620 *(info->wg_uniform_offsets[i]) = wg_counts[i]; 3621 } 3622 } 3623 } 3624} 3625 3626static struct v3dv_job * 3627cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, 3628 uint32_t base_offset_x, 3629 uint32_t base_offset_y, 3630 uint32_t base_offset_z, 3631 uint32_t group_count_x, 3632 uint32_t group_count_y, 3633 uint32_t group_count_z, 3634 uint32_t **wg_uniform_offsets_out, 3635 uint32_t *wg_size_out) 3636{ 3637 struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; 3638 assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); 3639 struct v3dv_shader_variant *cs_variant = 3640 pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]; 3641 3642 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc, 3643 sizeof(struct v3dv_job), 8, 3644 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 3645 if (!job) { 3646 v3dv_flag_oom(cmd_buffer, NULL); 3647 return NULL; 3648 } 3649 3650 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1); 3651 cmd_buffer->state.job = job; 3652 3653 struct drm_v3d_submit_csd *submit = &job->csd.submit; 3654 3655 job->csd.wg_count[0] = group_count_x; 3656 job->csd.wg_count[1] = group_count_y; 3657 job->csd.wg_count[2] = group_count_z; 3658 3659 job->csd.wg_base[0] = base_offset_x; 3660 job->csd.wg_base[1] = base_offset_y; 3661 job->csd.wg_base[2] = base_offset_z; 3662 3663 submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3664 submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3665 submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT; 3666 3667 const struct v3d_compute_prog_data *cpd = 3668 cs_variant->prog_data.cs; 3669 3670 const uint32_t num_wgs = group_count_x * group_count_y * group_count_z; 3671 const uint32_t wg_size = cpd->local_size[0] * 3672 cpd->local_size[1] * 3673 cpd->local_size[2]; 3674 3675 uint32_t wgs_per_sg = 3676 v3d_csd_choose_workgroups_per_supergroup( 3677 &cmd_buffer->device->devinfo, 3678 cs_variant->prog_data.cs->has_subgroups, 3679 cs_variant->prog_data.cs->base.has_control_barrier, 3680 cs_variant->prog_data.cs->base.threads, 3681 num_wgs, wg_size); 3682 3683 uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16); 3684 uint32_t whole_sgs = num_wgs / wgs_per_sg; 3685 uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg; 3686 uint32_t num_batches = batches_per_sg * whole_sgs + 3687 DIV_ROUND_UP(rem_wgs * wg_size, 16); 3688 3689 submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; 3690 submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT; 3691 submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; 3692 if (wg_size_out) 3693 *wg_size_out = wg_size; 3694 3695 submit->cfg[4] = num_batches - 1; 3696 assert(submit->cfg[4] != ~0); 3697 3698 assert(pipeline->shared_data->assembly_bo); 3699 struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; 3700 3701 submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; 3702 submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; 3703 if (cs_variant->prog_data.base->single_seg) 3704 
submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; 3705 if (cs_variant->prog_data.base->threads == 4) 3706 submit->cfg[5] |= V3D_CSD_CFG5_THREADING; 3707 3708 if (cs_variant->prog_data.cs->shared_size > 0) { 3709 job->csd.shared_memory = 3710 v3dv_bo_alloc(cmd_buffer->device, 3711 cs_variant->prog_data.cs->shared_size * wgs_per_sg, 3712 "shared_vars", true); 3713 if (!job->csd.shared_memory) { 3714 v3dv_flag_oom(cmd_buffer, NULL); 3715 return job; 3716 } 3717 } 3718 3719 v3dv_job_add_bo_unchecked(job, cs_assembly_bo); 3720 struct v3dv_cl_reloc uniforms = 3721 v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline, 3722 cs_variant, 3723 wg_uniform_offsets_out); 3724 submit->cfg[6] = uniforms.bo->offset + uniforms.offset; 3725 3726 3727 /* Track VK_KHR_buffer_device_address usage in the job */ 3728 job->uses_buffer_device_address |= pipeline->uses_buffer_device_address; 3729 3730 v3dv_job_add_bo(job, uniforms.bo); 3731 3732 return job; 3733} 3734 3735static void 3736cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer, 3737 uint32_t base_offset_x, 3738 uint32_t base_offset_y, 3739 uint32_t base_offset_z, 3740 uint32_t group_count_x, 3741 uint32_t group_count_y, 3742 uint32_t group_count_z) 3743{ 3744 if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0) 3745 return; 3746 3747 struct v3dv_job *job = 3748 cmd_buffer_create_csd_job(cmd_buffer, 3749 base_offset_x, 3750 base_offset_y, 3751 base_offset_z, 3752 group_count_x, 3753 group_count_y, 3754 group_count_z, 3755 NULL, NULL); 3756 3757 list_addtail(&job->list_link, &cmd_buffer->jobs); 3758 cmd_buffer->state.job = NULL; 3759} 3760 3761VKAPI_ATTR void VKAPI_CALL 3762v3dv_CmdDispatch(VkCommandBuffer commandBuffer, 3763 uint32_t groupCountX, 3764 uint32_t groupCountY, 3765 uint32_t groupCountZ) 3766{ 3767 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3768 3769 cmd_buffer_emit_pre_dispatch(cmd_buffer); 3770 cmd_buffer_dispatch(cmd_buffer, 0, 0, 0, 3771 groupCountX, groupCountY, groupCountZ); 3772} 3773 3774VKAPI_ATTR void VKAPI_CALL 3775v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer, 3776 uint32_t baseGroupX, 3777 uint32_t baseGroupY, 3778 uint32_t baseGroupZ, 3779 uint32_t groupCountX, 3780 uint32_t groupCountY, 3781 uint32_t groupCountZ) 3782{ 3783 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3784 3785 cmd_buffer_emit_pre_dispatch(cmd_buffer); 3786 cmd_buffer_dispatch(cmd_buffer, 3787 baseGroupX, baseGroupY, baseGroupZ, 3788 groupCountX, groupCountY, groupCountZ); 3789} 3790 3791 3792static void 3793cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer, 3794 struct v3dv_buffer *buffer, 3795 uint32_t offset) 3796{ 3797 /* We can't do indirect dispatches, so instead we record a CPU job that, 3798 * when executed in the queue, will map the indirect buffer, read the 3799 * dispatch parameters, and submit a regular dispatch. 3800 */ 3801 struct v3dv_job *job = 3802 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, 3803 V3DV_JOB_TYPE_CPU_CSD_INDIRECT, 3804 cmd_buffer, -1); 3805 v3dv_return_if_oom(cmd_buffer, NULL); 3806 3807 /* We need to create a CSD job now, even if we still don't know the actual 3808 * dispatch parameters, because the job setup needs to be done using the 3809 * current command buffer state (i.e. pipeline, descriptor sets, push 3810 * constants, etc.). So we create the job with default dispatch parameters 3811 * and we will rewrite the parts we need at submit time if the indirect 3812 * parameters don't match the ones we used to setup the job. 
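    *
    * For illustration, the CSD job below is created with placeholder counts
    * of (1, 1, 1); if at submit time the indirect buffer holds, say,
    * (8, 4, 2), the queue uses v3dv_cmd_buffer_rewrite_indirect_csd_job()
    * to patch cfg[0..2] and cfg[4], and, when the shader reads the
    * workgroup counts, the recorded workgroup-count uniforms as well.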
3813 */ 3814 struct v3dv_job *csd_job = 3815 cmd_buffer_create_csd_job(cmd_buffer, 3816 0, 0, 0, 3817 1, 1, 1, 3818 &job->cpu.csd_indirect.wg_uniform_offsets[0], 3819 &job->cpu.csd_indirect.wg_size); 3820 v3dv_return_if_oom(cmd_buffer, NULL); 3821 assert(csd_job); 3822 3823 job->cpu.csd_indirect.buffer = buffer; 3824 job->cpu.csd_indirect.offset = offset; 3825 job->cpu.csd_indirect.csd_job = csd_job; 3826 3827 /* If the compute shader reads the workgroup sizes we will also need to 3828 * rewrite the corresponding uniforms. 3829 */ 3830 job->cpu.csd_indirect.needs_wg_uniform_rewrite = 3831 job->cpu.csd_indirect.wg_uniform_offsets[0] || 3832 job->cpu.csd_indirect.wg_uniform_offsets[1] || 3833 job->cpu.csd_indirect.wg_uniform_offsets[2]; 3834 3835 list_addtail(&job->list_link, &cmd_buffer->jobs); 3836 list_addtail(&csd_job->list_link, &cmd_buffer->jobs); 3837 cmd_buffer->state.job = NULL; 3838} 3839 3840VKAPI_ATTR void VKAPI_CALL 3841v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, 3842 VkBuffer _buffer, 3843 VkDeviceSize offset) 3844{ 3845 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); 3846 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer); 3847 3848 assert(offset <= UINT32_MAX); 3849 3850 cmd_buffer_emit_pre_dispatch(cmd_buffer); 3851 cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset); 3852} 3853 3854VKAPI_ATTR void VKAPI_CALL 3855v3dv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask) 3856{ 3857 /* Nothing to do here since we only support a single device */ 3858 assert(deviceMask == 0x1); 3859} 3860
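
/* For illustration only: a worked example of the CSD supergroup/batch
 * packing done in cmd_buffer_create_csd_job(), using hypothetical values
 * (an 8x8x1 local size, a 4x4x1 dispatch, and assuming
 * v3d_csd_choose_workgroups_per_supergroup() picks 4 workgroups per
 * supergroup on this device):
 *
 *    wg_size        = 8 * 8 * 1                         = 64
 *    num_wgs        = 4 * 4 * 1                         = 16
 *    wgs_per_sg     = 4                                    (assumed)
 *    batches_per_sg = DIV_ROUND_UP(4 * 64, 16)          = 16
 *    whole_sgs      = 16 / 4                            = 4
 *    rem_wgs        = 16 - 4 * 4                        = 0
 *    num_batches    = 16 * 4 + DIV_ROUND_UP(0 * 64, 16) = 64
 *
 *    cfg[0..2] = 4 << 16, 4 << 16, 1 << 16
 *    cfg[3]    = (4 << V3D_CSD_CFG3_WGS_PER_SG_SHIFT) |
 *                ((16 - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT) |
 *                (64 << V3D_CSD_CFG3_WG_SIZE_SHIFT)
 *    cfg[4]    = 64 - 1                                 = 63
 */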