1/* 2 * Copyright © 2021 Raspberry Pi Ltd 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "v3dv_private.h" 25#include "broadcom/common/v3d_macros.h" 26#include "broadcom/common/v3d_util.h" 27#include "broadcom/cle/v3dx_pack.h" 28#include "broadcom/compiler/v3d_compiler.h" 29 30#include "util/half_float.h" 31#include "vulkan/util/vk_format.h" 32#include "util/u_pack_color.h" 33 34void 35v3dX(job_emit_binning_flush)(struct v3dv_job *job) 36{ 37 assert(job); 38 39 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH)); 40 v3dv_return_if_oom(NULL, job); 41 42 cl_emit(&job->bcl, FLUSH, flush); 43} 44 45void 46v3dX(job_emit_binning_prolog)(struct v3dv_job *job, 47 const struct v3dv_frame_tiling *tiling, 48 uint32_t layers) 49{ 50 /* This must go before the binning mode configuration. It is 51 * required for layered framebuffers to work. 52 */ 53 cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) { 54 config.number_of_layers = layers; 55 } 56 57 assert(!tiling->double_buffer || !tiling->msaa); 58 cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { 59 config.width_in_pixels = tiling->width; 60 config.height_in_pixels = tiling->height; 61 config.number_of_render_targets = MAX2(tiling->render_target_count, 1); 62 config.multisample_mode_4x = tiling->msaa; 63 config.double_buffer_in_non_ms_mode = tiling->double_buffer; 64 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; 65 } 66 67 /* There's definitely nothing in the VCD cache we want. */ 68 cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin); 69 70 /* "Binning mode lists must have a Start Tile Binning item (6) after 71 * any prefix state data before the binning list proper starts." 
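    * (the NUMBER_OF_LAYERS, TILE_BINNING_MODE_CFG and FLUSH_VCD_CACHE
    * packets emitted above are that prefix state)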
 */
   cl_emit(&job->bcl, START_TILE_BINNING, bin);
}

void
v3dX(cmd_buffer_end_render_pass_secondary)(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.job);
   v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl,
                                    cl_packet_length(RETURN_FROM_SUB_LIST));
   v3dv_return_if_oom(cmd_buffer, NULL);
   cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret);
}

void
v3dX(job_emit_clip_window)(struct v3dv_job *job, const VkRect2D *rect)
{
   assert(job);

   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW));
   v3dv_return_if_oom(NULL, job);

   cl_emit(&job->bcl, CLIP_WINDOW, clip) {
      clip.clip_window_left_pixel_coordinate = rect->offset.x;
      clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
      clip.clip_window_width_in_pixels = rect->extent.width;
      clip.clip_window_height_in_pixels = rect->extent.height;
   }
}

static void
cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
                                 struct v3dv_cl *cl,
                                 struct v3dv_image_view *iview,
                                 uint32_t layer,
                                 uint32_t buffer)
{
   const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
   const struct v3d_resource_slice *slice =
      &image->slices[iview->vk.base_mip_level];
   uint32_t layer_offset =
      v3dv_layer_offset(image, iview->vk.base_mip_level,
                        iview->vk.base_array_layer + layer);

   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = buffer;
      load.address = v3dv_cl_address(image->mem->bo, layer_offset);

      load.input_image_format = iview->format->rt_type;

      /* If we create an image view with only the stencil format, we
       * re-interpret the format as RGBA8_UINT, as that is what we want in
       * general (see CreateImageView).
       *
       * However, when we are loading/storing tiles from the ZSTENCIL tile
       * buffer, we need to use the underlying DS format.
       */
      if (buffer == ZSTENCIL &&
          iview->format->rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
         assert(image->format->rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
         load.input_image_format = image->format->rt_type;
      }

      load.r_b_swap = iview->swap_rb;
      load.channel_reverse = iview->channel_reverse;
      load.memory_format = slice->tiling;

      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         load.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         load.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else
         load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}

static bool
check_needs_load(const struct v3dv_cmd_buffer_state *state,
                 VkImageAspectFlags aspect,
                 uint32_t first_subpass_idx,
                 VkAttachmentLoadOp load_op)
{
   /* We call this with image->vk.aspects & aspect, so 0 means the aspect we
    * are testing does not exist in the image.
    */
   if (!aspect)
      return false;

   /* Attachment (or view) load operations only apply on the first subpass
    * that uses the attachment (or view); in any other subpass we always
    * need to load.
    */
   if (state->job->first_subpass > first_subpass_idx)
      return true;

   /* If the job is continuing a subpass started in another job, we always
    * need to load.
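    * For example, when a subpass has been split into multiple jobs (a
    * pipeline barrier or vkCmdExecuteCommands recorded mid-subpass does
    * this), every job after the first must reload the tile buffer so it
    * sees the results written by the previous jobs of the same subpass.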
174 */ 175 if (state->job->is_subpass_continue) 176 return true; 177 178 /* If the area is not aligned to tile boundaries, we always need to load */ 179 if (!state->tile_aligned_render_area) 180 return true; 181 182 /* The attachment load operations must be LOAD */ 183 return load_op == VK_ATTACHMENT_LOAD_OP_LOAD; 184} 185 186static inline uint32_t 187v3dv_zs_buffer(bool depth, bool stencil) 188{ 189 if (depth && stencil) 190 return ZSTENCIL; 191 else if (depth) 192 return Z; 193 else if (stencil) 194 return STENCIL; 195 return NONE; 196} 197 198static void 199cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer, 200 struct v3dv_cl *cl, 201 uint32_t layer) 202{ 203 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 204 const struct v3dv_render_pass *pass = state->pass; 205 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx]; 206 207 assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT); 208 209 for (uint32_t i = 0; i < subpass->color_count; i++) { 210 uint32_t attachment_idx = subpass->color_attachments[i].attachment; 211 212 if (attachment_idx == VK_ATTACHMENT_UNUSED) 213 continue; 214 215 const struct v3dv_render_pass_attachment *attachment = 216 &state->pass->attachments[attachment_idx]; 217 218 /* According to the Vulkan spec: 219 * 220 * "The load operation for each sample in an attachment happens before 221 * any recorded command which accesses the sample in the first subpass 222 * where the attachment is used." 223 * 224 * If the load operation is CLEAR, we must only clear once on the first 225 * subpass that uses the attachment (and in that case we don't LOAD). 226 * After that, we always want to load so we don't lose any rendering done 227 * by a previous subpass to the same attachment. We also want to load 228 * if the current job is continuing subpass work started by a previous 229 * job, for the same reason. 230 * 231 * If the render area is not aligned to tile boundaries then we have 232 * tiles which are partially covered by it. In this case, we need to 233 * load the tiles so we can preserve the pixels that are outside the 234 * render area for any such tiles. 235 */ 236 uint32_t first_subpass = !pass->multiview_enabled ? 237 attachment->first_subpass : 238 attachment->views[layer].first_subpass; 239 240 bool needs_load = check_needs_load(state, 241 VK_IMAGE_ASPECT_COLOR_BIT, 242 first_subpass, 243 attachment->desc.loadOp); 244 if (needs_load) { 245 struct v3dv_image_view *iview = 246 state->attachments[attachment_idx].image_view; 247 cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview, 248 layer, RENDER_TARGET_0 + i); 249 } 250 } 251 252 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; 253 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { 254 const struct v3dv_render_pass_attachment *ds_attachment = 255 &state->pass->attachments[ds_attachment_idx]; 256 257 const VkImageAspectFlags ds_aspects = 258 vk_format_aspects(ds_attachment->desc.format); 259 260 uint32_t ds_first_subpass = !pass->multiview_enabled ? 
         ds_attachment->first_subpass :
         ds_attachment->views[layer].first_subpass;

      const bool needs_depth_load =
         check_needs_load(state,
                          ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                          ds_first_subpass,
                          ds_attachment->desc.loadOp);

      const bool needs_stencil_load =
         check_needs_load(state,
                          ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
                          ds_first_subpass,
                          ds_attachment->desc.stencilLoadOp);

      if (needs_depth_load || needs_stencil_load) {
         struct v3dv_image_view *iview =
            state->attachments[ds_attachment_idx].image_view;
         /* From the Vulkan spec:
          *
          *   "When an image view of a depth/stencil image is used as a
          *   depth/stencil framebuffer attachment, the aspectMask is ignored
          *   and both depth and stencil image subresources are used."
          *
          * So we ignore the aspects from the subresource range of the image
          * view for the depth/stencil attachment, but we still need to
          * restrict them to the aspects compatible with the render pass and
          * the image.
          */
         const uint32_t zs_buffer =
            v3dv_zs_buffer(needs_depth_load, needs_stencil_load);
         cmd_buffer_render_pass_emit_load(cmd_buffer, cl,
                                          iview, layer, zs_buffer);
      }
   }

   cl_emit(cl, END_OF_LOADS, end);
}

static void
cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct v3dv_cl *cl,
                                  uint32_t attachment_idx,
                                  uint32_t layer,
                                  uint32_t buffer,
                                  bool clear,
                                  bool is_multisample_resolve)
{
   const struct v3dv_image_view *iview =
      cmd_buffer->state.attachments[attachment_idx].image_view;
   const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
   const struct v3d_resource_slice *slice =
      &image->slices[iview->vk.base_mip_level];
   uint32_t layer_offset = v3dv_layer_offset(image,
                                             iview->vk.base_mip_level,
                                             iview->vk.base_array_layer + layer);

   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
      store.buffer_to_store = buffer;
      store.address = v3dv_cl_address(image->mem->bo, layer_offset);
      store.clear_buffer_being_stored = clear;

      store.output_image_format = iview->format->rt_type;

      /* If we create an image view with only the stencil format, we
       * re-interpret the format as RGBA8_UINT, as that is what we want in
       * general (see CreateImageView).
       *
       * However, when we are loading/storing tiles from the ZSTENCIL tile
       * buffer, we need to use the underlying DS format.
       */
      if (buffer == ZSTENCIL &&
          iview->format->rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
         assert(image->format->rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
         store.output_image_format = image->format->rt_type;
      }

      store.r_b_swap = iview->swap_rb;
      store.channel_reverse = iview->channel_reverse;
      store.memory_format = slice->tiling;

      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         store.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         store.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else if (is_multisample_resolve)
         store.decimate_mode = V3D_DECIMATE_MODE_4X;
      else
         store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}

static bool
check_needs_clear(const struct v3dv_cmd_buffer_state *state,
                  VkImageAspectFlags aspect,
                  uint32_t first_subpass_idx,
                  VkAttachmentLoadOp load_op,
                  bool do_clear_with_draw)
{
   /* We call this with image->vk.aspects & aspect, so 0 means the aspect we
    * are testing does not exist in the image.
    */
   if (!aspect)
      return false;

   /* If the aspect needs to be cleared with a draw call then we won't emit
    * the clear here.
    */
   if (do_clear_with_draw)
      return false;

   /* If this is resuming a subpass started with another job, then attachment
    * load operations don't apply.
    */
   if (state->job->is_subpass_continue)
      return false;

   /* If the render area is not aligned to tile boundaries we can't use the
    * TLB for a clear.
    */
   if (!state->tile_aligned_render_area)
      return false;

   /* If this job is running in a subpass other than the first subpass in
    * which this attachment (or view) is used then attachment load operations
    * don't apply.
    */
   if (state->job->first_subpass != first_subpass_idx)
      return false;

   /* The attachment load operation must be CLEAR */
   return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR;
}

static bool
check_needs_store(const struct v3dv_cmd_buffer_state *state,
                  VkImageAspectFlags aspect,
                  uint32_t last_subpass_idx,
                  VkAttachmentStoreOp store_op)
{
   /* We call this with image->vk.aspects & aspect, so 0 means the aspect we
    * are testing does not exist in the image.
    */
   if (!aspect)
      return false;

   /* Attachment (or view) store operations only apply on the last subpass
    * where the attachment (or view) is used; in other subpasses we always
    * need to store.
    */
   if (state->subpass_idx < last_subpass_idx)
      return true;

   /* Attachment store operations only apply on the last job we emit on the
    * last subpass where the attachment is used, otherwise we always need to
    * store.
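    * For example, if the subpass was split into multiple jobs, every job
    * except the last one must store, so the job that follows can load the
    * partial results back into the tile buffer (see check_needs_load above).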
    */
   if (!state->job->is_subpass_finish)
      return true;

   /* The attachment store operation must be STORE */
   return store_op == VK_ATTACHMENT_STORE_OP_STORE;
}

static void
cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_cl *cl,
                                   uint32_t layer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass =
      &pass->subpasses[state->subpass_idx];

   bool has_stores = false;
   bool use_global_zs_clear = false;
   bool use_global_rt_clear = false;

   assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);

   /* FIXME: separate stencil */
   uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
      const struct v3dv_render_pass_attachment *ds_attachment =
         &state->pass->attachments[ds_attachment_idx];

      assert(state->job->first_subpass >= ds_attachment->first_subpass);
      assert(state->subpass_idx >= ds_attachment->first_subpass);
      assert(state->subpass_idx <= ds_attachment->last_subpass);

      /* From the Vulkan spec, VkImageSubresourceRange:
       *
       *   "When an image view of a depth/stencil image is used as a
       *   depth/stencil framebuffer attachment, the aspectMask is ignored
       *   and both depth and stencil image subresources are used."
       *
       * So we ignore the aspects from the subresource range of the image
       * view for the depth/stencil attachment, but we still need to restrict
       * them to the aspects compatible with the render pass and the image.
       */
      const VkImageAspectFlags aspects =
         vk_format_aspects(ds_attachment->desc.format);

      /* Only clear once on the first subpass that uses the attachment */
      uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
         ds_attachment->first_subpass :
         ds_attachment->views[layer].first_subpass;

      bool needs_depth_clear =
         check_needs_clear(state,
                           aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                           ds_first_subpass,
                           ds_attachment->desc.loadOp,
                           subpass->do_depth_clear_with_draw);

      bool needs_stencil_clear =
         check_needs_clear(state,
                           aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
                           ds_first_subpass,
                           ds_attachment->desc.stencilLoadOp,
                           subpass->do_stencil_clear_with_draw);

      /* Skip the last store if it is not required */
      uint32_t ds_last_subpass = !pass->multiview_enabled ?
490 ds_attachment->last_subpass : 491 ds_attachment->views[layer].last_subpass; 492 493 bool needs_depth_store = 494 check_needs_store(state, 495 aspects & VK_IMAGE_ASPECT_DEPTH_BIT, 496 ds_last_subpass, 497 ds_attachment->desc.storeOp); 498 499 bool needs_stencil_store = 500 check_needs_store(state, 501 aspects & VK_IMAGE_ASPECT_STENCIL_BIT, 502 ds_last_subpass, 503 ds_attachment->desc.stencilStoreOp); 504 505 /* If we have a resolve, handle it before storing the tile */ 506 const struct v3dv_cmd_buffer_attachment_state *ds_att_state = 507 &state->attachments[ds_attachment_idx]; 508 if (ds_att_state->use_tlb_resolve) { 509 assert(ds_att_state->has_resolve); 510 assert(subpass->resolve_depth || subpass->resolve_stencil); 511 const uint32_t resolve_attachment_idx = 512 subpass->ds_resolve_attachment.attachment; 513 assert(resolve_attachment_idx != VK_ATTACHMENT_UNUSED); 514 515 const uint32_t zs_buffer = 516 v3dv_zs_buffer(subpass->resolve_depth, subpass->resolve_stencil); 517 cmd_buffer_render_pass_emit_store(cmd_buffer, cl, 518 resolve_attachment_idx, layer, 519 zs_buffer, 520 false, false); 521 has_stores = true; 522 } else if (ds_att_state->has_resolve) { 523 /* If we can't use the TLB to implement the resolve we will need to 524 * store the attachment so we can implement it later using a blit. 525 */ 526 needs_depth_store = subpass->resolve_depth; 527 needs_stencil_store = subpass->resolve_stencil; 528 } 529 530 /* GFXH-1689: The per-buffer store command's clear buffer bit is broken 531 * for depth/stencil. 532 * 533 * There used to be some confusion regarding the Clear Tile Buffers 534 * Z/S bit also being broken, but we confirmed with Broadcom that this 535 * is not the case, it was just that some other hardware bugs (that we 536 * need to work around, such as GFXH-1461) could cause this bit to behave 537 * incorrectly. 538 * 539 * There used to be another issue where the RTs bit in the Clear Tile 540 * Buffers packet also cleared Z/S, but Broadcom confirmed this is 541 * fixed since V3D 4.1. 542 * 543 * So if we have to emit a clear of depth or stencil we don't use 544 * the per-buffer store clear bit, even if we need to store the buffers, 545 * instead we always have to use the Clear Tile Buffers Z/S bit. 546 * If we have configured the job to do early Z/S clearing, then we 547 * don't want to emit any Clear Tile Buffers command at all here. 548 * 549 * Note that GFXH-1689 is not reproduced in the simulator, where 550 * using the clear buffer bit in depth/stencil stores works fine. 
551 */ 552 use_global_zs_clear = !state->job->early_zs_clear && 553 (needs_depth_clear || needs_stencil_clear); 554 if (needs_depth_store || needs_stencil_store) { 555 const uint32_t zs_buffer = 556 v3dv_zs_buffer(needs_depth_store, needs_stencil_store); 557 cmd_buffer_render_pass_emit_store(cmd_buffer, cl, 558 ds_attachment_idx, layer, 559 zs_buffer, false, false); 560 has_stores = true; 561 } 562 } 563 564 for (uint32_t i = 0; i < subpass->color_count; i++) { 565 uint32_t attachment_idx = subpass->color_attachments[i].attachment; 566 567 if (attachment_idx == VK_ATTACHMENT_UNUSED) 568 continue; 569 570 const struct v3dv_render_pass_attachment *attachment = 571 &state->pass->attachments[attachment_idx]; 572 573 assert(state->job->first_subpass >= attachment->first_subpass); 574 assert(state->subpass_idx >= attachment->first_subpass); 575 assert(state->subpass_idx <= attachment->last_subpass); 576 577 /* Only clear once on the first subpass that uses the attachment */ 578 uint32_t first_subpass = !pass->multiview_enabled ? 579 attachment->first_subpass : 580 attachment->views[layer].first_subpass; 581 582 bool needs_clear = 583 check_needs_clear(state, 584 VK_IMAGE_ASPECT_COLOR_BIT, 585 first_subpass, 586 attachment->desc.loadOp, 587 false); 588 589 /* Skip the last store if it is not required */ 590 uint32_t last_subpass = !pass->multiview_enabled ? 591 attachment->last_subpass : 592 attachment->views[layer].last_subpass; 593 594 bool needs_store = 595 check_needs_store(state, 596 VK_IMAGE_ASPECT_COLOR_BIT, 597 last_subpass, 598 attachment->desc.storeOp); 599 600 /* If we need to resolve this attachment emit that store first. Notice 601 * that we must not request a tile buffer clear here in that case, since 602 * that would clear the tile buffer before we get to emit the actual 603 * color attachment store below, since the clear happens after the 604 * store is completed. 605 * 606 * If the attachment doesn't support TLB resolves (or the render area 607 * is not aligned to tile boundaries) then we will have to fallback to 608 * doing the resolve in a shader separately after this job, so we will 609 * need to store the multisampled attachment even if that wasn't 610 * requested by the client. 611 */ 612 const struct v3dv_cmd_buffer_attachment_state *att_state = 613 &state->attachments[attachment_idx]; 614 if (att_state->use_tlb_resolve) { 615 assert(att_state->has_resolve); 616 const uint32_t resolve_attachment_idx = 617 subpass->resolve_attachments[i].attachment; 618 cmd_buffer_render_pass_emit_store(cmd_buffer, cl, 619 resolve_attachment_idx, layer, 620 RENDER_TARGET_0 + i, 621 false, true); 622 has_stores = true; 623 } else if (att_state->has_resolve) { 624 needs_store = true; 625 } 626 627 /* Emit the color attachment store if needed */ 628 if (needs_store) { 629 cmd_buffer_render_pass_emit_store(cmd_buffer, cl, 630 attachment_idx, layer, 631 RENDER_TARGET_0 + i, 632 needs_clear && !use_global_rt_clear, 633 false); 634 has_stores = true; 635 } else if (needs_clear) { 636 use_global_rt_clear = true; 637 } 638 } 639 640 /* We always need to emit at least one dummy store */ 641 if (!has_stores) { 642 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { 643 store.buffer_to_store = NONE; 644 } 645 } 646 647 /* If we have any depth/stencil clears we can't use the per-buffer clear 648 * bit and instead we have to emit a single clear of all tile buffers. 
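    * (color attachments can still use the per-buffer clear bit: we only set
    * use_global_rt_clear above when a color attachment needed a clear but we
    * did not emit a store to attach that clear to)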
649 */ 650 if (use_global_zs_clear || use_global_rt_clear) { 651 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { 652 clear.clear_z_stencil_buffer = use_global_zs_clear; 653 clear.clear_all_render_targets = use_global_rt_clear; 654 } 655 } 656} 657 658static void 659cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer, 660 uint32_t layer) 661{ 662 struct v3dv_job *job = cmd_buffer->state.job; 663 assert(job); 664 665 /* Emit the generic list in our indirect state -- the rcl will just 666 * have pointers into it. 667 */ 668 struct v3dv_cl *cl = &job->indirect; 669 v3dv_cl_ensure_space(cl, 200, 1); 670 v3dv_return_if_oom(cmd_buffer, NULL); 671 672 struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl); 673 674 cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); 675 676 cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer); 677 678 /* The binner starts out writing tiles assuming that the initial mode 679 * is triangles, so make sure that's the case. 680 */ 681 cl_emit(cl, PRIM_LIST_FORMAT, fmt) { 682 fmt.primitive_type = LIST_TRIANGLES; 683 } 684 685 /* PTB assumes that value to be 0, but hw will not set it. */ 686 cl_emit(cl, SET_INSTANCEID, set) { 687 set.instance_id = 0; 688 } 689 690 cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); 691 692 cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer); 693 694 cl_emit(cl, END_OF_TILE_MARKER, end); 695 696 cl_emit(cl, RETURN_FROM_SUB_LIST, ret); 697 698 cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) { 699 branch.start = tile_list_start; 700 branch.end = v3dv_cl_get_address(cl); 701 } 702} 703 704static void 705cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer, 706 uint32_t layer) 707{ 708 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; 709 710 struct v3dv_job *job = cmd_buffer->state.job; 711 struct v3dv_cl *rcl = &job->rcl; 712 713 /* If doing multicore binning, we would need to initialize each 714 * core's tile list here. 
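    * (we only use a single bin tile list, see number_of_bin_tile_lists = 1
    * in the MULTICORE_RENDERING_SUPERTILE_CFG packet emitted from
    * v3dX(cmd_buffer_emit_render_pass_rcl))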
    */
   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
   const uint32_t tile_alloc_offset =
      64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
   cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
   }

   cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer);

   uint32_t supertile_w_in_pixels =
      tiling->tile_width * tiling->supertile_width;
   uint32_t supertile_h_in_pixels =
      tiling->tile_height * tiling->supertile_height;
   const uint32_t min_x_supertile =
      state->render_area.offset.x / supertile_w_in_pixels;
   const uint32_t min_y_supertile =
      state->render_area.offset.y / supertile_h_in_pixels;

   uint32_t max_render_x = state->render_area.offset.x;
   if (state->render_area.extent.width > 0)
      max_render_x += state->render_area.extent.width - 1;
   uint32_t max_render_y = state->render_area.offset.y;
   if (state->render_area.extent.height > 0)
      max_render_y += state->render_area.extent.height - 1;
   const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels;
   const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;

   for (int y = min_y_supertile; y <= max_y_supertile; y++) {
      for (int x = min_x_supertile; x <= max_x_supertile; x++) {
         cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
            coords.column_number_in_supertiles = x;
            coords.row_number_in_supertiles = y;
         }
      }
   }
}

static void
set_rcl_early_z_config(struct v3dv_job *job,
                       bool *early_z_disable,
                       uint32_t *early_z_test_and_update_direction)
{
   /* Disable if none of the draw calls in this job enabled EZ */
   if (!job->has_ez_draws) {
      *early_z_disable = true;
      return;
   }

   switch (job->first_ez_state) {
   case V3D_EZ_UNDECIDED:
   case V3D_EZ_LT_LE:
      *early_z_disable = false;
      *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE;
      break;
   case V3D_EZ_GT_GE:
      *early_z_disable = false;
      *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE;
      break;
   case V3D_EZ_DISABLED:
      *early_z_disable = true;
      break;
   }
}

void
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_framebuffer *framebuffer = state->framebuffer;

   /* We can't emit the RCL until we have a framebuffer, which we may not have
    * if we are recording a secondary command buffer. In that case, we will
    * have to wait until vkCmdExecuteCommands is called from a primary command
    * buffer.
    */
   if (!framebuffer) {
      assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
      return;
   }

   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   const uint32_t fb_layers = job->frame_tiling.layers;

   v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
                                    MAX2(fb_layers, 1) * 256 *
                                    cl_packet_length(SUPERTILE_COORDINATES));
   v3dv_return_if_oom(cmd_buffer, NULL);

   assert(state->subpass_idx < state->pass->subpass_count);
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
   struct v3dv_cl *rcl = &job->rcl;

   /* Common config must be the first TILE_RENDERING_MODE_CFG and
    * Z_STENCIL_CLEAR_VALUES must be last.
The ones in between are optional 815 * updates to the previous HW state. 816 */ 817 bool do_early_zs_clear = false; 818 const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment; 819 assert(!tiling->msaa || !tiling->double_buffer); 820 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) { 821 config.image_width_pixels = framebuffer->width; 822 config.image_height_pixels = framebuffer->height; 823 config.number_of_render_targets = MAX2(subpass->color_count, 1); 824 config.multisample_mode_4x = tiling->msaa; 825 config.double_buffer_in_non_ms_mode = tiling->double_buffer; 826 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; 827 828 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { 829 const struct v3dv_image_view *iview = 830 state->attachments[ds_attachment_idx].image_view; 831 config.internal_depth_type = iview->internal_type; 832 833 set_rcl_early_z_config(job, 834 &config.early_z_disable, 835 &config.early_z_test_and_update_direction); 836 837 /* Early-Z/S clear can be enabled if the job is clearing and not 838 * storing (or loading) depth. If a stencil aspect is also present 839 * we have the same requirements for it, however, in this case we 840 * can accept stencil loadOp DONT_CARE as well, so instead of 841 * checking that stencil is cleared we check that is not loaded. 842 * 843 * Early-Z/S clearing is independent of Early Z/S testing, so it is 844 * possible to enable one but not the other so long as their 845 * respective requirements are met. 846 */ 847 struct v3dv_render_pass_attachment *ds_attachment = 848 &pass->attachments[ds_attachment_idx]; 849 850 const VkImageAspectFlags ds_aspects = 851 vk_format_aspects(ds_attachment->desc.format); 852 853 bool needs_depth_clear = 854 check_needs_clear(state, 855 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, 856 ds_attachment->first_subpass, 857 ds_attachment->desc.loadOp, 858 subpass->do_depth_clear_with_draw); 859 860 bool needs_depth_store = 861 check_needs_store(state, 862 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, 863 ds_attachment->last_subpass, 864 ds_attachment->desc.storeOp) || 865 subpass->resolve_depth; 866 867 do_early_zs_clear = needs_depth_clear && !needs_depth_store; 868 if (do_early_zs_clear && 869 vk_format_has_stencil(ds_attachment->desc.format)) { 870 bool needs_stencil_load = 871 check_needs_load(state, 872 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, 873 ds_attachment->first_subpass, 874 ds_attachment->desc.stencilLoadOp); 875 876 bool needs_stencil_store = 877 check_needs_store(state, 878 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT, 879 ds_attachment->last_subpass, 880 ds_attachment->desc.stencilStoreOp) || 881 subpass->resolve_stencil; 882 883 do_early_zs_clear = !needs_stencil_load && !needs_stencil_store; 884 } 885 886 config.early_depth_stencil_clear = do_early_zs_clear; 887 } else { 888 config.early_z_disable = true; 889 } 890 } 891 892 /* If we enabled early Z/S clear, then we can't emit any "Clear Tile Buffers" 893 * commands with the Z/S bit set, so keep track of whether we enabled this 894 * in the job so we can skip these later. 
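    * (cmd_buffer_render_pass_emit_stores and the initial tile buffer clears
    * emitted below both check job->early_zs_clear for this reason)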
895 */ 896 job->early_zs_clear = do_early_zs_clear; 897 898 for (uint32_t i = 0; i < subpass->color_count; i++) { 899 uint32_t attachment_idx = subpass->color_attachments[i].attachment; 900 if (attachment_idx == VK_ATTACHMENT_UNUSED) 901 continue; 902 903 struct v3dv_image_view *iview = 904 state->attachments[attachment_idx].image_view; 905 906 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; 907 const struct v3d_resource_slice *slice = 908 &image->slices[iview->vk.base_mip_level]; 909 910 const uint32_t *clear_color = 911 &state->attachments[attachment_idx].clear_value.color[0]; 912 913 uint32_t clear_pad = 0; 914 if (slice->tiling == V3D_TILING_UIF_NO_XOR || 915 slice->tiling == V3D_TILING_UIF_XOR) { 916 int uif_block_height = v3d_utile_height(image->cpp) * 2; 917 918 uint32_t implicit_padded_height = 919 align(framebuffer->height, uif_block_height) / uif_block_height; 920 921 if (slice->padded_height_of_output_image_in_uif_blocks - 922 implicit_padded_height >= 15) { 923 clear_pad = slice->padded_height_of_output_image_in_uif_blocks; 924 } 925 } 926 927 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { 928 clear.clear_color_low_32_bits = clear_color[0]; 929 clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; 930 clear.render_target_number = i; 931 }; 932 933 if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) { 934 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) { 935 clear.clear_color_mid_low_32_bits = 936 ((clear_color[1] >> 24) | (clear_color[2] << 8)); 937 clear.clear_color_mid_high_24_bits = 938 ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8)); 939 clear.render_target_number = i; 940 }; 941 } 942 943 if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) { 944 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) { 945 clear.uif_padded_height_in_uif_blocks = clear_pad; 946 clear.clear_color_high_16_bits = clear_color[3] >> 16; 947 clear.render_target_number = i; 948 }; 949 } 950 } 951 952 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { 953 v3dX(cmd_buffer_render_pass_setup_render_target) 954 (cmd_buffer, 0, &rt.render_target_0_internal_bpp, 955 &rt.render_target_0_internal_type, &rt.render_target_0_clamp); 956 v3dX(cmd_buffer_render_pass_setup_render_target) 957 (cmd_buffer, 1, &rt.render_target_1_internal_bpp, 958 &rt.render_target_1_internal_type, &rt.render_target_1_clamp); 959 v3dX(cmd_buffer_render_pass_setup_render_target) 960 (cmd_buffer, 2, &rt.render_target_2_internal_bpp, 961 &rt.render_target_2_internal_type, &rt.render_target_2_clamp); 962 v3dX(cmd_buffer_render_pass_setup_render_target) 963 (cmd_buffer, 3, &rt.render_target_3_internal_bpp, 964 &rt.render_target_3_internal_type, &rt.render_target_3_clamp); 965 } 966 967 /* Ends rendering mode config. */ 968 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { 969 cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { 970 clear.z_clear_value = 971 state->attachments[ds_attachment_idx].clear_value.z; 972 clear.stencil_clear_value = 973 state->attachments[ds_attachment_idx].clear_value.s; 974 }; 975 } else { 976 cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { 977 clear.z_clear_value = 1.0f; 978 clear.stencil_clear_value = 0; 979 }; 980 } 981 982 /* Always set initial block size before the first branch, which needs 983 * to match the value from binning mode config. 
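    * (we always use 64-byte blocks here, matching the 64 bytes per tile used
    * to compute the per-layer tile_alloc_offset in
    * cmd_buffer_emit_render_pass_layer_rcl)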
984 */ 985 cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) { 986 init.use_auto_chained_tile_lists = true; 987 init.size_of_first_block_in_chained_tile_lists = 988 TILE_ALLOCATION_BLOCK_SIZE_64B; 989 } 990 991 cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) { 992 config.number_of_bin_tile_lists = 1; 993 config.total_frame_width_in_tiles = tiling->draw_tiles_x; 994 config.total_frame_height_in_tiles = tiling->draw_tiles_y; 995 996 config.supertile_width_in_tiles = tiling->supertile_width; 997 config.supertile_height_in_tiles = tiling->supertile_height; 998 999 config.total_frame_width_in_supertiles = 1000 tiling->frame_width_in_supertiles; 1001 config.total_frame_height_in_supertiles = 1002 tiling->frame_height_in_supertiles; 1003 } 1004 1005 /* Emit an initial clear of the tile buffers. This is necessary 1006 * for any buffers that should be cleared (since clearing 1007 * normally happens at the *end* of the generic tile list), but 1008 * it's also nice to clear everything so the first tile doesn't 1009 * inherit any contents from some previous frame. 1010 * 1011 * Also, implement the GFXH-1742 workaround. There's a race in 1012 * the HW between the RCL updating the TLB's internal type/size 1013 * and the spawning of the QPU instances using the TLB's current 1014 * internal type/size. To make sure the QPUs get the right 1015 * state, we need 1 dummy store in between internal type/size 1016 * changes on V3D 3.x, and 2 dummy stores on 4.x. 1017 */ 1018 for (int i = 0; i < 2; i++) { 1019 cl_emit(rcl, TILE_COORDINATES, coords); 1020 cl_emit(rcl, END_OF_LOADS, end); 1021 cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) { 1022 store.buffer_to_store = NONE; 1023 } 1024 if (cmd_buffer->state.tile_aligned_render_area && 1025 (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { 1026 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { 1027 clear.clear_z_stencil_buffer = !job->early_zs_clear; 1028 clear.clear_all_render_targets = true; 1029 } 1030 } 1031 cl_emit(rcl, END_OF_TILE_MARKER, end); 1032 } 1033 1034 cl_emit(rcl, FLUSH_VCD_CACHE, flush); 1035 1036 for (int layer = 0; layer < MAX2(1, fb_layers); layer++) { 1037 if (subpass->view_mask == 0 || (subpass->view_mask & (1u << layer))) 1038 cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer); 1039 } 1040 1041 cl_emit(rcl, END_OF_RENDERING, end); 1042} 1043 1044void 1045v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) 1046{ 1047 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; 1048 /* FIXME: right now we only support one viewport. 
viewports[0] would work
    * now, but we would need to change this if we allow multiple viewports.
    */
   float *vptranslate = dynamic->viewport.translate[0];
   float *vpscale = dynamic->viewport.scale[0];

   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   const uint32_t required_cl_size =
      cl_packet_length(CLIPPER_XY_SCALING) +
      cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) +
      cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) +
      cl_packet_length(VIEWPORT_OFFSET);
   v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
   v3dv_return_if_oom(cmd_buffer, NULL);

   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
      clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
      clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
   }

   cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
      clip.viewport_z_offset_zc_to_zs = vptranslate[2];
      clip.viewport_z_scale_zc_to_zs = vpscale[2];
   }
   cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
      /* Vulkan's Z NDC is [0..1], unlike OpenGL's, which is [-1, 1] */
      float z1 = vptranslate[2];
      float z2 = vptranslate[2] + vpscale[2];
      clip.minimum_zw = MIN2(z1, z2);
      clip.maximum_zw = MAX2(z1, z2);
   }

   cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
      vp.viewport_centre_x_coordinate = vptranslate[0];
      vp.viewport_centre_y_coordinate = vptranslate[1];
   }

   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT;
}

void
v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic;

   const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
                                           V3DV_DYNAMIC_STENCIL_WRITE_MASK |
                                           V3DV_DYNAMIC_STENCIL_REFERENCE;

   v3dv_cl_ensure_space_with_branch(&job->bcl,
                                    2 * cl_packet_length(STENCIL_CFG));
   v3dv_return_if_oom(cmd_buffer, NULL);

   bool emitted_stencil = false;
   for (uint32_t i = 0; i < 2; i++) {
      if (pipeline->emit_stencil_cfg[i]) {
         if (dynamic_state->mask & dynamic_stencil_states) {
            cl_emit_with_prepacked(&job->bcl, STENCIL_CFG,
                                   pipeline->stencil_cfg[i], config) {
               if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) {
                  config.stencil_test_mask =
                     i == 0 ? dynamic_state->stencil_compare_mask.front :
                              dynamic_state->stencil_compare_mask.back;
               }
               if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) {
                  config.stencil_write_mask =
                     i == 0 ? dynamic_state->stencil_write_mask.front :
                              dynamic_state->stencil_write_mask.back;
               }
               if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) {
                  config.stencil_ref_value =
                     i == 0 ?
dynamic_state->stencil_reference.front : 1126 dynamic_state->stencil_reference.back; 1127 } 1128 } 1129 } else { 1130 cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]); 1131 } 1132 1133 emitted_stencil = true; 1134 } 1135 } 1136 1137 if (emitted_stencil) { 1138 const uint32_t dynamic_stencil_dirty_flags = 1139 V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK | 1140 V3DV_CMD_DIRTY_STENCIL_WRITE_MASK | 1141 V3DV_CMD_DIRTY_STENCIL_REFERENCE; 1142 cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags; 1143 } 1144} 1145 1146void 1147v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) 1148{ 1149 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 1150 assert(pipeline); 1151 1152 if (!pipeline->depth_bias.enabled) 1153 return; 1154 1155 struct v3dv_job *job = cmd_buffer->state.job; 1156 assert(job); 1157 1158 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET)); 1159 v3dv_return_if_oom(cmd_buffer, NULL); 1160 1161 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; 1162 cl_emit(&job->bcl, DEPTH_OFFSET, bias) { 1163 bias.depth_offset_factor = dynamic->depth_bias.slope_factor; 1164 bias.depth_offset_units = dynamic->depth_bias.constant_factor; 1165 if (pipeline->depth_bias.is_z16) 1166 bias.depth_offset_units *= 256.0f; 1167 bias.limit = dynamic->depth_bias.depth_bias_clamp; 1168 } 1169 1170 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; 1171} 1172 1173void 1174v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer) 1175{ 1176 struct v3dv_job *job = cmd_buffer->state.job; 1177 assert(job); 1178 1179 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH)); 1180 v3dv_return_if_oom(cmd_buffer, NULL); 1181 1182 cl_emit(&job->bcl, LINE_WIDTH, line) { 1183 line.line_width = cmd_buffer->state.dynamic.line_width; 1184 } 1185 1186 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH; 1187} 1188 1189void 1190v3dX(cmd_buffer_emit_sample_state)(struct v3dv_cmd_buffer *cmd_buffer) 1191{ 1192 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 1193 assert(pipeline); 1194 1195 struct v3dv_job *job = cmd_buffer->state.job; 1196 assert(job); 1197 1198 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(SAMPLE_STATE)); 1199 v3dv_return_if_oom(cmd_buffer, NULL); 1200 1201 cl_emit(&job->bcl, SAMPLE_STATE, state) { 1202 state.coverage = 1.0f; 1203 state.mask = pipeline->sample_mask; 1204 } 1205} 1206 1207void 1208v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) 1209{ 1210 struct v3dv_job *job = cmd_buffer->state.job; 1211 assert(job); 1212 1213 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 1214 assert(pipeline); 1215 1216 const uint32_t blend_packets_size = 1217 cl_packet_length(BLEND_ENABLES) + 1218 cl_packet_length(BLEND_CONSTANT_COLOR) + 1219 cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS; 1220 1221 v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); 1222 v3dv_return_if_oom(cmd_buffer, NULL); 1223 1224 if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) { 1225 if (pipeline->blend.enables) { 1226 cl_emit(&job->bcl, BLEND_ENABLES, enables) { 1227 enables.mask = pipeline->blend.enables; 1228 } 1229 } 1230 1231 for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { 1232 if (pipeline->blend.enables & (1 << i)) 1233 cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); 1234 } 1235 } 1236 1237 if (pipeline->blend.needs_color_constants && 1238 cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) { 1239 struct v3dv_dynamic_state *dynamic = 
&cmd_buffer->state.dynamic; 1240 cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) { 1241 color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]); 1242 color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]); 1243 color.blue_f16 = _mesa_float_to_half(dynamic->blend_constants[2]); 1244 color.alpha_f16 = _mesa_float_to_half(dynamic->blend_constants[3]); 1245 } 1246 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS; 1247 } 1248} 1249 1250void 1251v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer) 1252{ 1253 struct v3dv_job *job = cmd_buffer->state.job; 1254 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS)); 1255 1256 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 1257 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; 1258 cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { 1259 mask.mask = (~dynamic->color_write_enable | 1260 pipeline->blend.color_write_masks) & 0xffff; 1261 } 1262 1263 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; 1264} 1265 1266static void 1267emit_flat_shade_flags(struct v3dv_job *job, 1268 int varying_offset, 1269 uint32_t varyings, 1270 enum V3DX(Varying_Flags_Action) lower, 1271 enum V3DX(Varying_Flags_Action) higher) 1272{ 1273 v3dv_cl_ensure_space_with_branch(&job->bcl, 1274 cl_packet_length(FLAT_SHADE_FLAGS)); 1275 v3dv_return_if_oom(NULL, job); 1276 1277 cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) { 1278 flags.varying_offset_v0 = varying_offset; 1279 flags.flat_shade_flags_for_varyings_v024 = varyings; 1280 flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower; 1281 flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher; 1282 } 1283} 1284 1285static void 1286emit_noperspective_flags(struct v3dv_job *job, 1287 int varying_offset, 1288 uint32_t varyings, 1289 enum V3DX(Varying_Flags_Action) lower, 1290 enum V3DX(Varying_Flags_Action) higher) 1291{ 1292 v3dv_cl_ensure_space_with_branch(&job->bcl, 1293 cl_packet_length(NON_PERSPECTIVE_FLAGS)); 1294 v3dv_return_if_oom(NULL, job); 1295 1296 cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) { 1297 flags.varying_offset_v0 = varying_offset; 1298 flags.non_perspective_flags_for_varyings_v024 = varyings; 1299 flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower; 1300 flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher; 1301 } 1302} 1303 1304static void 1305emit_centroid_flags(struct v3dv_job *job, 1306 int varying_offset, 1307 uint32_t varyings, 1308 enum V3DX(Varying_Flags_Action) lower, 1309 enum V3DX(Varying_Flags_Action) higher) 1310{ 1311 v3dv_cl_ensure_space_with_branch(&job->bcl, 1312 cl_packet_length(CENTROID_FLAGS)); 1313 v3dv_return_if_oom(NULL, job); 1314 1315 cl_emit(&job->bcl, CENTROID_FLAGS, flags) { 1316 flags.varying_offset_v0 = varying_offset; 1317 flags.centroid_flags_for_varyings_v024 = varyings; 1318 flags.action_for_centroid_flags_of_lower_numbered_varyings = lower; 1319 flags.action_for_centroid_flags_of_higher_numbered_varyings = higher; 1320 } 1321} 1322 1323static bool 1324emit_varying_flags(struct v3dv_job *job, 1325 uint32_t num_flags, 1326 const uint32_t *flags, 1327 void (*flag_emit_callback)(struct v3dv_job *job, 1328 int varying_offset, 1329 uint32_t flags, 1330 enum V3DX(Varying_Flags_Action) lower, 1331 enum V3DX(Varying_Flags_Action) higher)) 1332{ 1333 bool emitted_any = false; 1334 for (int i = 0; i < num_flags; i++) { 1335 if (!flags[i]) 1336 continue; 1337 1338 if (emitted_any) { 1339 
flag_emit_callback(job, i, flags[i],
                            V3D_VARYING_FLAGS_ACTION_UNCHANGED,
                            V3D_VARYING_FLAGS_ACTION_UNCHANGED);
      } else if (i == 0) {
         flag_emit_callback(job, i, flags[i],
                            V3D_VARYING_FLAGS_ACTION_UNCHANGED,
                            V3D_VARYING_FLAGS_ACTION_ZEROED);
      } else {
         flag_emit_callback(job, i, flags[i],
                            V3D_VARYING_FLAGS_ACTION_ZEROED,
                            V3D_VARYING_FLAGS_ACTION_ZEROED);
      }

      emitted_any = true;
   }

   return emitted_any;
}

void
v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;

   struct v3d_fs_prog_data *prog_data_fs =
      pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;

   const uint32_t num_flags =
      ARRAY_SIZE(prog_data_fs->flat_shade_flags);
   const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags;
   const uint32_t *noperspective_flags = prog_data_fs->noperspective_flags;
   const uint32_t *centroid_flags = prog_data_fs->centroid_flags;

   if (!emit_varying_flags(job, num_flags, flat_shade_flags,
                           emit_flat_shade_flags)) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
   }

   if (!emit_varying_flags(job, num_flags, noperspective_flags,
                           emit_noperspective_flags)) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
   }

   if (!emit_varying_flags(job, num_flags, centroid_flags,
                           emit_centroid_flags)) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
   }
}

/* Updates job early Z state tracking. Returns false if EZ must be disabled
 * for the current draw call.
 */
static bool
job_update_ez_state(struct v3dv_job *job,
                    struct v3dv_pipeline *pipeline,
                    struct v3dv_cmd_buffer *cmd_buffer)
{
   /* If first_ez_state is V3D_EZ_DISABLED it means that we have already
    * determined that we should disable EZ completely for all draw calls in
    * this job. This will cause us to disable EZ for the entire job in the
    * Tile Rendering Mode RCL packet and when we do that we need to make sure
    * we never emit a draw call in the job with EZ enabled in the CFG_BITS
    * packet, so ez_state must also be V3D_EZ_DISABLED.
    */
   if (job->first_ez_state == V3D_EZ_DISABLED) {
      assert(job->ez_state == V3D_EZ_DISABLED);
      return false;
   }

   /* If ez_state is V3D_EZ_DISABLED it means that we have already decided
    * that EZ must be disabled for the remainder of the frame.
    */
   if (job->ez_state == V3D_EZ_DISABLED)
      return false;

   /* This is part of the pre draw call handling, so we should be inside a
    * render pass.
    */
   assert(cmd_buffer->state.pass);

   /* If this is the first time we update EZ state for this job we first check
    * if there is anything that requires disabling it completely for the entire
    * job (based on state that is not related to the current draw call and
    * pipeline state).
    */
   if (!job->decided_global_ez_enable) {
      job->decided_global_ez_enable = true;

      struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
      assert(state->subpass_idx < state->pass->subpass_count);
      struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx];
      if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
         job->first_ez_state = V3D_EZ_DISABLED;
         job->ez_state = V3D_EZ_DISABLED;
         return false;
      }

      /* GFXH-1918: the early-z buffer may load incorrect depth values
       * if the frame has odd width or height.
       *
       * So we need to disable EZ in this case.
       */
      const struct v3dv_render_pass_attachment *ds_attachment =
         &state->pass->attachments[subpass->ds_attachment.attachment];

      const VkImageAspectFlags ds_aspects =
         vk_format_aspects(ds_attachment->desc.format);

      bool needs_depth_load =
         check_needs_load(state,
                          ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                          ds_attachment->first_subpass,
                          ds_attachment->desc.loadOp);

      if (needs_depth_load) {
         struct v3dv_framebuffer *fb = state->framebuffer;

         if (!fb) {
            assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
            perf_debug("Loading depth aspect in a secondary command buffer "
                       "without framebuffer info disables early-z tests.\n");
            job->first_ez_state = V3D_EZ_DISABLED;
            job->ez_state = V3D_EZ_DISABLED;
            return false;
         }

         if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) {
            perf_debug("Loading depth aspect for framebuffer with odd width "
                       "or height disables early-Z tests.\n");
            job->first_ez_state = V3D_EZ_DISABLED;
            job->ez_state = V3D_EZ_DISABLED;
            return false;
         }
      }
   }

   /* Otherwise, we can decide to selectively enable or disable EZ for draw
    * calls using the CFG_BITS packet based on the bound pipeline state.
    */
   bool disable_ez = false;
   bool incompatible_test = false;
   switch (pipeline->ez_state) {
   case V3D_EZ_UNDECIDED:
      /* If the pipeline didn't pick a direction but didn't disable, then go
       * along with the current EZ state. This allows EZ optimization for Z
       * func == EQUAL or NEVER.
       */
      break;

   case V3D_EZ_LT_LE:
   case V3D_EZ_GT_GE:
      /* If the pipeline picked a direction, then it needs to match the current
       * direction if we've decided on one.
       */
      if (job->ez_state == V3D_EZ_UNDECIDED) {
         job->ez_state = pipeline->ez_state;
      } else if (job->ez_state != pipeline->ez_state) {
         disable_ez = true;
         incompatible_test = true;
      }
      break;

   case V3D_EZ_DISABLED:
      disable_ez = true;
      incompatible_test = pipeline->incompatible_ez_test;
      break;
   }

   if (job->first_ez_state == V3D_EZ_UNDECIDED && !disable_ez) {
      assert(job->ez_state != V3D_EZ_DISABLED);
      job->first_ez_state = job->ez_state;
   }

   /* If we had to disable EZ because of an incompatible test direction and
    * the pipeline writes depth then we need to disable EZ for the rest of
    * the frame.
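    * (if the pipeline does not write depth, the mismatch only disables EZ
    * for this draw call and job->ez_state keeps its current direction for
    * later draws)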
1528 */ 1529 if (incompatible_test && pipeline->z_updates_enable) { 1530 assert(disable_ez); 1531 job->ez_state = V3D_EZ_DISABLED; 1532 } 1533 1534 if (!disable_ez) 1535 job->has_ez_draws = true; 1536 1537 return !disable_ez; 1538} 1539 1540void 1541v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer) 1542{ 1543 struct v3dv_job *job = cmd_buffer->state.job; 1544 assert(job); 1545 1546 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; 1547 assert(pipeline); 1548 1549 bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); 1550 1551 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); 1552 v3dv_return_if_oom(cmd_buffer, NULL); 1553 1554 cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { 1555 config.early_z_enable = enable_ez; 1556 config.early_z_updates_enable = config.early_z_enable && 1557 pipeline->z_updates_enable; 1558 } 1559} 1560 1561void 1562v3dX(cmd_buffer_emit_occlusion_query)(struct v3dv_cmd_buffer *cmd_buffer) 1563{ 1564 struct v3dv_job *job = cmd_buffer->state.job; 1565 assert(job); 1566 1567 v3dv_cl_ensure_space_with_branch(&job->bcl, 1568 cl_packet_length(OCCLUSION_QUERY_COUNTER)); 1569 v3dv_return_if_oom(cmd_buffer, NULL); 1570 1571 cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) { 1572 if (cmd_buffer->state.query.active_query.bo) { 1573 counter.address = 1574 v3dv_cl_address(cmd_buffer->state.query.active_query.bo, 1575 cmd_buffer->state.query.active_query.offset); 1576 } 1577 } 1578 1579 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY; 1580} 1581 1582static struct v3dv_job * 1583cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer, 1584 bool is_bcl_barrier) 1585{ 1586 assert(cmd_buffer->state.subpass_idx != -1); 1587 v3dv_cmd_buffer_finish_job(cmd_buffer); 1588 struct v3dv_job *job = 1589 v3dv_cmd_buffer_subpass_resume(cmd_buffer, 1590 cmd_buffer->state.subpass_idx); 1591 if (!job) 1592 return NULL; 1593 1594 /* FIXME: we can do better than all barriers */ 1595 job->serialize = V3DV_BARRIER_ALL; 1596 job->needs_bcl_sync = is_bcl_barrier; 1597 return job; 1598} 1599 1600static void 1601cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary, 1602 struct v3dv_cmd_buffer *secondary) 1603{ 1604 struct v3dv_cmd_buffer_state *p_state = &primary->state; 1605 struct v3dv_cmd_buffer_state *s_state = &secondary->state; 1606 1607 const uint32_t total_state_count = 1608 p_state->query.end.used_count + s_state->query.end.used_count; 1609 v3dv_cmd_buffer_ensure_array_state(primary, 1610 sizeof(struct v3dv_end_query_cpu_job_info), 1611 total_state_count, 1612 &p_state->query.end.alloc_count, 1613 (void **) &p_state->query.end.states); 1614 v3dv_return_if_oom(primary, NULL); 1615 1616 for (uint32_t i = 0; i < s_state->query.end.used_count; i++) { 1617 const struct v3dv_end_query_cpu_job_info *s_qstate = 1618 &secondary->state.query.end.states[i]; 1619 1620 struct v3dv_end_query_cpu_job_info *p_qstate = 1621 &p_state->query.end.states[p_state->query.end.used_count++]; 1622 1623 p_qstate->pool = s_qstate->pool; 1624 p_qstate->query = s_qstate->query; 1625 } 1626} 1627 1628void 1629v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary, 1630 uint32_t cmd_buffer_count, 1631 const VkCommandBuffer *cmd_buffers) 1632{ 1633 assert(primary->state.job); 1634 1635 /* Emit occlusion query state if needed so the draw calls inside our 1636 * secondaries update the counters. 
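    * (the secondaries' draw calls run from the primary's BCL via
    * BRANCH_TO_SUB_LIST, so the OCCLUSION_QUERY_COUNTER address has to be
    * emitted in the primary before we branch into them)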
void
v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
                                     uint32_t cmd_buffer_count,
                                     const VkCommandBuffer *cmd_buffers)
{
   assert(primary->state.job);

   /* Emit occlusion query state if needed so the draw calls inside our
    * secondaries update the counters.
    */
   bool has_occlusion_query =
      primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY;
   if (has_occlusion_query)
      v3dX(cmd_buffer_emit_occlusion_query)(primary);

   /* FIXME: if our primary job tiling doesn't enable MSAA but any of the
    * pipelines used by the secondaries do, we need to re-start the primary
    * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
    */
   struct v3dv_barrier_state pending_barrier = { 0 };
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);

      assert(secondary->usage_flags &
             VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);

      list_for_each_entry(struct v3dv_job, secondary_job,
                          &secondary->jobs, list_link) {
         if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
            /* If the job is a CL, then we branch to it from the primary BCL.
             * In this case the secondary's BCL is finished with a
             * RETURN_FROM_SUB_LIST command to return to the primary BCL
             * once we are done executing it.
             */
            assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
            assert(secondary_job->bcl.bo);

            /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */
            STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1);
            assert(v3dv_cl_offset(&secondary_job->bcl) >= 1);
            assert(*(((uint8_t *)secondary_job->bcl.next) - 1) ==
                   V3DX(RETURN_FROM_SUB_LIST_opcode));

            /* If this secondary has any barriers (or we had any pending barrier
             * to apply), then we can't just branch to it from the primary, we
             * need to split the primary to create a new job that can consume
             * the barriers first.
             *
             * FIXME: in this case, maybe just copy the secondary BCL without
             * the RETURN_FROM_SUB_LIST into the primary job to skip the
             * branch?
             */
            struct v3dv_job *primary_job = primary->state.job;
            if (!primary_job || secondary_job->serialize ||
                pending_barrier.dst_mask) {
               const bool needs_bcl_barrier =
                  secondary_job->needs_bcl_sync ||
                  pending_barrier.bcl_buffer_access ||
                  pending_barrier.bcl_image_access;

               primary_job =
                  cmd_buffer_subpass_split_for_barrier(primary,
                                                       needs_bcl_barrier);
               v3dv_return_if_oom(primary, NULL);

               /* Since we have created a new primary we need to re-emit
                * occlusion query state.
                */
               if (has_occlusion_query)
                  v3dX(cmd_buffer_emit_occlusion_query)(primary);
            }

            /* Make sure our primary job has all required BO references */
            set_foreach(secondary_job->bos, entry) {
               struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
               v3dv_job_add_bo(primary_job, bo);
            }

            /* Emit required branch instructions. We expect each of these
             * to end with a corresponding 'return from sub list' item.
             */
            list_for_each_entry(struct v3dv_bo, bcl_bo,
                                &secondary_job->bcl.bo_list, list_link) {
               v3dv_cl_ensure_space_with_branch(&primary_job->bcl,
                                                cl_packet_length(BRANCH_TO_SUB_LIST));
               v3dv_return_if_oom(primary, NULL);
               cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
                  branch.address = v3dv_cl_address(bcl_bo, 0);
               }
            }

            primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl;
         } else {
            /* This is a regular job (CPU or GPU), so just finish the current
             * primary job (if any) and then add the secondary job to the
             * primary's job list right after it.
             */
            v3dv_cmd_buffer_finish_job(primary);
            v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
            if (pending_barrier.dst_mask) {
               /* FIXME: do the same as we do for primaries and only choose
                * the relevant src masks.
                */
               secondary_job->serialize = pending_barrier.src_mask_graphics |
                                          pending_barrier.src_mask_transfer |
                                          pending_barrier.src_mask_compute;
               if (pending_barrier.bcl_buffer_access ||
                   pending_barrier.bcl_image_access) {
                  secondary_job->needs_bcl_sync = true;
               }
            }
         }

         memset(&pending_barrier, 0, sizeof(pending_barrier));
      }

      /* If the secondary has recorded any vkCmdEndQuery commands, we need to
       * copy this state to the primary so it is processed properly when the
       * current primary job is finished.
       */
      cmd_buffer_copy_secondary_end_query_state(primary, secondary);

      /* If this secondary had any pending barrier state we will need that
       * barrier state consumed with whatever comes next in the primary.
       */
      assert(secondary->state.barrier.dst_mask ||
             (!secondary->state.barrier.bcl_buffer_access &&
              !secondary->state.barrier.bcl_image_access));

      pending_barrier = secondary->state.barrier;
   }

   if (pending_barrier.dst_mask) {
      v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
                                          &pending_barrier);
   }
}
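
/* Added commentary (not part of the original source): the helpers below emit
 * the extra state records needed when a geometry shader is bound. They pack
 * into the job's indirect CL, and v3dX(cmd_buffer_emit_gl_shader_state)
 * later points a GL_SHADER_STATE_INCLUDING_GS packet at that indirect BO, so
 * in this driver they are emitted back to back, in this order, right before
 * the GL_SHADER_STATE_RECORD and the attribute records.
 */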
static void
emit_gs_shader_state_record(struct v3dv_job *job,
                            struct v3dv_bo *assembly_bo,
                            struct v3dv_shader_variant *gs_bin,
                            struct v3dv_cl_reloc gs_bin_uniforms,
                            struct v3dv_shader_variant *gs,
                            struct v3dv_cl_reloc gs_render_uniforms)
{
   cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) {
      shader.geometry_bin_mode_shader_code_address =
         v3dv_cl_address(assembly_bo, gs_bin->assembly_offset);
      shader.geometry_bin_mode_shader_4_way_threadable =
         gs_bin->prog_data.gs->base.threads == 4;
      shader.geometry_bin_mode_shader_start_in_final_thread_section =
         gs_bin->prog_data.gs->base.single_seg;
      shader.geometry_bin_mode_shader_propagate_nans = true;
      shader.geometry_bin_mode_shader_uniforms_address =
         gs_bin_uniforms;

      shader.geometry_render_mode_shader_code_address =
         v3dv_cl_address(assembly_bo, gs->assembly_offset);
      shader.geometry_render_mode_shader_4_way_threadable =
         gs->prog_data.gs->base.threads == 4;
      shader.geometry_render_mode_shader_start_in_final_thread_section =
         gs->prog_data.gs->base.single_seg;
      shader.geometry_render_mode_shader_propagate_nans = true;
      shader.geometry_render_mode_shader_uniforms_address =
         gs_render_uniforms;
   }
}

static uint8_t
v3d_gs_output_primitive(enum shader_prim prim_type)
{
   switch (prim_type) {
   case SHADER_PRIM_POINTS:
      return GEOMETRY_SHADER_POINTS;
   case SHADER_PRIM_LINE_STRIP:
      return GEOMETRY_SHADER_LINE_STRIP;
   case SHADER_PRIM_TRIANGLE_STRIP:
      return GEOMETRY_SHADER_TRI_STRIP;
   default:
      unreachable("Unsupported primitive type");
   }
}

static void
emit_tes_gs_common_params(struct v3dv_job *job,
                          uint8_t gs_out_prim_type,
                          uint8_t gs_num_invocations)
{
   cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) {
      shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE;
      shader.tessellation_point_mode = false;
      shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN;
      shader.tessellation_clockwise = true;
      shader.tessellation_invocations = 1;

      shader.geometry_shader_output_format =
         v3d_gs_output_primitive(gs_out_prim_type);
      shader.geometry_shader_instances = gs_num_invocations & 0x1F;
   }
}

static uint8_t
simd_width_to_gs_pack_mode(uint32_t width)
{
   switch (width) {
   case 16:
      return V3D_PACK_MODE_16_WAY;
   case 8:
      return V3D_PACK_MODE_8_WAY;
   case 4:
      return V3D_PACK_MODE_4_WAY;
   case 1:
      return V3D_PACK_MODE_1_WAY;
   default:
      unreachable("Invalid SIMD width");
   }
}

static void
emit_tes_gs_shader_params(struct v3dv_job *job,
                          uint32_t gs_simd,
                          uint32_t gs_vpm_output_size,
                          uint32_t gs_max_vpm_input_size_per_batch)
{
   cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
      shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
      shader.per_patch_data_column_depth = 1;
      shader.tcs_output_segment_size_in_sectors = 1;
      shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
      shader.tes_output_segment_size_in_sectors = 1;
      shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
      shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
      shader.gs_output_segment_pack_mode =
         simd_width_to_gs_pack_mode(gs_simd);
      shader.tbg_max_patches_per_tcs_batch = 1;
      shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
      shader.tbg_min_tcs_output_segments_required_in_play = 1;
      shader.tbg_min_per_patch_data_segments_required_in_play = 1;
      shader.tpg_max_patches_per_tes_batch = 1;
      shader.tpg_max_vertex_segments_per_tes_batch = 0;
      shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
      shader.tpg_min_tes_output_segments_required_in_play = 1;
      shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
         gs_max_vpm_input_size_per_batch;
      shader.gbg_min_gs_output_segments_required_in_play = 1;
   }
}

void
v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   struct v3dv_pipeline *pipeline = state->gfx.pipeline;
   assert(pipeline);

   struct v3dv_shader_variant *vs_variant =
      pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
   struct v3d_vs_prog_data *prog_data_vs = vs_variant->prog_data.vs;

   struct v3dv_shader_variant *vs_bin_variant =
      pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
   struct v3d_vs_prog_data *prog_data_vs_bin = vs_bin_variant->prog_data.vs;

   struct v3dv_shader_variant *fs_variant =
      pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
   struct v3d_fs_prog_data *prog_data_fs = fs_variant->prog_data.fs;

   struct v3dv_shader_variant *gs_variant = NULL;
   struct v3dv_shader_variant *gs_bin_variant = NULL;
   struct v3d_gs_prog_data *prog_data_gs = NULL;
   struct v3d_gs_prog_data *prog_data_gs_bin = NULL;
   if (pipeline->has_gs) {
      gs_variant =
         pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
      prog_data_gs = gs_variant->prog_data.gs;

      gs_bin_variant =
         pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
      prog_data_gs_bin = gs_bin_variant->prog_data.gs;
   }

   /* Update the cache dirty flag based on the shader progs data */
   job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl;
   job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl;
   job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl;
   if (pipeline->has_gs) {
      job->tmu_dirty_rcl |= prog_data_gs_bin->base.tmu_dirty_rcl;
      job->tmu_dirty_rcl |= prog_data_gs->base.tmu_dirty_rcl;
   }

   /* See GFXH-930 workaround below */
   uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);

   uint32_t shader_state_record_length =
      cl_packet_length(GL_SHADER_STATE_RECORD);
   if (pipeline->has_gs) {
      shader_state_record_length +=
         cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) +
         cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) +
         2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS);
   }

   uint32_t shader_rec_offset =
      v3dv_cl_ensure_space(&job->indirect,
                           shader_state_record_length +
                           num_elements_to_emit *
                           cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
                           32);
   v3dv_return_if_oom(cmd_buffer, NULL);

   struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo;

   if (pipeline->has_gs) {
      emit_gs_shader_state_record(job,
                                  assembly_bo,
                                  gs_bin_variant,
                                  cmd_buffer->state.uniforms.gs_bin,
                                  gs_variant,
                                  cmd_buffer->state.uniforms.gs);

      emit_tes_gs_common_params(job,
                                prog_data_gs->out_prim_type,
                                prog_data_gs->num_invocations);

      emit_tes_gs_shader_params(job,
                                pipeline->vpm_cfg_bin.gs_width,
                                pipeline->vpm_cfg_bin.Gd,
                                pipeline->vpm_cfg_bin.Gv);

      emit_tes_gs_shader_params(job,
                                pipeline->vpm_cfg.gs_width,
                                pipeline->vpm_cfg.Gd,
                                pipeline->vpm_cfg.Gv);
   }
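
   /* Added commentary (assumption, not part of the original source): the
    * default attribute values BO referenced below supplies the values used
    * for vertex attribute components that the bound vertex format does not
    * provide. A pipeline provides its own BO when it needs non-float
    * defaults; otherwise the device-wide float defaults are used.
    */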
   struct v3dv_bo *default_attribute_values =
      pipeline->default_attribute_values != NULL ?
      pipeline->default_attribute_values :
      pipeline->device->default_attribute_float;

   cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
                          pipeline->shader_state_record, shader) {

      /* FIXME: we are setting these values both here and during the
       * prepacking. This is because both cl_emit_with_prepacked and
       * v3dvx_pack assert on minimum values of these. It would be good to
       * get v3dvx_pack to assert on the final value if possible.
       */
      shader.min_coord_shader_input_segments_required_in_play =
         pipeline->vpm_cfg_bin.As;
      shader.min_vertex_shader_input_segments_required_in_play =
         pipeline->vpm_cfg.As;

      shader.coordinate_shader_code_address =
         v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
      shader.vertex_shader_code_address =
         v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
      shader.fragment_shader_code_address =
         v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);

      shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
      shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
      shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;

      shader.address_of_default_attribute_values =
         v3dv_cl_address(default_attribute_values, 0);

      shader.any_shader_reads_hardware_written_primitive_id =
         (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
      shader.insert_primitive_id_as_first_varying_to_fragment_shader =
         !pipeline->has_gs && prog_data_fs->uses_pid;
   }

   /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
   bool cs_loaded_any = false;
   const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
                                 prog_data_vs_bin->uses_biid ||
                                 prog_data_vs_bin->uses_vid;
   const uint32_t packet_length =
      cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);

   uint32_t emitted_va_count = 0;
   for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) {
      assert(i < MAX_VERTEX_ATTRIBS);

      if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED)
         continue;

      const uint32_t binding = pipeline->va[i].binding;

      /* We store each vertex attribute in the array using its driver location
       * as index.
       */
      const uint32_t location = i;

      struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding];

      cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD,
                             &pipeline->vertex_attrs[i * packet_length], attr) {

         assert(c_vb->buffer->mem->bo);
         attr.address = v3dv_cl_address(c_vb->buffer->mem->bo,
                                        c_vb->buffer->mem_offset +
                                        pipeline->va[i].offset +
                                        c_vb->offset);

         attr.number_of_values_read_by_coordinate_shader =
            prog_data_vs_bin->vattr_sizes[location];
         attr.number_of_values_read_by_vertex_shader =
            prog_data_vs->vattr_sizes[location];

         /* GFXH-930: At least one attribute must be enabled and read by CS
          * and VS. If we have attributes being consumed by the VS but not
          * the CS, then set up a dummy load of the last attribute into the
          * CS's VPM inputs. (Since CS is just dead-code-elimination compared
          * to VS, we can't have CS loading but not VS).
          *
          * GFXH-1602: first attribute must be active if using builtins.
          */
         if (prog_data_vs_bin->vattr_sizes[location])
            cs_loaded_any = true;

         if (i == 0 && cs_uses_builtins && !cs_loaded_any) {
            attr.number_of_values_read_by_coordinate_shader = 1;
            cs_loaded_any = true;
         } else if (i == pipeline->va_count - 1 && !cs_loaded_any) {
            attr.number_of_values_read_by_coordinate_shader = 1;
            cs_loaded_any = true;
         }

         attr.maximum_index = 0xffffff;
      }

      emitted_va_count++;
   }

   if (pipeline->va_count == 0) {
      /* GFXH-930: At least one attribute must be enabled and read
       * by CS and VS. If we have no attributes being consumed by
       * the shader, set up a dummy to be loaded into the VPM.
       */
      cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
         /* Valid address of data whose value will be unused. */
         attr.address = v3dv_cl_address(job->indirect.bo, 0);

         attr.type = ATTRIBUTE_FLOAT;
         attr.stride = 0;
         attr.vec_size = 1;

         attr.number_of_values_read_by_coordinate_shader = 1;
         attr.number_of_values_read_by_vertex_shader = 1;
      }
   }

   if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
      v3dv_cl_ensure_space_with_branch(&job->bcl,
                                       sizeof(pipeline->vcm_cache_size));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size);
   }

   v3dv_cl_ensure_space_with_branch(&job->bcl,
                                    cl_packet_length(GL_SHADER_STATE));
   v3dv_return_if_oom(cmd_buffer, NULL);

   if (pipeline->has_gs) {
      cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) {
         state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
         state.number_of_attribute_arrays = num_elements_to_emit;
      }
   } else {
      cl_emit(&job->bcl, GL_SHADER_STATE, state) {
         state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
         state.number_of_attribute_arrays = num_elements_to_emit;
      }
   }

   /* Clearing push constants and descriptor sets for all stages is not quite
    * correct (some shader stages may not be used at all or they may not be
    * consuming push constants), however this is not relevant because if we
    * bind a different pipeline we always have to rebuild the uniform streams.
    */
   cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER |
                                V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
                                V3DV_CMD_DIRTY_PUSH_CONSTANTS);
   cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
   cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
}

void
v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer,
                           struct v3dv_draw_info *info)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   struct v3dv_pipeline *pipeline = state->gfx.pipeline;

   assert(pipeline);

   uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);

   if (info->first_instance > 0) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
         base.base_instance = info->first_instance;
         base.base_vertex = 0;
      }
   }

   if (info->instance_count > 1) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
         prim.mode = hw_prim_type;
         prim.index_of_first_vertex = info->first_vertex;
         prim.number_of_instances = info->instance_count;
         prim.instance_length = info->vertex_count;
      }
   } else {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS));
      v3dv_return_if_oom(cmd_buffer, NULL);
      cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
         prim.mode = hw_prim_type;
         prim.length = info->vertex_count;
         prim.index_of_first_vertex = info->first_vertex;
      }
   }
}
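
/* Usage sketch (added commentary, not part of the original source, assuming
 * v3dv_draw_info mirrors the vkCmdDraw parameters): a vkCmdDraw(cmd, 6, 1,
 * 0, 0) records a single VERTEX_ARRAY_PRIMS packet with length 6, while
 * vkCmdDraw(cmd, 6, 4, 0, 2) additionally records BASE_VERTEX_BASE_INSTANCE
 * with base_instance = 2 and uses the instanced variant with
 * number_of_instances = 4 and instance_length = 6.
 */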

void
v3dX(cmd_buffer_emit_index_buffer)(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   /* We flag all state as dirty when we create a new job so make sure we
    * have a valid index buffer before attempting to emit state for it.
    */
   struct v3dv_buffer *ibuffer =
      v3dv_buffer_from_handle(cmd_buffer->state.index_buffer.buffer);
   if (ibuffer) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP));
      v3dv_return_if_oom(cmd_buffer, NULL);

      const uint32_t offset = cmd_buffer->state.index_buffer.offset;
      cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
         ib.address = v3dv_cl_address(ibuffer->mem->bo,
                                      ibuffer->mem_offset + offset);
         ib.size = ibuffer->mem->bo->size;
      }
   }

   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER;
}

void
v3dX(cmd_buffer_emit_draw_indexed)(struct v3dv_cmd_buffer *cmd_buffer,
                                   uint32_t indexCount,
                                   uint32_t instanceCount,
                                   uint32_t firstIndex,
                                   int32_t vertexOffset,
                                   uint32_t firstInstance)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);
   /* ffs() maps the index size in bytes (1, 2, 4) to the packet's index_type
    * encoding (0, 1, 2).
    */
   uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
   uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size;

   if (vertexOffset != 0 || firstInstance != 0) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
         base.base_instance = firstInstance;
         base.base_vertex = vertexOffset;
      }
   }

   if (instanceCount == 1) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(INDEXED_PRIM_LIST));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
         prim.index_type = index_type;
         prim.length = indexCount;
         prim.index_offset = index_offset;
         prim.mode = hw_prim_type;
         prim.enable_primitive_restarts = pipeline->primitive_restart;
      }
   } else if (instanceCount > 1) {
      v3dv_cl_ensure_space_with_branch(
         &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST));
      v3dv_return_if_oom(cmd_buffer, NULL);

      cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
         prim.index_type = index_type;
         prim.index_offset = index_offset;
         prim.mode = hw_prim_type;
         prim.enable_primitive_restarts = pipeline->primitive_restart;
         prim.number_of_instances = instanceCount;
         prim.instance_length = indexCount;
      }
   }
}

void
v3dX(cmd_buffer_emit_draw_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
                                    struct v3dv_buffer *buffer,
                                    VkDeviceSize offset,
                                    uint32_t drawCount,
                                    uint32_t stride)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);

   v3dv_cl_ensure_space_with_branch(
      &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS));
   v3dv_return_if_oom(cmd_buffer, NULL);

   cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
      prim.mode = hw_prim_type;
      prim.number_of_draw_indirect_array_records = drawCount;
      prim.stride_in_multiples_of_4_bytes = stride >> 2;
      prim.address = v3dv_cl_address(buffer->mem->bo,
                                     buffer->mem_offset + offset);
   }
}
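
/* Added commentary (not part of the original source): the indirect packets
 * above and below take the record stride in multiples of 4 bytes, hence the
 * stride >> 2. For example, a vkCmdDrawIndirect() using the tightly packed
 * stride of sizeof(VkDrawIndirectCommand) == 16 bytes is programmed as
 * stride_in_multiples_of_4_bytes = 4.
 */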

void
v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
                                       struct v3dv_buffer *buffer,
                                       VkDeviceSize offset,
                                       uint32_t drawCount,
                                       uint32_t stride)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);
   uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;

   v3dv_cl_ensure_space_with_branch(
      &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST));
   v3dv_return_if_oom(cmd_buffer, NULL);

   cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) {
      prim.index_type = index_type;
      prim.mode = hw_prim_type;
      prim.enable_primitive_restarts = pipeline->primitive_restart;
      prim.number_of_draw_indirect_indexed_records = drawCount;
      prim.stride_in_multiples_of_4_bytes = stride >> 2;
      prim.address = v3dv_cl_address(buffer->mem->bo,
                                     buffer->mem_offset + offset);
   }
}

void
v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
                                                 int rt,
                                                 uint32_t *rt_bpp,
                                                 uint32_t *rt_type,
                                                 uint32_t *rt_clamp)
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   assert(state->subpass_idx < state->pass->subpass_count);
   const struct v3dv_subpass *subpass =
      &state->pass->subpasses[state->subpass_idx];

   if (rt >= subpass->color_count)
      return;

   struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
   const uint32_t attachment_idx = attachment->attachment;
   if (attachment_idx == VK_ATTACHMENT_UNUSED)
      return;

   assert(attachment_idx < state->framebuffer->attachment_count &&
          attachment_idx < state->attachment_alloc_count);
   struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
   assert(vk_format_is_color(iview->vk.format));

   *rt_bpp = iview->internal_bpp;
   *rt_type = iview->internal_type;
   if (vk_format_is_int(iview->vk.view_format))
      *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
   else if (vk_format_is_srgb(iview->vk.view_format))
      *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
   else
      *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
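
/* Added commentary (assumption, not part of the original source): the clamp
 * mode chosen above controls how the tile buffer clamps color output for the
 * render target: integer formats use the integer clamp, sRGB formats clamp
 * to the normalized [0, 1] range, and everything else is left unclamped. For
 * example, a VK_FORMAT_R8G8B8A8_SRGB attachment would select
 * V3D_RENDER_TARGET_CLAMP_NORM.
 */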