/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "hwdef/rogue_hw_defs.h"
#include "hwdef/rogue_hw_utils.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_device_info.h"
#include "pvr_end_of_tile.h"
#include "pvr_formats.h"
#include "pvr_hw_pass.h"
#include "pvr_job_common.h"
#include "pvr_job_render.h"
#include "pvr_limits.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_types.h"
#include "pvr_winsys.h"
#include "util/bitscan.h"
#include "util/compiler.h"
#include "util/list.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_pack_color.h"
#include "vk_alloc.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_format.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_util.h"

/* Structure used to pass data into pvr_compute_generate_control_stream()
 * function.
 */
struct pvr_compute_kernel_info {
   pvr_dev_addr_t indirect_buffer_addr;
   bool global_offsets_present;
   uint32_t usc_common_size;
   uint32_t usc_unified_size;
   uint32_t pds_temp_size;
   uint32_t pds_data_size;
   enum PVRX(CDMCTRL_USC_TARGET) usc_target;
   bool is_fence;
   uint32_t pds_data_offset;
   uint32_t pds_code_offset;
   enum PVRX(CDMCTRL_SD_TYPE) sd_type;
   bool usc_common_shared;
   uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
   uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
   uint32_t max_instances;
};

static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
                                        struct pvr_sub_cmd *sub_cmd)
{
   switch (sub_cmd->type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS:
      pvr_csb_finish(&sub_cmd->gfx.control_stream);
      pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.depth_bias_bo);
      pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.scissor_bo);
      break;

   case PVR_SUB_CMD_TYPE_COMPUTE:
      pvr_csb_finish(&sub_cmd->compute.control_stream);
      break;

   case PVR_SUB_CMD_TYPE_TRANSFER:
      list_for_each_entry_safe (struct pvr_transfer_cmd,
                                transfer_cmd,
                                &sub_cmd->transfer.transfer_cmds,
                                link) {
         list_del(&transfer_cmd->link);
         vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
      }
      break;

   default:
      pvr_finishme("Unsupported sub-command type %d", sub_cmd->type);
      break;
   }

   list_del(&sub_cmd->link);
   vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
}

static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
{
   list_for_each_entry_safe (struct pvr_sub_cmd,
                             sub_cmd,
                             &cmd_buffer->sub_cmds,
                             link) {
      pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
   }
}

static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
   struct pvr_cmd_buffer *cmd_buffer =
      container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);

   vk_free(&cmd_buffer->vk.pool->alloc,
           cmd_buffer->state.render_pass_info.attachments);
   vk_free(&cmd_buffer->vk.pool->alloc,
           cmd_buffer->state.render_pass_info.clear_values);

   pvr_cmd_buffer_free_sub_cmds(cmd_buffer);

   list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) {
      list_del(&bo->link);
      pvr_bo_free(cmd_buffer->device, bo);
   }

   util_dynarray_fini(&cmd_buffer->scissor_array);
   util_dynarray_fini(&cmd_buffer->depth_bias_array);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}

static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
                                      struct vk_command_pool *pool,
                                      VkCommandBufferLevel level,
                                      VkCommandBuffer *pCommandBuffer)
{
   struct pvr_cmd_buffer *cmd_buffer;
   VkResult result;

   cmd_buffer = vk_zalloc(&pool->alloc,
                          sizeof(*cmd_buffer),
                          8U,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!cmd_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   result = vk_command_buffer_init(&cmd_buffer->vk, pool, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->vk.destroy = pvr_cmd_buffer_destroy;
   cmd_buffer->device = device;

   util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
   util_dynarray_init(&cmd_buffer->scissor_array, NULL);

   cmd_buffer->state.status = VK_SUCCESS;
   cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL;

   list_inithead(&cmd_buffer->sub_cmds);
   list_inithead(&cmd_buffer->bo_list);

   *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

VkResult
pvr_AllocateCommandBuffers(VkDevice _device,
                           const VkCommandBufferAllocateInfo *pAllocateInfo,
                           VkCommandBuffer *pCommandBuffers)
{
   VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
      result = pvr_cmd_buffer_create(device,
                                     pool,
                                     pAllocateInfo->level,
                                     &pCommandBuffers[i]);
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      while (i--) {
         VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
         pvr_cmd_buffer_destroy(cmd_buffer);
      }

      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
         pCommandBuffers[i] = VK_NULL_HANDLE;
   }

   return result;
}

static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
                                           enum pvr_sub_cmd_type type)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   uint32_t barriers;

   switch (type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS:
      barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
      break;

   case PVR_SUB_CMD_TYPE_COMPUTE:
      barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
      break;

   case PVR_SUB_CMD_TYPE_TRANSFER:
      barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
      break;

   default:
      barriers = 0;
      pvr_finishme("Unsupported sub-command type %d", type);
      break;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
      state->barriers_needed[i] |= barriers;
}

static VkResult
pvr_cmd_buffer_upload_tables(struct pvr_device *device,
                             struct pvr_cmd_buffer *cmd_buffer,
                             struct pvr_sub_cmd_gfx *const sub_cmd)
{
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   VkResult result;

   assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);

   if (cmd_buffer->depth_bias_array.size > 0) {
      result =
         pvr_gpu_upload(device,
                        device->heaps.general_heap,
                        util_dynarray_begin(&cmd_buffer->depth_bias_array),
                        cmd_buffer->depth_bias_array.size,
                        cache_line_size,
                        &sub_cmd->depth_bias_bo);
      if (result != VK_SUCCESS)
         return result;
   }

   if (cmd_buffer->scissor_array.size > 0) {
      result = pvr_gpu_upload(device,
                              device->heaps.general_heap,
                              util_dynarray_begin(&cmd_buffer->scissor_array),
                              cmd_buffer->scissor_array.size,
                              cache_line_size,
                              &sub_cmd->scissor_bo);
      if (result != VK_SUCCESS)
         goto err_free_depth_bias_bo;
   }

   util_dynarray_clear(&cmd_buffer->depth_bias_array);
   util_dynarray_clear(&cmd_buffer->scissor_array);

   return VK_SUCCESS;

err_free_depth_bias_bo:
   pvr_bo_free(device, sub_cmd->depth_bias_bo);
   sub_cmd->depth_bias_bo = NULL;

   return result;
}

static VkResult
pvr_cmd_buffer_emit_ppp_state(struct pvr_cmd_buffer *cmd_buffer,
                              struct pvr_sub_cmd_gfx *const sub_cmd)
{
   struct pvr_framebuffer *framebuffer =
      cmd_buffer->state.render_pass_info.framebuffer;

   pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE0, state0) {
      state0.addrmsb = framebuffer->ppp_state_bo->vma->dev_addr;
      state0.word_count = framebuffer->ppp_state_size;
   }

   pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE1, state1) {
      state1.addrlsb = framebuffer->ppp_state_bo->vma->dev_addr;
   }

   return VK_SUCCESS;
}
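/* Uploads a block of data to the device's general heap and tracks the
 * resulting BO on the command buffer's bo_list so that it is freed together
 * with the command buffer.
 */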
static VkResult
pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
                              const void *const data,
                              const size_t size,
                              struct pvr_bo **const pvr_bo_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct pvr_bo *pvr_bo;
   VkResult result;

   result = pvr_gpu_upload(device,
                           device->heaps.general_heap,
                           data,
                           size,
                           cache_line_size,
                           &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pvr_bo->link, &cmd_buffer->bo_list);

   *pvr_bo_out = pvr_bo;

   return VK_SUCCESS;
}

static VkResult
pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
                          const void *const code,
                          const size_t code_size,
                          uint64_t code_alignment,
                          struct pvr_bo **const pvr_bo_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct pvr_bo *pvr_bo;
   VkResult result;

   code_alignment = MAX2(code_alignment, cache_line_size);

   result =
      pvr_gpu_upload_usc(device, code, code_size, code_alignment, &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pvr_bo->link, &cmd_buffer->bo_list);

   *pvr_bo_out = pvr_bo;

   return VK_SUCCESS;
}

static VkResult
pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
                          const uint32_t *data,
                          uint32_t data_size_dwords,
                          uint32_t data_alignment,
                          const uint32_t *code,
                          uint32_t code_size_dwords,
                          uint32_t code_alignment,
                          uint64_t min_alignment,
                          struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   VkResult result;

   result = pvr_gpu_upload_pds(device,
                               data,
                               data_size_dwords,
                               data_alignment,
                               code,
                               code_size_dwords,
                               code_alignment,
                               min_alignment,
                               pds_upload_out);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);

   return VK_SUCCESS;
}

static inline VkResult
pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
                               const uint32_t *data,
                               uint32_t data_size_dwords,
                               uint32_t data_alignment,
                               struct pvr_pds_upload *const pds_upload_out)
{
   return pvr_cmd_buffer_upload_pds(cmd_buffer,
                                    data,
                                    data_size_dwords,
                                    data_alignment,
                                    NULL,
                                    0,
                                    0,
                                    data_alignment,
                                    pds_upload_out);
}

static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
   struct pvr_cmd_buffer *const cmd_buffer,
   const uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_event_program pixel_event_program = {
      /* No data to DMA, just a DOUTU needed. */
      .num_emit_word_pairs = 0,
   };
   const uint32_t staging_buffer_size =
      cmd_buffer->device->pixel_event_data_size_in_dwords * sizeof(uint32_t);
   const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
   struct pvr_device *const device = cmd_buffer->device;
   /* FIXME: This should come from the compiler for the USC pixel program. */
   const uint32_t usc_temp_count = 0;
   struct pvr_bo *usc_eot_program;
   uint8_t *usc_eot_program_ptr;
   uint32_t *staging_buffer;
   VkResult result;

   result = pvr_cmd_buffer_upload_usc(cmd_buffer,
                                      pvr_end_of_tile_program,
                                      sizeof(pvr_end_of_tile_program),
                                      4,
                                      &usc_eot_program);
   if (result != VK_SUCCESS)
      return result;

   assert((pbe_cs_words[1] & 0x3F) == 0x20);

   /* FIXME: Stop patching the framebuffer address (this will require the
    * end-of-tile program to be generated at run-time).
    */
   pvr_bo_cpu_map(device, usc_eot_program);
   usc_eot_program_ptr = usc_eot_program->bo->map;
   usc_eot_program_ptr[6] = (pbe_cs_words[0] >> 0) & 0xFF;
   usc_eot_program_ptr[7] = (pbe_cs_words[0] >> 8) & 0xFF;
   usc_eot_program_ptr[8] = (pbe_cs_words[0] >> 16) & 0xFF;
   usc_eot_program_ptr[9] = (pbe_cs_words[0] >> 24) & 0xFF;
   pvr_bo_cpu_unmap(device, usc_eot_program);

   pvr_pds_setup_doutu(&pixel_event_program.task_control,
                       usc_eot_program->vma->dev_addr.addr,
                       usc_temp_count,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   /* TODO: We could skip allocating this and generate directly into the device
    * buffer thus removing one allocation and memcpy() per job. Would this
    * speed up things in a noticeable way?
    */
   staging_buffer = vk_alloc(allocator,
                             staging_buffer_size,
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_usc_pixel_program;
   }

   /* Generate the data segment. The code segment was uploaded earlier when
    * setting up the PDS static heap data.
    */
   pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
                                             staging_buffer,
                                             &device->pdevice->dev_info);

   result = pvr_cmd_buffer_upload_pds_data(
      cmd_buffer,
      staging_buffer,
      cmd_buffer->device->pixel_event_data_size_in_dwords,
      4,
      pds_upload_out);
   if (result != VK_SUCCESS)
      goto err_free_pixel_event_staging_buffer;

   vk_free(allocator, staging_buffer);

   return VK_SUCCESS;

err_free_pixel_event_staging_buffer:
   vk_free(allocator, staging_buffer);

err_free_usc_pixel_program:
   list_del(&usc_eot_program->link);
   pvr_bo_free(device, usc_eot_program);

   return result;
}

static uint32_t pvr_get_hw_clear_color(VkFormat vk_format,
                                       const VkClearValue *clear_value)
{
   union util_color uc = { .ui = 0 };

   switch (vk_format) {
   case VK_FORMAT_B8G8R8A8_UNORM:
      util_pack_color(clear_value->color.float32,
                      PIPE_FORMAT_R8G8B8A8_UNORM,
                      &uc);
      break;

   default:
      assert(!"Unsupported format");
      uc.ui[0] = 0;
      break;
   }

   return uc.ui[0];
}

static VkResult
pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                        uint32_t idx,
                                        pvr_dev_addr_t *const addr_out)
{
   const struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_render_pass *pass = render_pass_info->pass;
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   ASSERTED const struct pvr_load_op *load_op = hw_render->client_data;
   const struct pvr_renderpass_colorinit *color_init =
      &hw_render->color_init[0];
   const struct pvr_render_pass_attachment *attachment =
      &pass->attachments[color_init->driver_id];
   const VkClearValue *clear_value =
      &render_pass_info->clear_values[color_init->driver_id];
   uint32_t hw_clear_value;
   struct pvr_bo *clear_bo;
   VkResult result;

   pvr_finishme("Add missing load op data support");

   assert(load_op->is_hw_object);
   assert(hw_render->color_init_count == 1);

   /* FIXME: add support for RENDERPASS_SURFACE_INITOP_LOAD. */
   assert(color_init->op == RENDERPASS_SURFACE_INITOP_CLEAR);

   /* FIXME: do this at the point we store the clear values? */
   hw_clear_value = pvr_get_hw_clear_color(attachment->vk_format, clear_value);

   result = pvr_cmd_buffer_upload_general(cmd_buffer,
                                          &hw_clear_value,
                                          sizeof(hw_clear_value),
                                          &clear_bo);
   if (result != VK_SUCCESS)
      return result;

   *addr_out = clear_bo->vma->dev_addr;

   return VK_SUCCESS;
}

static VkResult pvr_load_op_pds_data_create_and_upload(
   struct pvr_cmd_buffer *cmd_buffer,
   uint32_t idx,
   pvr_dev_addr_t constants_addr,
   struct pvr_pds_upload *const pds_upload_out)
{
   const struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_load_op *load_op =
      render_pass_info->pass->hw_setup->renders[idx].client_data;
   struct pvr_device *device = cmd_buffer->device;
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_pixel_shader_sa_program program = { 0 };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   program.num_texture_dma_kicks = 1;

   pvr_csb_pack (&program.texture_dma_address[0],
                 PDSINST_DOUT_FIELDS_DOUTD_SRC0,
                 value) {
      value.sbase = constants_addr;
   }

   pvr_csb_pack (&program.texture_dma_control[0],
                 PDSINST_DOUT_FIELDS_DOUTD_SRC1,
                 value) {
      value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
      value.a0 = load_op->shareds_dest_offset;
      value.bsize = load_op->shareds_count;
   }

   pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);

   staging_buffer_size = program.data_size * sizeof(*staging_buffer);

   staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
                             staging_buffer_size,
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
                                                       staging_buffer,
                                                       dev_info);

   result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
                                           staging_buffer,
                                           program.data_size,
                                           1,
                                           pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
      return result;
   }

   vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);

   return VK_SUCCESS;
}

/* FIXME: Should this function be specific to the HW background object, in
 * which case its name should be changed, or should it have the load op
 * structure passed in?
 */
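/* Two-step upload for the load op: the clear constants are uploaded to the
 * general heap first, then a PDS data segment is generated whose DMA kick
 * copies those constants into the USC common store for the load-op program.
 */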
static VkResult
pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                   uint32_t idx,
                                   struct pvr_pds_upload *const pds_upload_out)
{
   pvr_dev_addr_t constants_addr;
   VkResult result;

   result =
      pvr_load_op_constants_create_and_upload(cmd_buffer, idx, &constants_addr);
   if (result != VK_SUCCESS)
      return result;

   return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
                                                 idx,
                                                 constants_addr,
                                                 pds_upload_out);
}

static void pvr_pds_bgnd_pack_state(
   const struct pvr_load_op *load_op,
   const struct pvr_pds_upload *load_op_program,
   uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
{
   pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
      value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
      value.texunicode_addr =
         PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
   }

   pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
      value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
   }

   pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
      value.usc_sharedsize =
         DIV_ROUND_UP(load_op->const_shareds_count,
                      PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
      value.pds_texturestatesize = DIV_ROUND_UP(
         load_op_program->data_size,
         PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
      value.pds_tempsize =
         DIV_ROUND_UP(load_op->temps_count,
                      PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
   }
}

/**
 * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
 * format.
 *
 * \param[in] pitch     Width pitch in bytes.
 * \param[in] vk_format Vulkan image format.
 * \return Stride in pixels.
 */
static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
{
   const unsigned int cpp = vk_format_get_blocksize(vk_format);

   assert(pitch % cpp == 0);

   return pitch / cpp;
}

static void pvr_setup_pbe_state(
   const struct pvr_device_info *dev_info,
   struct pvr_framebuffer *framebuffer,
   uint32_t mrt_index,
   const struct usc_mrt_resource *mrt_resource,
   const struct pvr_image_view *const iview,
   const VkRect2D *render_area,
   const bool down_scale,
   const uint32_t samples,
   uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
{
   const struct pvr_image *image = iview->image;
   uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;

   struct pvr_pbe_surf_params surface_params;
   struct pvr_pbe_render_params render_params;
   bool with_packed_usc_channel;
   const uint8_t *swizzle;
   uint32_t position;

   /* down_scale should be true when performing a resolve, in which case there
    * should be more than one sample.
    */
   assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));

   /* Setup surface parameters. */

   if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
      switch (iview->vk.format) {
      case VK_FORMAT_B8G8R8A8_UNORM:
         with_packed_usc_channel = true;
         break;
      case VK_FORMAT_D32_SFLOAT:
         with_packed_usc_channel = false;
         break;
      default:
         unreachable("Unsupported Vulkan image format");
      }
   } else {
      with_packed_usc_channel = false;
   }

   swizzle = pvr_get_format_swizzle(iview->vk.format);
   memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));

   pvr_pbe_get_src_format_and_gamma(iview->vk.format,
                                    PVR_PBE_GAMMA_NONE,
                                    with_packed_usc_channel,
                                    &surface_params.source_format,
                                    &surface_params.gamma);

   surface_params.is_normalized = vk_format_is_normalized(iview->vk.format);
   surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
   surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);

   /* FIXME: Should we have an inline function to return the address of a mip
    * level?
    */
   surface_params.addr =
      PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
                          image->mip_levels[iview->vk.base_mip_level].offset);

   surface_params.mem_layout = image->memlayout;
   surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
   surface_params.depth = iview->vk.extent.depth;
   surface_params.width = iview->vk.extent.width;
   surface_params.height = iview->vk.extent.height;
   surface_params.z_only_render = false;
   surface_params.down_scale = down_scale;
   surface_params.msaa_mode = samples;

   /* Setup render parameters. */

   if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
      position = mrt_resource->u.mem.offset_in_dwords;
   } else {
      assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REGISTER);
      assert(mrt_resource->u.reg.offset == 0);

      position = mrt_resource->u.reg.out_reg;
   }

   assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));

   switch (position) {
   case 0:
   case 4:
      render_params.source_start = PVR_PBE_STARTPOS_BIT0;
      break;
   case 1:
   case 5:
      render_params.source_start = PVR_PBE_STARTPOS_BIT32;
      break;
   case 2:
   case 6:
      render_params.source_start = PVR_PBE_STARTPOS_BIT64;
      break;
   case 3:
   case 7:
      render_params.source_start = PVR_PBE_STARTPOS_BIT96;
      break;
   default:
      assert(!"Invalid output register");
      break;
   }

   render_params.min_x_clip = MAX2(0, render_area->offset.x);
   render_params.min_y_clip = MAX2(0, render_area->offset.y);
   render_params.max_x_clip =
      MIN2(framebuffer->width,
           render_area->offset.x + render_area->extent.width) -
      1;
   render_params.max_y_clip =
      MIN2(framebuffer->height,
           render_area->offset.y + render_area->extent.height) -
      1;

   render_params.slice = 0;
   render_params.mrt_index = mrt_index;

   pvr_pbe_pack_state(dev_info,
                      &surface_params,
                      &render_params,
                      pbe_cs_words,
                      pbe_reg_words);
}

static struct pvr_render_target *
pvr_get_render_target(const struct pvr_render_pass *pass,
                      const struct pvr_framebuffer *framebuffer,
                      uint32_t idx)
{
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   uint32_t rt_idx = 0;

   switch (hw_render->sample_count) {
   case 1:
   case 2:
   case 4:
   case 8:
      rt_idx = util_logbase2(hw_render->sample_count);
      break;

   default:
      unreachable("Unsupported sample count");
      break;
   }

   return &framebuffer->render_targets[rt_idx];
}
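/* The pixel output width used for the render job is the render's output
 * register count raised to at least the device's minimum
 * (usc_min_output_registers_per_pix) and then rounded up to a power of two.
 * For example (illustrative numbers only): output_regs_count = 3 with a
 * minimum of 2 gives MAX2(3, 2) = 3, which util_next_power_of_two() rounds
 * up to 4.
 */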
static uint32_t
pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
                                uint32_t idx,
                                const struct pvr_device_info *dev_info)
{
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   /* Default value based on the maximum value found in all existing cores. The
    * maximum is used as this is being treated as a lower bound, making it a
    * "safer" choice than the minimum value found in all existing cores.
    */
   const uint32_t min_output_regs =
      PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
   const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);

   return util_next_power_of_two(width);
}

static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
                                         struct pvr_cmd_buffer *cmd_buffer,
                                         struct pvr_sub_cmd_gfx *sub_cmd)
{
   struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
   struct pvr_render_job *job = &sub_cmd->job;
   struct pvr_pds_upload pds_pixel_event_program;

   uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
                        [ROGUE_NUM_PBESTATE_STATE_WORDS];
   struct pvr_render_target *render_target;
   VkResult result;

   assert(hw_render->eot_surface_count < ARRAY_SIZE(pbe_cs_words));

   for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
      const struct pvr_renderpass_hwsetup_eot_surface *surface =
         &hw_render->eot_surfaces[i];
      const struct pvr_image_view *iview =
         render_pass_info->attachments[surface->attachment_index];
      const struct usc_mrt_resource *mrt_resource =
         &hw_render->eot_setup.mrt_resources[surface->mrt_index];
      uint32_t samples = 1;

      if (surface->need_resolve)
         pvr_finishme("Set up job resolve information.");

      pvr_setup_pbe_state(dev_info,
                          render_pass_info->framebuffer,
                          surface->mrt_index,
                          mrt_resource,
                          iview,
                          &render_pass_info->render_area,
                          surface->need_resolve,
                          samples,
                          pbe_cs_words[i],
                          job->pbe_reg_words[i]);
   }

   /* FIXME: The fragment program only supports a single surface at present. */
   assert(hw_render->eot_surface_count == 1);
   result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
      cmd_buffer,
      pbe_cs_words[0],
      &pds_pixel_event_program);
   if (result != VK_SUCCESS)
      return result;

   job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;

   /* FIXME: Don't do this if there is a barrier load. */
   if (render_pass_info->enable_bg_tag) {
      const struct pvr_load_op *load_op = hw_render->client_data;
      struct pvr_pds_upload load_op_program;

      /* FIXME: Should we free the PDS pixel event data or let it be freed
       * when the pool gets emptied?
       */
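      /* The load op acts as the render's background object: upload its PDS
       * texture-state data for this sub-command and pack the PDS_BGRND
       * register values that get programmed for the fragment phase.
       */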
      result = pvr_load_op_data_create_and_upload(cmd_buffer,
                                                  sub_cmd->hw_render_idx,
                                                  &load_op_program);
      if (result != VK_SUCCESS)
         return result;

      pvr_pds_bgnd_pack_state(load_op,
                              &load_op_program,
                              job->pds_bgnd_reg_values);
   }

   job->enable_bg_tag = render_pass_info->enable_bg_tag;
   job->process_empty_tiles = render_pass_info->process_empty_tiles;

   render_target = pvr_get_render_target(render_pass_info->pass,
                                         render_pass_info->framebuffer,
                                         sub_cmd->hw_render_idx);
   job->rt_dataset = render_target->rt_dataset;

   job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);

   /* FIXME: Need to set up the border color table at device creation
    * time. Set to invalid for the time being.
    */
   job->border_colour_table_addr = PVR_DEV_ADDR_INVALID;

   if (sub_cmd->depth_bias_bo)
      job->depth_bias_table_addr = sub_cmd->depth_bias_bo->vma->dev_addr;
   else
      job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;

   if (sub_cmd->scissor_bo)
      job->scissor_table_addr = sub_cmd->scissor_bo->vma->dev_addr;
   else
      job->scissor_table_addr = PVR_DEV_ADDR_INVALID;

   job->pixel_output_width =
      pvr_pass_get_pixel_output_width(render_pass_info->pass,
                                      sub_cmd->hw_render_idx,
                                      dev_info);

   if (hw_render->ds_surface_id != -1) {
      struct pvr_image_view *iview =
         render_pass_info->attachments[hw_render->ds_surface_id];
      const struct pvr_image *image = iview->image;

      if (vk_format_has_depth(image->vk.format)) {
         uint32_t level_pitch =
            image->mip_levels[iview->vk.base_mip_level].pitch;

         /* FIXME: Is this sufficient for depth buffers? */
         job->depth_addr = image->dev_addr;

         job->depth_stride =
            pvr_stride_from_pitch(level_pitch, iview->vk.format);
         job->depth_height = iview->vk.extent.height;
         job->depth_physical_width =
            u_minify(image->physical_extent.width, iview->vk.base_mip_level);
         job->depth_physical_height =
            u_minify(image->physical_extent.height, iview->vk.base_mip_level);
         job->depth_layer_size = image->layer_size;

         if (hw_render->ds_surface_id < render_pass_info->clear_value_count) {
            VkClearValue *clear_values =
               &render_pass_info->clear_values[hw_render->ds_surface_id];

            job->depth_clear_value = clear_values->depthStencil.depth;
         } else {
            job->depth_clear_value = 1.0f;
         }

         job->depth_vk_format = iview->vk.format;

         job->depth_memlayout = image->memlayout;
      } else {
         job->depth_addr = PVR_DEV_ADDR_INVALID;
         job->depth_stride = 0;
         job->depth_height = 0;
         job->depth_physical_width = 0;
         job->depth_physical_height = 0;
         job->depth_layer_size = 0;
         job->depth_clear_value = 1.0f;
         job->depth_vk_format = VK_FORMAT_UNDEFINED;
         job->depth_memlayout = PVR_MEMLAYOUT_LINEAR;
      }

      if (vk_format_has_stencil(image->vk.format)) {
         /* FIXME: Is this sufficient for stencil buffers? */
         job->stencil_addr = image->dev_addr;
      } else {
         job->stencil_addr = PVR_DEV_ADDR_INVALID;
      }

      job->samples = image->vk.samples;
   } else {
      pvr_finishme("Set up correct number of samples for render job");

      job->depth_addr = PVR_DEV_ADDR_INVALID;
      job->depth_stride = 0;
      job->depth_height = 0;
      job->depth_physical_width = 0;
      job->depth_physical_height = 0;
      job->depth_layer_size = 0;
      job->depth_clear_value = 1.0f;
      job->depth_vk_format = VK_FORMAT_UNDEFINED;
      job->depth_memlayout = PVR_MEMLAYOUT_LINEAR;

      job->stencil_addr = PVR_DEV_ADDR_INVALID;

      job->samples = 1;
   }

   if (sub_cmd->max_tiles_in_flight ==
       PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
      /* Use the default limit based on the partition store. */
      job->max_tiles_in_flight = 0U;
   } else {
      job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
   }

   job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
   job->disable_compute_overlap = false;
   job->max_shared_registers = cmd_buffer->state.max_shared_regs;
   job->run_frag = true;
   job->geometry_terminate = true;

   return VK_SUCCESS;
}

/* Number of shareds used in the Issue Data Fence (IDF)/Wait Data Fence (WDF)
 * kernel.
 */
#define PVR_IDF_WDF_IN_REGISTER_CONST_COUNT 12U

static void
pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
                             struct pvr_cmd_buffer *cmd_buffer,
                             struct pvr_sub_cmd_compute *sub_cmd)
{
   const struct pvr_device_runtime_info *dev_runtime_info =
      &pdevice->dev_runtime_info;
   const struct pvr_device_info *dev_info = &pdevice->dev_info;

   if (sub_cmd->uses_barrier)
      sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_PREVENT_ALL_OVERLAP;

   pvr_csb_pack (&sub_cmd->submit_info.regs.cdm_ctrl_stream_base,
                 CR_CDM_CTRL_STREAM_BASE,
                 value) {
      value.addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
   }

   /* FIXME: Need to set up the border color table at device creation
    * time. Set to invalid for the time being.
    */
   pvr_csb_pack (&sub_cmd->submit_info.regs.tpu_border_colour_table,
                 CR_TPU_BORDER_COLOUR_TABLE_CDM,
                 value) {
      value.border_colour_table_address = PVR_DEV_ADDR_INVALID;
   }

   sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
                                   cmd_buffer->state.max_shared_regs);

   cmd_buffer->state.max_shared_regs = 0U;

   if (PVR_HAS_FEATURE(dev_info, compute_morton_capable))
      sub_cmd->submit_info.regs.cdm_item = 0;

   pvr_csb_pack (&sub_cmd->submit_info.regs.tpu, CR_TPU, value) {
      value.tag_cem_4k_face_packing = true;
   }

   if (PVR_HAS_FEATURE(dev_info, cluster_grouping) &&
       PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
       dev_runtime_info->num_phantoms > 1 && sub_cmd->uses_atomic_ops) {
      /* Each phantom has its own MCU, so atomicity can only be guaranteed
       * when all work items are processed on the same phantom. This means we
       * need to disable all USCs other than those of the first phantom, which
       * has 4 clusters.
       */
      pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster,
                    CR_COMPUTE_CLUSTER,
                    value) {
         value.mask = 0xFU;
      }
   } else {
      pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster,
                    CR_COMPUTE_CLUSTER,
                    value) {
         value.mask = 0U;
      }
   }

   if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support) &&
       sub_cmd->uses_atomic_ops) {
      sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_SINGLE_CORE;
   }
}

#define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
   (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))

static uint32_t
pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
                           uint32_t coeff_regs_count,
                           bool use_barrier,
                           uint32_t total_workitems)
{
   const struct pvr_device_runtime_info *dev_runtime_info =
      &pdevice->dev_runtime_info;
   const struct pvr_device_info *dev_info = &pdevice->dev_info;
   uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
   uint32_t max_avail_coeff_regs =
      dev_runtime_info->cdm_max_local_mem_size_regs;
   uint32_t localstore_chunks_count =
      DIV_ROUND_UP(coeff_regs_count << 2,
                   PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));

   /* Ensure that we cannot have more workgroups in a slot than the available
    * number of coefficients allow us to have.
    */
   if (coeff_regs_count > 0U) {
      /* If TA or 3D can overlap with CDM, or if the TA is running a geometry
       * shader then we need to consider this in calculating max allowed
       * work-groups.
       */
      if (PVR_HAS_QUIRK(dev_info, 52354) &&
          (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
           PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
         /* Solve for n (number of work-groups per task). All values are in
          * size of common store alloc blocks:
          *
          * n + (2n + 7) * (local_memory_size_max - 1) =
          *     (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          * ==>
          * n + 2n * (local_memory_size_max - 1) =
          *     (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          *     - (7 * (local_memory_size_max - 1))
          * ==>
          * n * (1 + 2 * (local_memory_size_max - 1)) =
          *     (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          *     - (7 * (local_memory_size_max - 1))
          * ==>
          * n = ((coefficient_memory_pool_size) -
          *      (7 * pixel_allocation_size_max) -
          *      (7 * (local_memory_size_max - 1))) /
          *     (1 + 2 * (local_memory_size_max - 1))
          */
         uint32_t max_common_store_blocks =
            DIV_ROUND_UP(max_avail_coeff_regs * 4U,
                         PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));

         /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          */
         max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
                                    PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;

         /* - (7 * (local_memory_size_max - 1)) */
         max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
                                     (localstore_chunks_count - 1U));

         /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
         max_workgroups_per_task = max_common_store_blocks /
                                   (1U + 2U * (localstore_chunks_count - 1U));

         max_workgroups_per_task =
            MIN2(max_workgroups_per_task,
                 ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);

      } else {
         max_workgroups_per_task =
            MIN2((max_avail_coeff_regs / coeff_regs_count),
                 max_workgroups_per_task);
      }
   }
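   /* Illustrative example (numbers are made up, not taken from any specific
    * core): if, after subtracting the pixel task terms, max_common_store_blocks
    * is 70 and localstore_chunks_count is 4, then n = 70 / (1 + 2 * 3) = 10
    * work-groups per task, which is then clamped to
    * ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK.
    */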
   /* max_workgroups_per_task should at least be one. */
   assert(max_workgroups_per_task >= 1U);

   if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
      /* In this case, the work group size will have been padded up to the
       * next ROGUE_MAX_INSTANCES_PER_TASK so we just set max instances to be
       * ROGUE_MAX_INSTANCES_PER_TASK.
       */
      return ROGUE_MAX_INSTANCES_PER_TASK;
   }

   /* In this case, the number of instances in the slot must be clamped to
    * accommodate whole work-groups only.
    */
   if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
      max_workgroups_per_task =
         MIN2(max_workgroups_per_task,
              ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
      return total_workitems * max_workgroups_per_task;
   }

   return MIN2(total_workitems * max_workgroups_per_task,
               ROGUE_MAX_INSTANCES_PER_TASK);
}

static void
pvr_compute_generate_control_stream(struct pvr_csb *csb,
                                    struct pvr_sub_cmd_compute *sub_cmd,
                                    const struct pvr_compute_kernel_info *info)
{
   /* Compute kernel 0. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
      kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
      kernel0.global_offsets_present = info->global_offsets_present;
      kernel0.usc_common_size = info->usc_common_size;
      kernel0.usc_unified_size = info->usc_unified_size;
      kernel0.pds_temp_size = info->pds_temp_size;
      kernel0.pds_data_size = info->pds_data_size;
      kernel0.usc_target = info->usc_target;
      kernel0.fence = info->is_fence;
   }

   /* Compute kernel 1. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
      kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
      kernel1.sd_type = info->sd_type;
      kernel1.usc_common_shared = info->usc_common_shared;
   }

   /* Compute kernel 2. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
      kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
   }

   if (info->indirect_buffer_addr.addr) {
      /* Compute kernel 6. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
         kernel6.indirect_addrmsb = info->indirect_buffer_addr;
      }

      /* Compute kernel 7. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
         kernel7.indirect_addrlsb = info->indirect_buffer_addr;
      }
   } else {
      /* Compute kernel 3. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
         assert(info->global_size[0U] > 0U);
         kernel3.workgroup_x = info->global_size[0U] - 1U;
      }

      /* Compute kernel 4. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
         assert(info->global_size[1U] > 0U);
         kernel4.workgroup_y = info->global_size[1U] - 1U;
      }

      /* Compute kernel 5. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
         assert(info->global_size[2U] > 0U);
         kernel5.workgroup_z = info->global_size[2U] - 1U;
      }
   }

   /* Compute kernel 8. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
      if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
         kernel8.max_instances = 0U;
      else
         kernel8.max_instances = info->max_instances;

      assert(info->local_size[0U] > 0U);
      kernel8.workgroup_size_x = info->local_size[0U] - 1U;
      assert(info->local_size[1U] > 0U);
      kernel8.workgroup_size_y = info->local_size[1U] - 1U;
      assert(info->local_size[2U] > 0U);
      kernel8.workgroup_size_z = info->local_size[2U] - 1U;
   }

   /* Track the highest amount of shared registers usage in this dispatch.
    * This is used by the FW for context switching, so must be large enough
    * to contain all the shared registers that might be in use for this compute
    * job. Coefficients don't need to be included as the context switch will not
    * happen within the execution of a single workgroup, thus nothing needs to
    * be preserved.
    */
   if (info->usc_common_shared) {
      sub_cmd->num_shared_regs =
         MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
   }
}

/* TODO: This can be pre-packed and uploaded directly. Would that provide any
 * speed up?
 */
static void
pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
                            struct pvr_sub_cmd_compute *const sub_cmd)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   bool *const is_sw_barier_required =
      &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
   const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
   struct pvr_csb *csb = &sub_cmd->control_stream;
   const struct pvr_pds_upload *program;

   if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
       *is_sw_barier_required) {
      *is_sw_barier_required = false;
      program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
   } else {
      program = &cmd_buffer->device->idfwdf_state.pds;
   }

   struct pvr_compute_kernel_info info = {
      .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
      .global_offsets_present = false,
      .usc_common_size =
         DIV_ROUND_UP(cmd_buffer->device->idfwdf_state.usc_shareds << 2,
                      PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
      .usc_unified_size = 0U,
      .pds_temp_size = 0U,
      .pds_data_size =
         DIV_ROUND_UP(program->data_size << 2,
                      PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
      .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
      .is_fence = false,
      .pds_data_offset = program->data_offset,
      .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
      .usc_common_shared = true,
      .pds_code_offset = program->code_offset,
      .global_size = { 1U, 1U, 1U },
      .local_size = { 1U, 1U, 1U },
   };

   /* We don't need to pad work-group size for this case. */

   info.max_instances =
      pvr_compute_flat_slot_size(pdevice,
                                 cmd_buffer->device->idfwdf_state.usc_shareds,
                                 false,
                                 1U);

   pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}

static void
pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
                           struct pvr_sub_cmd_compute *const sub_cmd,
                           bool deallocate_shareds)
{
   const struct pvr_pds_upload *program =
      &cmd_buffer->device->pds_compute_fence_program;
   const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
   struct pvr_csb *csb = &sub_cmd->control_stream;

   struct pvr_compute_kernel_info info = {
      .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
      .global_offsets_present = false,
      .usc_common_size = 0U,
      .usc_unified_size = 0U,
      .pds_temp_size = 0U,
      .pds_data_size =
         DIV_ROUND_UP(program->data_size << 2,
                      PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
      .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
      .is_fence = true,
      .pds_data_offset = program->data_offset,
      .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
      .usc_common_shared = deallocate_shareds,
      .pds_code_offset = program->code_offset,
      .global_size = { 1U, 1U, 1U },
      .local_size = { 1U, 1U, 1U },
   };

   /* We don't need to pad work-group size for this case. */
   /* Here we calculate the slot size. This can depend on the use of barriers,
    * local memory, BRNs or other factors.
    */
   info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);

   pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}

static VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
   struct pvr_device *device = cmd_buffer->device;
   VkResult result;

   /* FIXME: Is this NULL check required because this function is called from
    * pvr_resolve_unemitted_resolve_attachments()? See comment about this
    * function being called twice in a row in pvr_CmdEndRenderPass().
    */
   if (!sub_cmd)
      return VK_SUCCESS;

   switch (sub_cmd->type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS: {
      struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;

      if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
         result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
         if (result != VK_SUCCESS) {
            state->status = result;
            return result;
         }

         break;
      }

      /* TODO: Check if the sub_cmd can be skipped based on
       * sub_cmd->gfx.empty_cmd flag.
       */
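      /* Finalisation of a primary-level graphics sub-command: upload the
       * depth bias and scissor tables, emit the PPP state words, terminate
       * the control stream, then fill in the render job description used at
       * submit time.
       */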
      result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, gfx_sub_cmd);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
                                        cmd_buffer,
                                        gfx_sub_cmd);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      break;
   }

   case PVR_SUB_CMD_TYPE_COMPUTE: {
      struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;

      pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);

      result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
      if (result != VK_SUCCESS) {
         state->status = result;
         return result;
      }

      pvr_sub_cmd_compute_job_init(device->pdevice,
                                   cmd_buffer,
                                   compute_sub_cmd);
      break;
   }

   case PVR_SUB_CMD_TYPE_TRANSFER:
      break;

   default:
      pvr_finishme("Unsupported sub-command type %d", sub_cmd->type);
      break;
   }

   state->current_sub_cmd = NULL;

   return VK_SUCCESS;
}

static void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer_state *state,
                                           bool start_geom)
{
   if (start_geom) {
      /*
       * Initial geometry phase State.
       * It's the driver's responsibility to ensure that the state of the
       * hardware is correctly initialized at the start of every geometry
       * phase. This is required to prevent stale state from a previous
       * geometry phase erroneously affecting the next geometry phase. The
       * following fields in PPP State Header, and their corresponding state
       * words, must be supplied in the first PPP State Update of a geometry
       * phase that contains any geometry (draw calls). Any field not listed
       * below is safe to ignore.
       *
       *   TA_PRES_STREAM_OUT_SIZE
       *   TA_PRES_PPPCTRL
       *   TA_PRES_VARYING_WORD2
       *   TA_PRES_VARYING_WORD1
       *   TA_PRES_VARYING_WORD0
       *   TA_PRES_OUTSELECTS
       *   TA_PRES_WCLAMP
       *   TA_VIEWPORT_COUNT
       *   TA_PRES_VIEWPORT
       *   TA_PRES_REGION_CLIP
       *   TA_PRES_PDSSTATEPTR0
       *   TA_PRES_ISPCTLFB
       *   TA_PRES_ISPCTLFA
       *   TA_PRES_ISPCTL
       *
       * If a geometry phase does not contain any geometry, this restriction
       * can be ignored. If the first draw call in a geometry phase will only
       * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
       * in the ISP State Control Word, the PDS State Pointers
       * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
       * be supplied, since they will never reach the PDS in the fragment
       * phase.
       */
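      /* Mark every state word listed above for emission so the first PPP
       * state update of the geometry phase supplies them all.
       */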
      state->emit_state_bits = 0;

      state->emit_state.stream_out = true;
      state->emit_state.ppp_control = true;
      state->emit_state.varying_word2 = true;
      state->emit_state.varying_word1 = true;
      state->emit_state.varying_word0 = true;
      state->emit_state.output_selects = true;
      state->emit_state.wclamp = true;
      state->emit_state.viewport = true;
      state->emit_state.region_clip = true;
      state->emit_state.pds_fragment_stateptr0 = true;
      state->emit_state.isp_fb = true;
      state->emit_state.isp = true;
   } else {
      state->emit_state.ppp_control = true;
      state->emit_state.varying_word1 = true;
      state->emit_state.varying_word0 = true;
      state->emit_state.output_selects = true;
      state->emit_state.viewport = true;
      state->emit_state.region_clip = true;
      state->emit_state.pds_fragment_stateptr0 = true;
      state->emit_state.isp_fb = true;
      state->emit_state.isp = true;
   }

   memset(&state->ppp_state, 0U, sizeof(state->ppp_state));

   state->dirty.vertex_bindings = true;
   state->dirty.gfx_pipeline_binding = true;
   state->dirty.viewport = true;
}

static VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
                                             enum pvr_sub_cmd_type type)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_device *device = cmd_buffer->device;
   struct pvr_sub_cmd *sub_cmd;
   VkResult result;

   /* Check the current status of the buffer. */
   if (state->status != VK_SUCCESS)
      return state->status;

   pvr_cmd_buffer_update_barriers(cmd_buffer, type);

   if (state->current_sub_cmd) {
      if (state->current_sub_cmd->type == type) {
         /* Continue adding to the current sub command. */
         return VK_SUCCESS;
      }

      /* End the current sub command. */
      result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
                       sizeof(*sub_cmd),
                       8,
                       VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!sub_cmd) {
      state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
      return state->status;
   }

   sub_cmd->type = type;

   switch (type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS:

      sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
      sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
      sub_cmd->gfx.modifies_depth = false;
      sub_cmd->gfx.modifies_stencil = false;
      sub_cmd->gfx.max_tiles_in_flight =
         PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
                               isp_max_tiles_in_flight,
                               1);
      sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
      sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
      sub_cmd->gfx.empty_cmd = true;

      pvr_reset_graphics_dirty_state(state, true);
      pvr_csb_init(device,
                   PVR_CMD_STREAM_TYPE_GRAPHICS,
                   &sub_cmd->gfx.control_stream);
      break;

   case PVR_SUB_CMD_TYPE_COMPUTE:
      pvr_csb_init(device,
                   PVR_CMD_STREAM_TYPE_COMPUTE,
                   &sub_cmd->compute.control_stream);
      break;

   case PVR_SUB_CMD_TYPE_TRANSFER:
      list_inithead(&sub_cmd->transfer.transfer_cmds);
      break;

   default:
      pvr_finishme("Unsupported sub-command type %d", type);
      break;
   }

   list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
   state->current_sub_cmd = sub_cmd;

   return VK_SUCCESS;
}

VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
                                  struct pvr_winsys_heap *heap,
                                  uint64_t size,
                                  uint32_t flags,
                                  struct pvr_bo **const pvr_bo_out)
{
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
   struct pvr_bo *pvr_bo;
   VkResult result;

   result = pvr_bo_alloc(cmd_buffer->device,
                         heap,
                         size,
                         cache_line_size,
                         flags,
                         &pvr_bo);
   if (result != VK_SUCCESS) {
      cmd_buffer->state.status = result;
      return result;
   }

   list_add(&pvr_bo->link, &cmd_buffer->bo_list);

   *pvr_bo_out = pvr_bo;

   return VK_SUCCESS;
}

VkResult pvr_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                                VkCommandBufferResetFlags flags)
{
   assert(!"Unimplemented");
   return VK_SUCCESS;
}

static void pvr_cmd_bind_compute_pipeline(
   const struct pvr_compute_pipeline *const compute_pipeline,
   struct pvr_cmd_buffer *const cmd_buffer)
{
   cmd_buffer->state.compute_pipeline = compute_pipeline;
   cmd_buffer->state.dirty.compute_pipeline_binding = true;
}

static void pvr_cmd_bind_graphics_pipeline(
   const struct pvr_graphics_pipeline *const gfx_pipeline,
   struct pvr_cmd_buffer *const cmd_buffer)
{
   struct pvr_dynamic_state *const dest_state =
      &cmd_buffer->state.dynamic.common;
   const struct pvr_dynamic_state *const src_state =
      &gfx_pipeline->dynamic_state;
   struct pvr_cmd_buffer_state *const cmd_buffer_state = &cmd_buffer->state;
   const uint32_t state_mask = src_state->mask;

   cmd_buffer_state->gfx_pipeline = gfx_pipeline;
   cmd_buffer_state->dirty.gfx_pipeline_binding = true;
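   /* For every piece of state the pipeline does not declare as dynamic, copy
    * the value baked into the pipeline over the command buffer's dynamic
    * state and flag it dirty; state that is dynamic keeps whatever was last
    * set via the vkCmdSet*() entry points.
    */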
   /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_VIEWPORT. */
   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_VIEWPORT)) {
      assert(!"Unimplemented");
   }

   /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_SCISSOR. */
   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_SCISSOR)) {
      assert(!"Unimplemented");
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH)) {
      dest_state->line_width = src_state->line_width;

      cmd_buffer_state->dirty.line_width = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) {
      memcpy(&dest_state->depth_bias,
             &src_state->depth_bias,
             sizeof(src_state->depth_bias));

      cmd_buffer_state->dirty.depth_bias = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) {
      STATIC_ASSERT(
         __same_type(dest_state->blend_constants, src_state->blend_constants));

      typed_memcpy(dest_state->blend_constants,
                   src_state->blend_constants,
                   ARRAY_SIZE(dest_state->blend_constants));

      cmd_buffer_state->dirty.blend_constants = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) {
      dest_state->compare_mask.front = src_state->compare_mask.front;
      dest_state->compare_mask.back = src_state->compare_mask.back;

      cmd_buffer_state->dirty.compare_mask = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) {
      dest_state->write_mask.front = src_state->write_mask.front;
      dest_state->write_mask.back = src_state->write_mask.back;

      cmd_buffer_state->dirty.write_mask = true;
   }

   if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) {
      dest_state->reference.front = src_state->reference.front;
      dest_state->reference.back = src_state->reference.back;

      cmd_buffer_state->dirty.reference = true;
   }
}

void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
                         VkPipelineBindPoint pipelineBindPoint,
                         VkPipeline _pipeline)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);

   switch (pipelineBindPoint) {
   case VK_PIPELINE_BIND_POINT_COMPUTE:
      pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
                                    cmd_buffer);
      break;

   case VK_PIPELINE_BIND_POINT_GRAPHICS:
      pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
                                     cmd_buffer);
      break;

   default:
      unreachable("Invalid bind point.");
      break;
   }
}

#if defined(DEBUG)
static void check_viewport_quirk_70165(const struct pvr_device *device,
                                       const VkViewport *pViewport)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
   float min_screen_space_value, max_screen_space_value;
   float sign_to_unsigned_offset, fixed_point_max;
   float guardband_width, guardband_height;

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      /* Max representable value in 13.4 fixed point format.
       * Round-down to avoid precision issues.
       * Calculated as (2 ** 13) - 2*(2 ** -4)
       */
      fixed_point_max = 8192.0f - 2.0f / 16.0f;

      if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
         if (pViewport->width <= 4096 && pViewport->height <= 4096) {
            guardband_width = pViewport->width / 4.0f;
            guardband_height = pViewport->height / 4.0f;

            /* 2k of the range is negative */
            sign_to_unsigned_offset = 2048.0f;
         } else {
            guardband_width = 0.0f;
            guardband_height = 0.0f;

            /* For > 4k renders, the entire range is positive */
            sign_to_unsigned_offset = 0.0f;
         }
      } else {
         guardband_width = pViewport->width / 4.0f;
         guardband_height = pViewport->height / 4.0f;

         /* 2k of the range is negative */
         sign_to_unsigned_offset = 2048.0f;
      }
   } else {
      /* Max representable value in 16.8 fixed point format
       * Calculated as (2 ** 16) - (2 ** -8)
       */
      fixed_point_max = 65535.99609375f;
      guardband_width = pViewport->width / 4.0f;
      guardband_height = pViewport->height / 4.0f;

      /* 4k/20k of the range is negative */
      sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
   }

   min_screen_space_value = -sign_to_unsigned_offset;
   max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;

   min_vertex_x = pViewport->x - guardband_width;
   max_vertex_x = pViewport->x + pViewport->width + guardband_width;
   min_vertex_y = pViewport->y - guardband_height;
   max_vertex_y = pViewport->y + pViewport->height + guardband_height;
   if (min_vertex_x < min_screen_space_value ||
       max_vertex_x > max_screen_space_value ||
       min_vertex_y < min_screen_space_value ||
       max_vertex_y > max_screen_space_value) {
      mesa_logw("Viewport is affected by BRN70165, geometry outside "
                "the viewport could be corrupted");
   }
}
#endif

void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
                        uint32_t firstViewport,
                        uint32_t viewportCount,
                        const VkViewport *pViewports)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   const uint32_t total_count = firstViewport + viewportCount;
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
   assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

#if defined(DEBUG)
   if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
      for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
         check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
      }
   }
#endif

   if (state->dynamic.common.viewport.count < total_count)
      state->dynamic.common.viewport.count = total_count;

   memcpy(&state->dynamic.common.viewport.viewports[firstViewport],
          pViewports,
          viewportCount * sizeof(*pViewports));

   state->dirty.viewport = true;
}

void pvr_CmdSetScissor(VkCommandBuffer commandBuffer,
                       uint32_t firstScissor,
                       uint32_t scissorCount,
                       const VkRect2D *pScissors)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   const uint32_t total_count = firstScissor + scissorCount;
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   assert(firstScissor < PVR_MAX_VIEWPORTS && scissorCount > 0);
   assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   if (state->dynamic.common.scissor.count < total_count)
      state->dynamic.common.scissor.count = total_count;

   memcpy(&state->dynamic.common.scissor.scissors[firstScissor],
          pScissors,
          scissorCount * sizeof(*pScissors));

   state->dirty.scissor = true;
}

void pvr_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   state->dynamic.common.line_width = lineWidth;
   state->dirty.line_width = true;
}

void pvr_CmdSetDepthBias(VkCommandBuffer commandBuffer,
                         float depthBiasConstantFactor,
                         float depthBiasClamp,
                         float depthBiasSlopeFactor)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   state->dynamic.common.depth_bias.constant_factor = depthBiasConstantFactor;
   state->dynamic.common.depth_bias.clamp = depthBiasClamp;
   state->dynamic.common.depth_bias.slope_factor = depthBiasSlopeFactor;
   state->dirty.depth_bias = true;
}

void pvr_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
                              const float blendConstants[4])
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   STATIC_ASSERT(ARRAY_SIZE(state->dynamic.common.blend_constants) == 4);
   memcpy(state->dynamic.common.blend_constants,
          blendConstants,
          sizeof(state->dynamic.common.blend_constants));

   state->dirty.blend_constants = true;
}

void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
                           float minDepthBounds,
                           float maxDepthBounds)
{
   mesa_logd("No support for depth bounds testing.");
}

void pvr_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
                                  VkStencilFaceFlags faceMask,
                                  uint32_t compareMask)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.common.compare_mask.front = compareMask;

   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.common.compare_mask.back = compareMask;

   state->dirty.compare_mask = true;
}

void pvr_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
                                VkStencilFaceFlags faceMask,
                                uint32_t writeMask)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.common.write_mask.front = writeMask;

   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.common.write_mask.back = writeMask;

   state->dirty.write_mask = true;
}

void pvr_CmdSetStencilReference(VkCommandBuffer commandBuffer,
                                VkStencilFaceFlags faceMask,
                                uint32_t reference)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
      state->dynamic.common.reference.front = reference;

   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
      state->dynamic.common.reference.back = reference;

   state->dirty.reference = true;
}

void
pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, 2014 VkPipelineBindPoint pipelineBindPoint, 2015 VkPipelineLayout _layout, 2016 uint32_t firstSet, 2017 uint32_t descriptorSetCount, 2018 const VkDescriptorSet *pDescriptorSets, 2019 uint32_t dynamicOffsetCount, 2020 const uint32_t *pDynamicOffsets) 2021{ 2022 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 2023 struct pvr_descriptor_state *descriptor_state; 2024 2025 assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS); 2026 2027 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 2028 2029 switch (pipelineBindPoint) { 2030 case VK_PIPELINE_BIND_POINT_GRAPHICS: 2031 case VK_PIPELINE_BIND_POINT_COMPUTE: 2032 break; 2033 2034 default: 2035 unreachable("Unsupported bind point."); 2036 break; 2037 } 2038 2039 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { 2040 descriptor_state = &cmd_buffer->state.gfx_desc_state; 2041 cmd_buffer->state.dirty.gfx_desc_dirty = true; 2042 } else { 2043 descriptor_state = &cmd_buffer->state.compute_desc_state; 2044 cmd_buffer->state.dirty.compute_desc_dirty = true; 2045 } 2046 2047 for (uint32_t i = 0; i < descriptorSetCount; i++) { 2048 PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]); 2049 uint32_t index = firstSet + i; 2050 2051 if (descriptor_state->descriptor_sets[index] != set) { 2052 descriptor_state->descriptor_sets[index] = set; 2053 descriptor_state->valid_mask |= (1u << index); 2054 } 2055 } 2056} 2057 2058void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, 2059 uint32_t firstBinding, 2060 uint32_t bindingCount, 2061 const VkBuffer *pBuffers, 2062 const VkDeviceSize *pOffsets) 2063{ 2064 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 2065 struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings; 2066 2067 /* We have to defer setting up vertex buffer since we need the buffer 2068 * stride from the pipeline. 
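    * (As far as this file shows, the bindings recorded below are only read
    * later, by pvr_setup_vertex_buffers(), which resolves the final device
    * addresses while walking the bound pipeline's PDS data section entries.)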
2069 */ 2070 2071 assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS && 2072 bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS); 2073 2074 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 2075 2076 for (uint32_t i = 0; i < bindingCount; i++) { 2077 vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]); 2078 vb[firstBinding + i].offset = pOffsets[i]; 2079 } 2080 2081 cmd_buffer->state.dirty.vertex_bindings = true; 2082} 2083 2084void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, 2085 VkBuffer buffer, 2086 VkDeviceSize offset, 2087 VkIndexType indexType) 2088{ 2089 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 2090 PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer); 2091 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 2092 2093 assert(offset < index_buffer->vk.size); 2094 assert(indexType == VK_INDEX_TYPE_UINT32 || 2095 indexType == VK_INDEX_TYPE_UINT16); 2096 2097 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 2098 2099 state->index_buffer_binding.buffer = index_buffer; 2100 state->index_buffer_binding.offset = offset; 2101 state->index_buffer_binding.type = indexType; 2102 state->dirty.index_buffer_binding = true; 2103} 2104 2105void pvr_CmdPushConstants(VkCommandBuffer commandBuffer, 2106 VkPipelineLayout layout, 2107 VkShaderStageFlags stageFlags, 2108 uint32_t offset, 2109 uint32_t size, 2110 const void *pValues) 2111{ 2112#if defined(DEBUG) 2113 const uint64_t ending = (uint64_t)offset + (uint64_t)size; 2114#endif 2115 2116 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 2117 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 2118 2119 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 2120 2121 pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE); 2122 2123 memcpy(&state->push_constants.data[offset], pValues, size); 2124 2125 state->push_constants.dirty_stages |= stageFlags; 2126} 2127 2128static VkResult 2129pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer, 2130 const struct pvr_render_pass *pass, 2131 const struct pvr_framebuffer *framebuffer) 2132{ 2133 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 2134 struct pvr_render_pass_info *info = &state->render_pass_info; 2135 2136 assert(pass->attachment_count == framebuffer->attachment_count); 2137 2138 /* Free any previously allocated attachments. 
*/ 2139 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments); 2140 2141 if (pass->attachment_count == 0) { 2142 info->attachments = NULL; 2143 return VK_SUCCESS; 2144 } 2145 2146 info->attachments = 2147 vk_zalloc(&cmd_buffer->vk.pool->alloc, 2148 pass->attachment_count * sizeof(*info->attachments), 2149 8, 2150 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 2151 if (!info->attachments) { 2152 /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */ 2153 state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); 2154 return state->status; 2155 } 2156 2157 if (framebuffer) { 2158 for (uint32_t i = 0; i < pass->attachment_count; i++) 2159 info->attachments[i] = framebuffer->attachments[i]; 2160 } 2161 2162 return VK_SUCCESS; 2163} 2164 2165static VkResult pvr_init_render_targets(struct pvr_device *device, 2166 struct pvr_render_pass *pass, 2167 struct pvr_framebuffer *framebuffer) 2168{ 2169 for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) { 2170 struct pvr_render_target *render_target = 2171 pvr_get_render_target(pass, framebuffer, i); 2172 2173 pthread_mutex_lock(&render_target->mutex); 2174 2175 if (!render_target->valid) { 2176 const struct pvr_renderpass_hwsetup_render *hw_render = 2177 &pass->hw_setup->renders[i]; 2178 VkResult result; 2179 2180 result = pvr_render_target_dataset_create(device, 2181 framebuffer->width, 2182 framebuffer->height, 2183 hw_render->sample_count, 2184 framebuffer->layers, 2185 &render_target->rt_dataset); 2186 if (result != VK_SUCCESS) { 2187 pthread_mutex_unlock(&render_target->mutex); 2188 return result; 2189 } 2190 2191 render_target->valid = true; 2192 } 2193 2194 pthread_mutex_unlock(&render_target->mutex); 2195 } 2196 2197 return VK_SUCCESS; 2198} 2199 2200static const struct pvr_renderpass_hwsetup_subpass * 2201pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass) 2202{ 2203 const struct pvr_renderpass_hw_map *map = 2204 &pass->hw_setup->subpass_map[subpass]; 2205 2206 return &pass->hw_setup->renders[map->render].subpasses[map->subpass]; 2207} 2208 2209static void pvr_perform_start_of_render_attachment_clear( 2210 struct pvr_cmd_buffer *cmd_buffer, 2211 const struct pvr_framebuffer *framebuffer, 2212 uint32_t index, 2213 bool is_depth_stencil, 2214 uint32_t *index_list_clear_mask) 2215{ 2216 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info; 2217 const struct pvr_render_pass *pass = info->pass; 2218 const struct pvr_renderpass_hwsetup_render *hw_render; 2219 const struct pvr_renderpass_hwsetup *hw_setup; 2220 struct pvr_image_view *iview; 2221 uint32_t view_idx; 2222 uint32_t height; 2223 uint32_t width; 2224 2225 hw_setup = pass->hw_setup; 2226 hw_render = 2227 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render]; 2228 2229 if (is_depth_stencil) { 2230 bool stencil_clear; 2231 bool depth_clear; 2232 bool is_stencil; 2233 bool is_depth; 2234 2235 assert(hw_render->ds_surface_id != -1); 2236 assert(index == 0); 2237 2238 view_idx = hw_render->ds_surface_id; 2239 2240 is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format); 2241 is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format); 2242 depth_clear = hw_render->depth_init == RENDERPASS_SURFACE_INITOP_CLEAR; 2243 stencil_clear = hw_render->stencil_init == 2244 RENDERPASS_SURFACE_INITOP_CLEAR; 2245 2246 /* Attempt to clear the ds attachment. Do not erroneously discard an 2247 * attachment that has no depth clear but has a stencil attachment. 
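       * For example, a combined depth/stencil attachment with LOAD_OP_LOAD on
       * depth but LOAD_OP_CLEAR on stencil makes (is_depth && depth_clear)
       * false and (is_stencil && stencil_clear) true, so the check below
       * keeps going instead of returning early.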
2248 */ 2249 /* if not (a ∧ c) ∨ (b ∧ d) */ 2250 if (!((is_depth && depth_clear) || (is_stencil && stencil_clear))) 2251 return; 2252 } else if (hw_render->color_init[index].op != 2253 RENDERPASS_SURFACE_INITOP_CLEAR) { 2254 return; 2255 } else { 2256 view_idx = hw_render->color_init[index].driver_id; 2257 } 2258 2259 iview = info->attachments[view_idx]; 2260 width = iview->vk.extent.width; 2261 height = iview->vk.extent.height; 2262 2263 /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init() 2264 * were doing the same check (even if it's just an assert) to determine if a 2265 * clear is needed. 2266 */ 2267 /* If this is single-layer fullscreen, we already do the clears in 2268 * pvr_sub_cmd_gfx_job_init(). 2269 */ 2270 if (info->render_area.offset.x == 0 && info->render_area.offset.y == 0 && 2271 info->render_area.extent.width == width && 2272 info->render_area.extent.height == height && framebuffer->layers == 1) { 2273 return; 2274 } 2275 2276 pvr_finishme("Unimplemented path!"); 2277} 2278 2279static void 2280pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer) 2281{ 2282 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info; 2283 const struct pvr_framebuffer *framebuffer = info->framebuffer; 2284 const struct pvr_render_pass *pass = info->pass; 2285 const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup; 2286 const struct pvr_renderpass_hwsetup_render *hw_render; 2287 2288 /* Mask of attachment clears using index lists instead of background object 2289 * to clear. 2290 */ 2291 uint32_t index_list_clear_mask = 0; 2292 2293 hw_render = 2294 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render]; 2295 if (!hw_render) { 2296 info->process_empty_tiles = false; 2297 info->enable_bg_tag = false; 2298 return; 2299 } 2300 2301 for (uint32_t i = 0; i < hw_render->color_init_count; i++) { 2302 pvr_perform_start_of_render_attachment_clear(cmd_buffer, 2303 framebuffer, 2304 i, 2305 false, 2306 &index_list_clear_mask); 2307 } 2308 2309 info->enable_bg_tag = !!hw_render->color_init_count; 2310 2311 /* If we're not using index list for all clears/loads then we need to run 2312 * the background object on empty tiles. 
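    * e.g. with three colour attachments the fully-covered mask is 0b111; if
    * only attachments 0 and 2 used index list clears the mask is 0b101, the
    * equality test below fails and process_empty_tiles is set so the
    * background object still runs on tiles with no geometry.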
2313 */ 2314 if (hw_render->color_init_count && 2315 index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) { 2316 info->process_empty_tiles = true; 2317 } else { 2318 info->process_empty_tiles = false; 2319 } 2320 2321 if (hw_render->ds_surface_id != -1) { 2322 uint32_t ds_index_list = 0; 2323 2324 pvr_perform_start_of_render_attachment_clear(cmd_buffer, 2325 framebuffer, 2326 0, 2327 true, 2328 &ds_index_list); 2329 } 2330 2331 if (index_list_clear_mask) 2332 pvr_finishme("Add support for generating loadops shaders!"); 2333} 2334 2335static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state, 2336 struct pvr_sub_cmd_gfx *const sub_cmd) 2337{ 2338 const struct pvr_render_pass *pass = state->render_pass_info.pass; 2339 const struct pvr_renderpass_hwsetup_render *hw_render = 2340 &pass->hw_setup->renders[sub_cmd->hw_render_idx]; 2341 2342 if (hw_render->ds_surface_id != -1) { 2343 struct pvr_image_view **iviews = state->render_pass_info.attachments; 2344 2345 state->depth_format = iviews[hw_render->ds_surface_id]->vk.format; 2346 } 2347} 2348 2349static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup) 2350{ 2351 for (uint32_t i = 0; i < hw_setup->render_count; i++) { 2352 struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i]; 2353 uint32_t render_targets_count = 2354 hw_render->init_setup.render_targets_count; 2355 2356 for (uint32_t j = 0; 2357 j < (hw_render->color_init_count * render_targets_count); 2358 j += render_targets_count) { 2359 for (uint32_t k = 0; k < hw_render->init_setup.render_targets_count; 2360 k++) { 2361 if (hw_render->color_init[j + k].op == 2362 RENDERPASS_SURFACE_INITOP_CLEAR) { 2363 return true; 2364 } 2365 } 2366 } 2367 if (hw_render->depth_init == RENDERPASS_SURFACE_INITOP_CLEAR || 2368 hw_render->stencil_init == RENDERPASS_SURFACE_INITOP_CLEAR) { 2369 return true; 2370 } 2371 } 2372 2373 return false; 2374} 2375 2376static VkResult 2377pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer, 2378 const VkRenderPassBeginInfo *pRenderPassBegin) 2379{ 2380 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 2381 2382 /* Free any previously allocated clear values. 
*/ 2383 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values); 2384 2385 if (pRenderPassBegin->clearValueCount) { 2386 const size_t size = pRenderPassBegin->clearValueCount * 2387 sizeof(*state->render_pass_info.clear_values); 2388 2389 state->render_pass_info.clear_values = 2390 vk_zalloc(&cmd_buffer->vk.pool->alloc, 2391 size, 2392 8, 2393 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); 2394 if (!state->render_pass_info.clear_values) { 2395 state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); 2396 return state->status; 2397 } 2398 2399 memcpy(state->render_pass_info.clear_values, 2400 pRenderPassBegin->pClearValues, 2401 size); 2402 } else { 2403 state->render_pass_info.clear_values = NULL; 2404 } 2405 2406 state->render_pass_info.clear_value_count = 2407 pRenderPassBegin->clearValueCount; 2408 2409 return VK_SUCCESS; 2410} 2411 2412void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, 2413 const VkRenderPassBeginInfo *pRenderPassBeginInfo, 2414 const VkSubpassBeginInfo *pSubpassBeginInfo) 2415{ 2416 PVR_FROM_HANDLE(pvr_framebuffer, 2417 framebuffer, 2418 pRenderPassBeginInfo->framebuffer); 2419 PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass); 2420 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 2421 const struct pvr_renderpass_hwsetup_subpass *hw_subpass; 2422 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 2423 VkResult result; 2424 2425 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 2426 2427 assert(!state->render_pass_info.pass); 2428 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); 2429 2430 /* FIXME: Create a separate function for everything using pass->subpasses, 2431 * look at cmd_buffer_begin_subpass() for example. */ 2432 state->render_pass_info.pass = pass; 2433 state->render_pass_info.framebuffer = framebuffer; 2434 state->render_pass_info.subpass_idx = 0; 2435 state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea; 2436 state->render_pass_info.current_hw_subpass = 0; 2437 state->render_pass_info.pipeline_bind_point = 2438 pass->subpasses[0].pipeline_bind_point; 2439 state->render_pass_info.userpass_spawn = pass->subpasses[0].userpass_spawn; 2440 state->dirty.userpass_spawn = true; 2441 2442 result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer); 2443 if (result != VK_SUCCESS) 2444 return; 2445 2446 state->status = 2447 pvr_init_render_targets(cmd_buffer->device, pass, framebuffer); 2448 if (state->status != VK_SUCCESS) 2449 return; 2450 2451 result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo); 2452 if (result != VK_SUCCESS) 2453 return; 2454 2455 assert(pass->subpasses[0].pipeline_bind_point == 2456 VK_PIPELINE_BIND_POINT_GRAPHICS); 2457 2458 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); 2459 if (result != VK_SUCCESS) 2460 return; 2461 2462 /* Run subpass 0 "soft" background object after the actual background 2463 * object. 
2464 */ 2465 hw_subpass = pvr_get_hw_subpass(pass, 0); 2466 if (hw_subpass->client_data) 2467 pvr_finishme("Unimplemented path!"); 2468 2469 pvr_perform_start_of_render_clears(cmd_buffer); 2470 pvr_stash_depth_format(&cmd_buffer->state, 2471 &cmd_buffer->state.current_sub_cmd->gfx); 2472 2473 if (!pvr_loadops_contain_clear(pass->hw_setup)) { 2474 state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_CHECK_FOR_CLEAR; 2475 state->dynamic.scissor_accum_bounds.offset.x = 0; 2476 state->dynamic.scissor_accum_bounds.offset.y = 0; 2477 state->dynamic.scissor_accum_bounds.extent.width = 0; 2478 state->dynamic.scissor_accum_bounds.extent.height = 0; 2479 } else { 2480 state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_DISABLED; 2481 } 2482} 2483 2484static void pvr_cmd_buffer_reset(struct pvr_cmd_buffer *cmd_buffer) 2485{ 2486 if (cmd_buffer->status != PVR_CMD_BUFFER_STATUS_INITIAL) { 2487 /* FIXME: For now we always free all resources as if 2488 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set. 2489 */ 2490 pvr_cmd_buffer_free_sub_cmds(cmd_buffer); 2491 2492 list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) { 2493 list_del(&bo->link); 2494 pvr_bo_free(cmd_buffer->device, bo); 2495 } 2496 2497 util_dynarray_clear(&cmd_buffer->scissor_array); 2498 util_dynarray_clear(&cmd_buffer->depth_bias_array); 2499 2500 cmd_buffer->state.status = VK_SUCCESS; 2501 cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL; 2502 } 2503} 2504 2505VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer, 2506 const VkCommandBufferBeginInfo *pBeginInfo) 2507{ 2508 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 2509 struct pvr_cmd_buffer_state *state; 2510 VkResult result; 2511 2512 pvr_cmd_buffer_reset(cmd_buffer); 2513 2514 cmd_buffer->usage_flags = pBeginInfo->flags; 2515 state = &cmd_buffer->state; 2516 2517 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for 2518 * primary level command buffers. 2519 * 2520 * From the Vulkan 1.0 spec: 2521 * 2522 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a 2523 * secondary command buffer is considered to be entirely inside a render 2524 * pass. If this is a primary command buffer, then this bit is ignored. 
2525 */ 2526 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { 2527 cmd_buffer->usage_flags &= 2528 ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; 2529 } 2530 2531 if (cmd_buffer->usage_flags & 2532 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { 2533 const VkCommandBufferInheritanceInfo *inheritance_info = 2534 pBeginInfo->pInheritanceInfo; 2535 struct pvr_render_pass *pass; 2536 2537 pass = pvr_render_pass_from_handle(inheritance_info->renderPass); 2538 state->render_pass_info.pass = pass; 2539 state->render_pass_info.framebuffer = 2540 pvr_framebuffer_from_handle(inheritance_info->framebuffer); 2541 state->render_pass_info.subpass_idx = inheritance_info->subpass; 2542 state->render_pass_info.userpass_spawn = 2543 pass->subpasses[inheritance_info->subpass].userpass_spawn; 2544 2545 result = 2546 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); 2547 if (result != VK_SUCCESS) 2548 return result; 2549 } 2550 2551 memset(state->barriers_needed, 2552 0xFF, 2553 sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed)); 2554 2555 cmd_buffer->status = PVR_CMD_BUFFER_STATUS_RECORDING; 2556 2557 return VK_SUCCESS; 2558} 2559 2560VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer, 2561 struct pvr_transfer_cmd *transfer_cmd) 2562{ 2563 struct pvr_sub_cmd_transfer *sub_cmd; 2564 VkResult result; 2565 2566 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER); 2567 if (result != VK_SUCCESS) 2568 return result; 2569 2570 sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer; 2571 2572 list_addtail(&transfer_cmd->link, &sub_cmd->transfer_cmds); 2573 2574 return VK_SUCCESS; 2575} 2576 2577static void 2578pvr_validate_push_descriptors(struct pvr_cmd_buffer *cmd_buffer, 2579 bool *const push_descriptors_dirty_out) 2580{ 2581 /* TODO: Implement this function, based on ValidatePushDescriptors. 
*/ 2582 pvr_finishme("Add support for push descriptors!"); 2583 *push_descriptors_dirty_out = false; 2584} 2585 2586#define PVR_WRITE(_buffer, _value, _offset, _max) \ 2587 do { \ 2588 __typeof__(_value) __value = _value; \ 2589 uint64_t __offset = _offset; \ 2590 uint32_t __nr_dwords = sizeof(__value) / sizeof(uint32_t); \ 2591 static_assert(__same_type(*_buffer, __value), \ 2592 "Buffer and value type mismatch"); \ 2593 assert((__offset + __nr_dwords) <= (_max)); \ 2594 assert((__offset % __nr_dwords) == 0U); \ 2595 _buffer[__offset / __nr_dwords] = __value; \ 2596 } while (0) 2597 2598static VkResult 2599pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer, 2600 const struct pvr_graphics_pipeline *const gfx_pipeline) 2601{ 2602 const struct pvr_vertex_shader_state *const vertex_state = 2603 &gfx_pipeline->vertex_shader_state; 2604 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 2605 const struct pvr_pds_info *const pds_info = state->pds_shader.info; 2606 const uint8_t *entries; 2607 uint32_t *dword_buffer; 2608 uint64_t *qword_buffer; 2609 struct pvr_bo *pvr_bo; 2610 VkResult result; 2611 2612 result = pvr_cmd_buffer_alloc_mem(cmd_buffer, 2613 cmd_buffer->device->heaps.pds_heap, 2614 pds_info->data_size_in_dwords, 2615 PVR_BO_ALLOC_FLAG_CPU_MAPPED, 2616 &pvr_bo); 2617 if (result != VK_SUCCESS) 2618 return result; 2619 2620 dword_buffer = (uint32_t *)pvr_bo->bo->map; 2621 qword_buffer = (uint64_t *)pvr_bo->bo->map; 2622 2623 entries = (uint8_t *)pds_info->entries; 2624 2625 for (uint32_t i = 0; i < pds_info->entry_count; i++) { 2626 const struct pvr_const_map_entry *const entry_header = 2627 (struct pvr_const_map_entry *)entries; 2628 2629 switch (entry_header->type) { 2630 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: { 2631 const struct pvr_const_map_entry_literal32 *const literal = 2632 (struct pvr_const_map_entry_literal32 *)entries; 2633 2634 PVR_WRITE(dword_buffer, 2635 literal->literal_value, 2636 literal->const_offset, 2637 pds_info->data_size_in_dwords); 2638 2639 entries += sizeof(*literal); 2640 break; 2641 } 2642 2643 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: { 2644 const struct pvr_const_map_entry_doutu_address *const doutu_addr = 2645 (struct pvr_const_map_entry_doutu_address *)entries; 2646 const pvr_dev_addr_t exec_addr = 2647 PVR_DEV_ADDR_OFFSET(vertex_state->bo->vma->dev_addr, 2648 vertex_state->entry_offset); 2649 uint64_t addr = 0ULL; 2650 2651 pvr_set_usc_execution_address64(&addr, exec_addr.addr); 2652 2653 PVR_WRITE(qword_buffer, 2654 addr | doutu_addr->doutu_control, 2655 doutu_addr->const_offset, 2656 pds_info->data_size_in_dwords); 2657 2658 entries += sizeof(*doutu_addr); 2659 break; 2660 } 2661 2662 case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: { 2663 const struct pvr_const_map_entry_base_instance *const base_instance = 2664 (struct pvr_const_map_entry_base_instance *)entries; 2665 2666 PVR_WRITE(dword_buffer, 2667 state->draw_state.base_instance, 2668 base_instance->const_offset, 2669 pds_info->data_size_in_dwords); 2670 2671 entries += sizeof(*base_instance); 2672 break; 2673 } 2674 2675 case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: { 2676 const struct pvr_const_map_entry_vertex_attribute_address 2677 *const attribute = 2678 (struct pvr_const_map_entry_vertex_attribute_address *)entries; 2679 const struct pvr_vertex_binding *const binding = 2680 &state->vertex_bindings[attribute->binding_index]; 2681 const pvr_dev_addr_t addr = 2682 PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr, 2683 binding->offset + 
attribute->offset); 2684 2685 PVR_WRITE(qword_buffer, 2686 addr.addr, 2687 attribute->const_offset, 2688 pds_info->data_size_in_dwords); 2689 2690 entries += sizeof(*attribute); 2691 break; 2692 } 2693 2694 default: 2695 unreachable("Unsupported data section map"); 2696 break; 2697 } 2698 } 2699 2700 state->pds_vertex_attrib_offset = 2701 pvr_bo->vma->dev_addr.addr - 2702 cmd_buffer->device->heaps.pds_heap->base_addr.addr; 2703 2704 pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo); 2705 2706 return VK_SUCCESS; 2707} 2708 2709static VkResult pvr_setup_descriptor_mappings( 2710 struct pvr_cmd_buffer *const cmd_buffer, 2711 enum pvr_stage_allocation stage, 2712 const struct pvr_stage_allocation_descriptor_state *descriptor_state, 2713 UNUSED const pvr_dev_addr_t *const num_worgroups_buff_addr, 2714 uint32_t *const descriptor_data_offset_out) 2715{ 2716 const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info; 2717 const struct pvr_descriptor_state *desc_state; 2718 const uint8_t *entries; 2719 uint32_t *dword_buffer; 2720 uint64_t *qword_buffer; 2721 struct pvr_bo *pvr_bo; 2722 VkResult result; 2723 2724 pvr_finishme("Handle num_worgroups_buff_addr"); 2725 2726 if (!pds_info->data_size_in_dwords) 2727 return VK_SUCCESS; 2728 2729 result = pvr_cmd_buffer_alloc_mem(cmd_buffer, 2730 cmd_buffer->device->heaps.pds_heap, 2731 pds_info->data_size_in_dwords, 2732 PVR_BO_ALLOC_FLAG_CPU_MAPPED, 2733 &pvr_bo); 2734 if (result != VK_SUCCESS) 2735 return result; 2736 2737 dword_buffer = (uint32_t *)pvr_bo->bo->map; 2738 qword_buffer = (uint64_t *)pvr_bo->bo->map; 2739 2740 entries = (uint8_t *)pds_info->entries; 2741 2742 switch (stage) { 2743 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY: 2744 case PVR_STAGE_ALLOCATION_FRAGMENT: 2745 desc_state = &cmd_buffer->state.gfx_desc_state; 2746 break; 2747 2748 case PVR_STAGE_ALLOCATION_COMPUTE: 2749 desc_state = &cmd_buffer->state.compute_desc_state; 2750 break; 2751 2752 default: 2753 unreachable("Unsupported stage."); 2754 break; 2755 } 2756 2757 for (uint32_t i = 0; i < pds_info->entry_count; i++) { 2758 const struct pvr_const_map_entry *const entry_header = 2759 (struct pvr_const_map_entry *)entries; 2760 2761 /* TODO: See if instead of reusing the blend constant buffer type entry, 2762 * we can setup a new buffer type specifically for num_workgroups or other 2763 * built-in variables. The mappings are setup at pipeline creation when 2764 * creating the descriptor program. 2765 */ 2766 pvr_finishme("Handle blend constant reuse for compute."); 2767 2768 switch (entry_header->type) { 2769 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: { 2770 const struct pvr_const_map_entry_literal32 *const literal = 2771 (struct pvr_const_map_entry_literal32 *)entries; 2772 2773 PVR_WRITE(dword_buffer, 2774 literal->literal_value, 2775 literal->const_offset, 2776 pds_info->data_size_in_dwords); 2777 2778 entries += sizeof(*literal); 2779 break; 2780 } 2781 2782 case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: { 2783 const struct pvr_const_map_entry_constant_buffer *const_buffer_entry = 2784 (struct pvr_const_map_entry_constant_buffer *)entries; 2785 const uint32_t desc_set = const_buffer_entry->desc_set; 2786 const uint32_t binding = const_buffer_entry->binding; 2787 const struct pvr_descriptor_set *descriptor_set; 2788 const struct pvr_descriptor *descriptor; 2789 pvr_dev_addr_t buffer_addr; 2790 2791 /* TODO: Handle push descriptors. 
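          * (Worked example of the address computation a few lines down: an
          * entry with offset 8 dwords against a uniform buffer at device
          * address A yields buffer_addr = A + 8 * sizeof(uint32_t) = A + 32
          * bytes, which is then written into the PDS data section as a
          * qword.)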
*/ 2792 2793 assert(desc_set < PVR_MAX_DESCRIPTOR_SETS); 2794 descriptor_set = desc_state->descriptor_sets[desc_set]; 2795 2796 /* TODO: Handle dynamic buffers. */ 2797 descriptor = &descriptor_set->descriptors[binding]; 2798 assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); 2799 2800 assert(descriptor->buffer_desc_range == 2801 const_buffer_entry->size_in_dwords * sizeof(uint32_t)); 2802 assert(descriptor->buffer_create_info_size == 2803 const_buffer_entry->size_in_dwords * sizeof(uint32_t)); 2804 2805 buffer_addr = 2806 PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr, 2807 const_buffer_entry->offset * sizeof(uint32_t)); 2808 2809 PVR_WRITE(qword_buffer, 2810 buffer_addr.addr, 2811 const_buffer_entry->const_offset, 2812 pds_info->data_size_in_dwords); 2813 2814 entries += sizeof(*const_buffer_entry); 2815 break; 2816 } 2817 2818 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: { 2819 const struct pvr_const_map_entry_descriptor_set *desc_set_entry = 2820 (struct pvr_const_map_entry_descriptor_set *)entries; 2821 const uint32_t desc_set_num = desc_set_entry->descriptor_set; 2822 const struct pvr_descriptor_set *descriptor_set; 2823 pvr_dev_addr_t desc_set_addr; 2824 2825 assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS); 2826 2827 /* TODO: Remove this when the compiler provides us with usage info? 2828 */ 2829 /* We skip DMAing unbound descriptor sets. */ 2830 if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) { 2831 const struct pvr_const_map_entry_literal32 *literal; 2832 uint32_t zero_literal_value; 2833 2834 entries += sizeof(*desc_set_entry); 2835 literal = (struct pvr_const_map_entry_literal32 *)entries; 2836 2837 /* TODO: Is there any guarantee that a literal will follow the 2838 * descriptor set entry? 2839 */ 2840 assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32); 2841 2842 /* We zero out the DMA size so the DMA isn't performed. 
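          * Concretely: the AND with the BSIZE clear mask below leaves every
          * other field of the DOUTD src1 word untouched while forcing the
          * burst size to zero, and the matching address qword is written as
          * 0, so the descriptor DMA for this unbound set becomes a no-op.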
*/ 2843 zero_literal_value = 2844 literal->literal_value & 2845 PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK; 2846 2847 PVR_WRITE(qword_buffer, 2848 UINT64_C(0), 2849 desc_set_entry->const_offset, 2850 pds_info->data_size_in_dwords); 2851 2852 PVR_WRITE(dword_buffer, 2853 zero_literal_value, 2854 desc_set_entry->const_offset, 2855 pds_info->data_size_in_dwords); 2856 2857 entries += sizeof(*literal); 2858 i++; 2859 continue; 2860 } 2861 2862 descriptor_set = desc_state->descriptor_sets[desc_set_num]; 2863 2864 pvr_finishme("Handle push descriptor entry."); 2865 2866 desc_set_addr = descriptor_set->pvr_bo->vma->dev_addr; 2867 2868 if (desc_set_entry->primary) { 2869 desc_set_addr = PVR_DEV_ADDR_OFFSET( 2870 desc_set_addr, 2871 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage] 2872 .primary_offset 2873 << 2U); 2874 } else { 2875 desc_set_addr = PVR_DEV_ADDR_OFFSET( 2876 desc_set_addr, 2877 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage] 2878 .secondary_offset 2879 << 2U); 2880 } 2881 2882 desc_set_addr = PVR_DEV_ADDR_OFFSET( 2883 desc_set_addr, 2884 (uint64_t)desc_set_entry->offset_in_dwords << 2U); 2885 2886 PVR_WRITE(qword_buffer, 2887 desc_set_addr.addr, 2888 desc_set_entry->const_offset, 2889 pds_info->data_size_in_dwords); 2890 2891 entries += sizeof(*desc_set_entry); 2892 break; 2893 } 2894 2895 case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: { 2896 const struct pvr_const_map_entry_special_buffer *special_buff_entry = 2897 (struct pvr_const_map_entry_special_buffer *)entries; 2898 2899 switch (special_buff_entry->buffer_type) { 2900 case PVR_BUFFER_TYPES_COMPILE_TIME: { 2901 uint64_t addr = descriptor_state->static_consts->vma->dev_addr.addr; 2902 2903 PVR_WRITE(qword_buffer, 2904 addr, 2905 special_buff_entry->const_offset, 2906 pds_info->data_size_in_dwords); 2907 break; 2908 } 2909 2910 default: 2911 unreachable("Unsupported special buffer type."); 2912 } 2913 2914 entries += sizeof(*special_buff_entry); 2915 break; 2916 } 2917 2918 default: 2919 unreachable("Unsupported map entry type."); 2920 } 2921 } 2922 2923 pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo); 2924 2925 *descriptor_data_offset_out = 2926 pvr_bo->vma->dev_addr.addr - 2927 cmd_buffer->device->heaps.pds_heap->base_addr.addr; 2928 2929 return VK_SUCCESS; 2930} 2931 2932#undef PVR_WRITE 2933 2934static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer, 2935 struct pvr_sub_cmd_compute *const sub_cmd) 2936{ 2937 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; 2938 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 2939 struct pvr_csb *csb = &sub_cmd->control_stream; 2940 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline; 2941 const uint32_t const_shared_reg_count = 2942 pipeline->state.shader.const_shared_reg_count; 2943 struct pvr_compute_kernel_info info; 2944 2945 /* No shared regs, no need to use an allocation kernel. 
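    * When shared registers are present, usc_common_size below is simply
    * DIV_ROUND_UP(const_shared_reg_count, unit); e.g. 9 shared registers
    * with a unit size of 4 (a figure assumed purely for illustration) would
    * give a common size of 3 allocation units.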
    */
   if (!const_shared_reg_count)
      return;

   info = (struct pvr_compute_kernel_info){
      .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
      .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),

      .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
      .usc_common_shared = true,
      .usc_common_size =
         DIV_ROUND_UP(const_shared_reg_count,
                      PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),

      .local_size = { 1, 1, 1 },
      .global_size = { 1, 1, 1 },
   };

   /* Sometimes we don't have a secondary program if there were no constants
    * to write, but we still need to run a PDS program to accomplish the
    * allocation of the local/common store shared registers, so we repurpose
    * the deallocation PDS program.
    */
   if (pipeline->state.descriptor.pds_info.code_size_in_dwords) {
      uint32_t pds_data_size_in_dwords =
         pipeline->state.descriptor.pds_info.data_size_in_dwords;

      info.pds_data_offset = state->pds_compute_descriptor_data_offset;
      info.pds_data_size =
         DIV_ROUND_UP(pds_data_size_in_dwords << 2U,
                      PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));

      /* Check that we have uploaded the code section. */
      assert(pipeline->state.descriptor.pds_code.code_size);
      info.pds_code_offset = pipeline->state.descriptor.pds_code.code_offset;
   } else {
      /* FIXME: There should be a deallocation PDS program already uploaded
       * that we use at this point.
       */
      assert(!"Unimplemented");
   }

   /* We don't need to pad the workgroup size. */

   info.max_instances =
      pvr_compute_flat_slot_size(pdevice, const_shared_reg_count, false, 1U);

   pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}

static uint32_t
pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
                                    uint32_t workgroup_size,
                                    uint32_t coeff_regs_count)
{
   const struct pvr_device_runtime_info *dev_runtime_info =
      &pdevice->dev_runtime_info;
   const struct pvr_device_info *dev_info = &pdevice->dev_info;
   uint32_t max_avail_coeff_regs =
      dev_runtime_info->cdm_max_local_mem_size_regs;
   uint32_t coeff_regs_count_aligned =
      ALIGN_POT(coeff_regs_count,
                PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);

   /* If the workgroup size is greater than ROGUE_MAX_INSTANCES_PER_TASK, we
    * always pad it up to the next multiple of ROGUE_MAX_INSTANCES_PER_TASK.
    *
    * We do the same if more than 1/8th of the maximum coefficient registers
    * would be used.
    */
   /* TODO: See if this can be optimized. */
   if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
       coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
      assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));

      return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
   }

   return workgroup_size;
}

/* TODO: Wire up the base_workgroup variant program when implementing
 * VK_KHR_device_group. The values will also need patching into the program.
3029 */ 3030static void pvr_compute_update_kernel( 3031 struct pvr_cmd_buffer *cmd_buffer, 3032 struct pvr_sub_cmd_compute *const sub_cmd, 3033 const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]) 3034{ 3035 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; 3036 const struct pvr_device_runtime_info *dev_runtime_info = 3037 &pdevice->dev_runtime_info; 3038 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 3039 struct pvr_csb *csb = &sub_cmd->control_stream; 3040 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline; 3041 const struct pvr_pds_info *program_info = 3042 &pipeline->state.primary_program_info; 3043 3044 struct pvr_compute_kernel_info info = { 3045 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID, 3046 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY), 3047 .pds_temp_size = 3048 DIV_ROUND_UP(program_info->temps_required << 2U, 3049 PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)), 3050 3051 .pds_data_size = 3052 DIV_ROUND_UP(program_info->data_size_in_dwords << 2U, 3053 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)), 3054 .pds_data_offset = pipeline->state.primary_program.data_offset, 3055 .pds_code_offset = pipeline->state.primary_program.code_offset, 3056 3057 .sd_type = PVRX(CDMCTRL_SD_TYPE_USC), 3058 3059 .usc_unified_size = 3060 DIV_ROUND_UP(pipeline->state.shader.input_register_count << 2U, 3061 PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)), 3062 3063 /* clang-format off */ 3064 .global_size = { 3065 global_workgroup_size[0], 3066 global_workgroup_size[1], 3067 global_workgroup_size[2] 3068 }, 3069 /* clang-format on */ 3070 }; 3071 3072 uint32_t work_size = pipeline->state.shader.work_size; 3073 uint32_t coeff_regs; 3074 3075 if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) { 3076 /* Enforce a single workgroup per cluster through allocation starvation. 3077 */ 3078 coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs; 3079 } else { 3080 coeff_regs = pipeline->state.shader.coefficient_register_count; 3081 } 3082 3083 info.usc_common_size = 3084 DIV_ROUND_UP(coeff_regs << 2U, 3085 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)); 3086 3087 /* Use a whole slot per workgroup. 
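    * Illustration (figure assumed for the example only): with
    * ROGUE_MAX_INSTANCES_PER_TASK = 32, a shader work_size of 20 is raised
    * to 32 by the MAX2() below so each workgroup occupies a full slot;
    * pvr_compute_flat_pad_workgroup_size() then only pads further when the
    * coefficient register budget requires it.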
*/ 3088 work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK); 3089 3090 coeff_regs += pipeline->state.shader.const_shared_reg_count; 3091 3092 work_size = 3093 pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs); 3094 3095 info.local_size[0] = work_size; 3096 info.local_size[1] = 1U; 3097 info.local_size[2] = 1U; 3098 3099 info.max_instances = 3100 pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size); 3101 3102 pvr_compute_generate_control_stream(csb, sub_cmd, &info); 3103} 3104 3105void pvr_CmdDispatch(VkCommandBuffer commandBuffer, 3106 uint32_t groupCountX, 3107 uint32_t groupCountY, 3108 uint32_t groupCountZ) 3109{ 3110 const uint32_t workgroup_size[] = { groupCountX, groupCountY, groupCountZ }; 3111 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 3112 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 3113 const struct pvr_compute_pipeline *compute_pipeline = 3114 state->compute_pipeline; 3115 const VkShaderStageFlags push_consts_stage_mask = 3116 compute_pipeline->base.layout->push_constants_shader_stages; 3117 bool push_descriptors_dirty; 3118 struct pvr_sub_cmd_compute *sub_cmd; 3119 VkResult result; 3120 3121 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 3122 assert(compute_pipeline); 3123 3124 if (!groupCountX || !groupCountY || !groupCountZ) 3125 return; 3126 3127 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE); 3128 3129 sub_cmd = &state->current_sub_cmd->compute; 3130 3131 sub_cmd->uses_atomic_ops |= compute_pipeline->state.shader.uses_atomic_ops; 3132 sub_cmd->uses_barrier |= compute_pipeline->state.shader.uses_barrier; 3133 3134 if (push_consts_stage_mask & VK_SHADER_STAGE_COMPUTE_BIT) { 3135 /* TODO: Add a dirty push constants mask in the cmd_buffer state and 3136 * check for dirty compute stage. 
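       * One possible shape, sketched here only as a comment: reuse the
       * dirty_stages mask that pvr_CmdPushConstants() already maintains,
       * e.g.
       *
       *    if (state->push_constants.dirty_stages &
       *        VK_SHADER_STAGE_COMPUTE_BIT) {
       *       ...upload/patch the compute push constant data...
       *       state->push_constants.dirty_stages &=
       *          ~VK_SHADER_STAGE_COMPUTE_BIT;
       *    }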
3137 */ 3138 pvr_finishme("Add support for push constants."); 3139 } 3140 3141 pvr_validate_push_descriptors(cmd_buffer, &push_descriptors_dirty); 3142 3143 if (compute_pipeline->state.shader.uses_num_workgroups) { 3144 struct pvr_bo *num_workgroups_bo; 3145 3146 result = pvr_cmd_buffer_upload_general(cmd_buffer, 3147 workgroup_size, 3148 sizeof(workgroup_size), 3149 &num_workgroups_bo); 3150 if (result != VK_SUCCESS) 3151 return; 3152 3153 result = pvr_setup_descriptor_mappings( 3154 cmd_buffer, 3155 PVR_STAGE_ALLOCATION_COMPUTE, 3156 &compute_pipeline->state.descriptor, 3157 &num_workgroups_bo->vma->dev_addr, 3158 &state->pds_compute_descriptor_data_offset); 3159 if (result != VK_SUCCESS) 3160 return; 3161 } else if ((compute_pipeline->base.layout 3162 ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] && 3163 state->dirty.compute_desc_dirty) || 3164 state->dirty.compute_pipeline_binding || push_descriptors_dirty) { 3165 result = pvr_setup_descriptor_mappings( 3166 cmd_buffer, 3167 PVR_STAGE_ALLOCATION_COMPUTE, 3168 &compute_pipeline->state.descriptor, 3169 NULL, 3170 &state->pds_compute_descriptor_data_offset); 3171 if (result != VK_SUCCESS) 3172 return; 3173 } 3174 3175 pvr_compute_update_shared(cmd_buffer, sub_cmd); 3176 3177 pvr_compute_update_kernel(cmd_buffer, sub_cmd, workgroup_size); 3178} 3179 3180void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer, 3181 VkBuffer _buffer, 3182 VkDeviceSize offset) 3183{ 3184 assert(!"Unimplemented"); 3185} 3186 3187static void 3188pvr_update_draw_state(struct pvr_cmd_buffer_state *const state, 3189 const struct pvr_cmd_buffer_draw_state *const draw_state) 3190{ 3191 /* We don't have a state to tell us that base_instance is being used so it 3192 * gets used as a boolean - 0 means we'll use a pds program that skips the 3193 * base instance addition. If the base_instance gets used (and the last 3194 * draw's base_instance was 0) then we switch to the BASE_INSTANCE attrib 3195 * program. 3196 * 3197 * If base_instance changes then we only need to update the data section. 3198 * 3199 * The only draw call state that doesn't really matter is the start vertex 3200 * as that is handled properly in the VDM state in all cases. 
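    * Illustration of the checks below: a vkCmdDraw() with firstInstance = 0
    * followed by a vkCmdDrawIndexed() with firstInstance = 3 flips
    * draw_indexed and turns base_instance non-zero, so dirty.draw_variant is
    * set; a further indexed draw with firstInstance = 5 only changes the
    * base instance value, so just dirty.draw_base_instance is set and only
    * the data section needs updating.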
3201 */ 3202 if ((state->draw_state.draw_indexed != draw_state->draw_indexed) || 3203 (state->draw_state.draw_indirect != draw_state->draw_indirect) || 3204 (state->draw_state.base_instance == 0 && 3205 draw_state->base_instance != 0)) { 3206 state->dirty.draw_variant = true; 3207 } else if (state->draw_state.base_instance != draw_state->base_instance) { 3208 state->dirty.draw_base_instance = true; 3209 } 3210 3211 state->draw_state = *draw_state; 3212} 3213 3214static uint32_t pvr_calc_shared_regs_count( 3215 const struct pvr_graphics_pipeline *const gfx_pipeline) 3216{ 3217 const struct pvr_pipeline_stage_state *const vertex_state = 3218 &gfx_pipeline->vertex_shader_state.stage_state; 3219 uint32_t shared_regs = vertex_state->const_shared_reg_count + 3220 vertex_state->const_shared_reg_offset; 3221 3222 if (gfx_pipeline->fragment_shader_state.bo) { 3223 const struct pvr_pipeline_stage_state *const fragment_state = 3224 &gfx_pipeline->fragment_shader_state.stage_state; 3225 uint32_t fragment_regs = fragment_state->const_shared_reg_count + 3226 fragment_state->const_shared_reg_offset; 3227 3228 shared_regs = MAX2(shared_regs, fragment_regs); 3229 } 3230 3231 return shared_regs; 3232} 3233 3234static void 3235pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer, 3236 struct pvr_sub_cmd_gfx *const sub_cmd, 3237 const uint32_t pds_vertex_descriptor_data_offset) 3238{ 3239 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 3240 const struct pvr_stage_allocation_descriptor_state 3241 *const vertex_descriptor_state = 3242 &state->gfx_pipeline->vertex_shader_state.descriptor_state; 3243 const struct pvr_pipeline_stage_state *const vertex_stage_state = 3244 &state->gfx_pipeline->vertex_shader_state.stage_state; 3245 struct pvr_csb *const csb = &sub_cmd->control_stream; 3246 3247 if (!vertex_descriptor_state->pds_info.code_size_in_dwords) 3248 return; 3249 3250 pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) { 3251 state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL); 3252 3253 state0.usc_common_size = 3254 DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2, 3255 PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE)); 3256 3257 state0.pds_data_size = DIV_ROUND_UP( 3258 vertex_descriptor_state->pds_info.data_size_in_dwords << 2, 3259 PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE)); 3260 } 3261 3262 pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) { 3263 state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset); 3264 state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE); 3265 } 3266 3267 pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) { 3268 state2.pds_code_addr = 3269 PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset); 3270 } 3271} 3272 3273static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer) 3274{ 3275 struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state; 3276 const struct pvr_graphics_pipeline *const gfx_pipeline = 3277 cmd_buffer->state.gfx_pipeline; 3278 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; 3279 const struct pvr_vertex_shader_state *const vertex_state = 3280 &gfx_pipeline->vertex_shader_state; 3281 uint32_t output_selects; 3282 3283 /* TODO: Handle vertex and fragment shader state flags. 
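    * (On the packing below: vtxsize appears to be the vertex output size in
    * 4-dword granules, so e.g. a vertex_output_size of 10 packs as
    * DIV_ROUND_UP(10, 4) = 3; the unit is an assumption based only on the
    * DIV_ROUND_UP in this function.)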
*/ 3284 3285 pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) { 3286 const VkPrimitiveTopology topology = 3287 gfx_pipeline->input_asm_state.topology; 3288 3289 state.rhw_pres = true; 3290 state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U); 3291 state.psprite_size_pres = (topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST); 3292 } 3293 3294 if (ppp_state->output_selects != output_selects) { 3295 ppp_state->output_selects = output_selects; 3296 emit_state->output_selects = true; 3297 } 3298 3299 if (ppp_state->varying_word[0] != vertex_state->varying[0]) { 3300 ppp_state->varying_word[0] = vertex_state->varying[0]; 3301 emit_state->varying_word0 = true; 3302 } 3303 3304 if (ppp_state->varying_word[1] != vertex_state->varying[1]) { 3305 ppp_state->varying_word[1] = vertex_state->varying[1]; 3306 emit_state->varying_word1 = true; 3307 } 3308} 3309 3310static void 3311pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer, 3312 struct PVRX(TA_STATE_ISPA) *const ispa_out) 3313{ 3314 struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state; 3315 const struct pvr_graphics_pipeline *const gfx_pipeline = 3316 cmd_buffer->state.gfx_pipeline; 3317 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; 3318 const struct pvr_dynamic_state *const dynamic_state = 3319 &cmd_buffer->state.dynamic.common; 3320 const struct pvr_render_pass_info *const pass_info = 3321 &cmd_buffer->state.render_pass_info; 3322 const uint32_t subpass_idx = pass_info->subpass_idx; 3323 const uint32_t *depth_stencil_attachment_idx = 3324 pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment; 3325 const struct pvr_image_view *const attachment = 3326 (!depth_stencil_attachment_idx) 3327 ? NULL 3328 : pass_info->attachments[*depth_stencil_attachment_idx]; 3329 3330 const VkCullModeFlags cull_mode = gfx_pipeline->raster_state.cull_mode; 3331 const bool raster_discard_enabled = 3332 gfx_pipeline->raster_state.discard_enable; 3333 const bool disable_all = raster_discard_enabled || !attachment; 3334 3335 const VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology; 3336 const enum PVRX(TA_OBJTYPE) obj_type = pvr_ta_objtype(topology); 3337 3338 const bool disable_stencil_write = disable_all; 3339 const bool disable_stencil_test = 3340 disable_all || !vk_format_has_stencil(attachment->vk.format); 3341 3342 const bool disable_depth_write = disable_all; 3343 const bool disable_depth_test = disable_all || 3344 !vk_format_has_depth(attachment->vk.format); 3345 3346 uint32_t ispb_stencil_off; 3347 bool is_two_sided = false; 3348 uint32_t isp_control; 3349 3350 uint32_t line_width; 3351 uint32_t common_a; 3352 uint32_t front_a; 3353 uint32_t front_b; 3354 uint32_t back_a; 3355 uint32_t back_b; 3356 3357 /* Convert to 4.4 fixed point format. */ 3358 line_width = util_unsigned_fixed(dynamic_state->line_width, 4); 3359 3360 /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16]. 3361 * If 0 it stays at 0, otherwise we subtract 1. 3362 */ 3363 line_width = (!!line_width) * (line_width - 1); 3364 3365 line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX)); 3366 3367 /* TODO: Part of the logic in this function is duplicated in another part 3368 * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier? 
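    * (Separate note on the pointlinewidth value computed above: a lineWidth
    * of 1.0 becomes 16 in 4.4 fixed point and 15 after the subtract-one
    * shift, encoding 1.0 in the [0 = 1/16, 255 = 16] range; a lineWidth of
    * 0.0 stays 0, which that range maps to 1/16.)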
    */

   pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
      ispa.pointlinewidth = line_width;

      if (disable_depth_test)
         ispa.dcmpmode = PVRX(TA_CMPMODE_ALWAYS);
      else
         ispa.dcmpmode = pvr_ta_cmpmode(gfx_pipeline->depth_compare_op);

      /* FIXME: Can we just have this and remove the assignment above?
       * The user provides a depthTestEnable at vkCreateGraphicsPipelines(),
       * should we be using that?
       */
      ispa.dcmpmode |= gfx_pipeline->depth_compare_op;

      ispa.dwritedisable = disable_depth_test || disable_depth_write;
      /* FIXME: Can we just have this and remove the assignment above? */
      ispa.dwritedisable = ispa.dwritedisable ||
                           gfx_pipeline->depth_write_disable;

      ispa.passtype = gfx_pipeline->fragment_shader_state.pass_type;

      ispa.objtype = obj_type;

      /* Return the unpacked ispa structure. dcmpmode, dwritedisable, passtype
       * and objtype are needed by pvr_setup_triangle_merging_flag().
       */
      if (ispa_out)
         *ispa_out = ispa;
   }

   /* FIXME: This logic should be redone and improved. Can we also get rid of
    * the front and back variants?
    */

   pvr_csb_pack (&front_a, TA_STATE_ISPA, ispa) {
      ispa.sref = (!disable_stencil_test) * dynamic_state->reference.front;
   }
   front_a |= common_a;

   pvr_csb_pack (&back_a, TA_STATE_ISPA, ispa) {
      ispa.sref = (!disable_stencil_test) * dynamic_state->reference.back;
   }
   back_a |= common_a;

   /* TODO: Does this actually represent the ispb control word on stencil off?
    * If not, rename the variable.
    */
   pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
      ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP);
      ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP);
      ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP);
      ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS);
   }

   if (disable_stencil_test) {
      back_b = front_b = ispb_stencil_off;
   } else {
      pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
         ispb.swmask =
            (!disable_stencil_write) * dynamic_state->write_mask.front;
         ispb.scmpmask = dynamic_state->compare_mask.front;

         ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_front.pass_op);
         ispb.sop2 =
            pvr_ta_stencilop(gfx_pipeline->stencil_front.depth_fail_op);
         ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_front.fail_op);

         ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_front.compare_op);
      }

      pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
         ispb.swmask =
            (!disable_stencil_write) * dynamic_state->write_mask.back;
         ispb.scmpmask = dynamic_state->compare_mask.back;

         ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_back.pass_op);
         ispb.sop2 = pvr_ta_stencilop(gfx_pipeline->stencil_back.depth_fail_op);
         ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_back.fail_op);

         ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_back.compare_op);
      }
   }

   if (front_a != back_a || front_b != back_b) {
      if (cull_mode & VK_CULL_MODE_BACK_BIT) {
         /* Single face, using front state. */
      } else if (cull_mode & VK_CULL_MODE_FRONT_BIT) {
         /* Single face, using back state. */

         front_a = back_a;
         front_b = back_b;
      } else {
         /* Both faces.
*/ 3464 3465 emit_state->isp_ba = is_two_sided = true; 3466 3467 if (gfx_pipeline->raster_state.front_face == 3468 VK_FRONT_FACE_COUNTER_CLOCKWISE) { 3469 uint32_t tmp = front_a; 3470 3471 front_a = back_a; 3472 back_a = tmp; 3473 3474 tmp = front_b; 3475 front_b = back_b; 3476 back_b = tmp; 3477 } 3478 3479 /* HW defaults to stencil off. */ 3480 if (back_b != ispb_stencil_off) 3481 emit_state->isp_fb = emit_state->isp_bb = true; 3482 } 3483 } 3484 3485 if (!disable_stencil_test && front_b != ispb_stencil_off) 3486 emit_state->isp_fb = true; 3487 3488 pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) { 3489 ispctl.upass = pass_info->userpass_spawn; 3490 3491 /* TODO: is bo ever NULL? Figure out what to do. */ 3492 ispctl.tagwritedisable = raster_discard_enabled || 3493 !gfx_pipeline->fragment_shader_state.bo; 3494 3495 ispctl.two_sided = is_two_sided; 3496 ispctl.bpres = emit_state->isp_fb || emit_state->isp_bb; 3497 3498 ispctl.dbenable = !raster_discard_enabled && 3499 gfx_pipeline->raster_state.depth_bias_enable && 3500 obj_type == PVRX(TA_OBJTYPE_TRIANGLE); 3501 ispctl.scenable = !raster_discard_enabled; 3502 3503 ppp_state->isp.control_struct = ispctl; 3504 } 3505 3506 emit_state->isp = true; 3507 3508 ppp_state->isp.control = isp_control; 3509 ppp_state->isp.front_a = front_a; 3510 ppp_state->isp.front_b = front_b; 3511 ppp_state->isp.back_a = back_a; 3512 ppp_state->isp.back_b = back_b; 3513} 3514 3515static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport, 3516 const VkRect2D *const scissor, 3517 VkRect2D *const rect_out) 3518{ 3519 /* TODO: See if we can remove this struct. */ 3520 struct pvr_rect { 3521 int32_t x0, y0; 3522 int32_t x1, y1; 3523 }; 3524 3525 /* TODO: Worry about overflow? */ 3526 const struct pvr_rect scissor_rect = { 3527 .x0 = scissor->offset.x, 3528 .y0 = scissor->offset.y, 3529 .x1 = scissor->offset.x + scissor->extent.width, 3530 .y1 = scissor->offset.y + scissor->extent.height 3531 }; 3532 struct pvr_rect viewport_rect = { 0 }; 3533 3534 assert(viewport->width >= 0.0f); 3535 assert(scissor_rect.x0 >= 0); 3536 assert(scissor_rect.y0 >= 0); 3537 3538 if (scissor->extent.width == 0 || scissor->extent.height == 0) { 3539 *rect_out = (VkRect2D){ 0 }; 3540 return; 3541 } 3542 3543 viewport_rect.x0 = (int32_t)viewport->x; 3544 viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width; 3545 3546 /* TODO: Is there a mathematical way of doing all this and then clamp at 3547 * the end? 3548 */ 3549 /* We flip the y0 and y1 when height is negative. */ 3550 viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height); 3551 viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height); 3552 3553 if (scissor_rect.x1 <= viewport_rect.x0 || 3554 scissor_rect.y1 <= viewport_rect.y0 || 3555 scissor_rect.x0 >= viewport_rect.x1 || 3556 scissor_rect.y0 >= viewport_rect.y1) { 3557 *rect_out = (VkRect2D){ 0 }; 3558 return; 3559 } 3560 3561 /* Determine the overlapping rectangle. */ 3562 viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0); 3563 viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0); 3564 viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1); 3565 viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1); 3566 3567 /* TODO: Is this conversion safe? Is this logic right? 
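    * The casts appear to be safe: x0/y0 are clamped against the scissor
    * origin, which is asserted to be non-negative above, and the early-out
    * checks guarantee x1 >= x0 and y1 >= y0, so neither the conversions nor
    * the subtractions below should wrap.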
*/ 3568 rect_out->offset.x = (uint32_t)viewport_rect.x0; 3569 rect_out->offset.y = (uint32_t)viewport_rect.y0; 3570 rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0); 3571 rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0); 3572} 3573 3574static inline uint32_t 3575pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info) 3576{ 3577 /* TODO: This should come from rogue_ppp.xml. */ 3578 return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16)); 3579} 3580 3581/* FIXME: Remove device param when PVR_HAS_FEATURE() accepts const dev_info */ 3582static void 3583pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer) 3584{ 3585 struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state; 3586 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; 3587 const struct pvr_dynamic_state *const dynamic_state = 3588 &cmd_buffer->state.dynamic.common; 3589 const struct PVRX(TA_STATE_ISPCTL) *const ispctl = 3590 &ppp_state->isp.control_struct; 3591 struct pvr_device_info *const dev_info = 3592 &cmd_buffer->device->pdevice->dev_info; 3593 3594 if (ispctl->dbenable) 3595 assert(!"Unimplemented"); 3596 3597 if (ispctl->scenable) { 3598 const uint32_t region_clip_align_size = 3599 pvr_get_geom_region_clip_align_size(dev_info); 3600 const VkViewport *const viewport = &dynamic_state->viewport.viewports[0]; 3601 const VkRect2D *const scissor = &dynamic_state->scissor.scissors[0]; 3602 VkRect2D overlap_rect; 3603 uint32_t scissor_words[2]; 3604 uint32_t height; 3605 uint32_t width; 3606 uint32_t x; 3607 uint32_t y; 3608 3609 /* For region clip. */ 3610 uint32_t bottom; 3611 uint32_t right; 3612 uint32_t left; 3613 uint32_t top; 3614 3615 /* We don't support multiple viewport calculations. */ 3616 assert(dynamic_state->viewport.count == 1); 3617 /* We don't support multiple scissor calculations. */ 3618 assert(dynamic_state->scissor.count == 1); 3619 3620 pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect); 3621 3622 x = overlap_rect.offset.x; 3623 y = overlap_rect.offset.y; 3624 width = overlap_rect.extent.width; 3625 height = overlap_rect.extent.height; 3626 3627 pvr_csb_pack (&scissor_words[0], IPF_SCISSOR_WORD_0, word0) { 3628 word0.scw0_xmax = x + width; 3629 word0.scw0_xmin = x; 3630 } 3631 3632 pvr_csb_pack (&scissor_words[1], IPF_SCISSOR_WORD_1, word1) { 3633 word1.scw1_ymax = y + height; 3634 word1.scw1_ymin = y; 3635 } 3636 3637 if (cmd_buffer->scissor_array.size && 3638 cmd_buffer->scissor_words[0] == scissor_words[0] && 3639 cmd_buffer->scissor_words[1] == scissor_words[1]) { 3640 return; 3641 } 3642 3643 cmd_buffer->scissor_words[0] = scissor_words[0]; 3644 cmd_buffer->scissor_words[1] = scissor_words[1]; 3645 3646 /* Calculate region clip. */ 3647 3648 left = x / region_clip_align_size; 3649 top = y / region_clip_align_size; 3650 3651 /* We prevent right=-1 with the multiplication. */ 3652 /* TODO: Is there a better way of doing this? */ 3653 if ((x + width) != 0U) 3654 right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1; 3655 else 3656 right = 0; 3657 3658 if ((y + height) != 0U) 3659 bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1; 3660 else 3661 bottom = 0U; 3662 3663 /* Setup region clip to clip everything outside what was calculated. */ 3664 3665 /* FIXME: Should we mask to prevent writing over other words? 
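       * As a worked example (illustrative numbers only): without the
       * tile_size_16x16 feature the alignment above is 32, so x = 40 and
       * width = 100 give left = 40 / 32 = 1 and
       * right = DIV_ROUND_UP(140, 32) - 1 = 4.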
*/ 3666 pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) { 3667 word0.right = right; 3668 word0.left = left; 3669 word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE); 3670 } 3671 3672 pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) { 3673 word1.bottom = bottom; 3674 word1.top = top; 3675 } 3676 3677 ppp_state->depthbias_scissor_indices.scissor_index = 3678 util_dynarray_num_elements(&cmd_buffer->scissor_array, 3679 __typeof__(cmd_buffer->scissor_words)); 3680 3681 memcpy(util_dynarray_grow_bytes(&cmd_buffer->scissor_array, 3682 1, 3683 sizeof(cmd_buffer->scissor_words)), 3684 cmd_buffer->scissor_words, 3685 sizeof(cmd_buffer->scissor_words)); 3686 3687 emit_state->isp_dbsc = true; 3688 emit_state->region_clip = true; 3689 } 3690} 3691 3692static void 3693pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer, 3694 struct PVRX(TA_STATE_ISPA) * ispa) 3695{ 3696 struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state; 3697 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state; 3698 uint32_t merge_word; 3699 uint32_t mask; 3700 3701 pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) { 3702 /* Disable for lines or punch-through or for DWD and depth compare 3703 * always. 3704 */ 3705 if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) || 3706 ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) || 3707 (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) { 3708 size_info.pds_tri_merge_disable = true; 3709 } 3710 } 3711 3712 pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) { 3713 size_info.pds_tri_merge_disable = true; 3714 } 3715 3716 merge_word |= ppp_state->pds.size_info2 & ~mask; 3717 3718 if (merge_word != ppp_state->pds.size_info2) { 3719 ppp_state->pds.size_info2 = merge_word; 3720 emit_state->pds_fragment_stateptr0 = true; 3721 } 3722} 3723 3724static void 3725pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer, 3726 struct pvr_sub_cmd_gfx *const sub_cmd) 3727{ 3728 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 3729 const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state = 3730 &state->gfx_pipeline->fragment_shader_state.descriptor_state; 3731 const struct pvr_pds_upload *pds_coeff_program = 3732 &state->gfx_pipeline->fragment_shader_state.pds_coeff_program; 3733 const struct pvr_pipeline_stage_state *fragment_state = 3734 &state->gfx_pipeline->fragment_shader_state.stage_state; 3735 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; 3736 struct pvr_emit_state *const emit_state = &state->emit_state; 3737 struct pvr_ppp_state *const ppp_state = &state->ppp_state; 3738 3739 const uint32_t pds_uniform_size = 3740 DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords, 3741 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE)); 3742 3743 const uint32_t pds_varying_state_size = 3744 DIV_ROUND_UP(pds_coeff_program->data_size, 3745 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE)); 3746 3747 const uint32_t usc_varying_size = 3748 DIV_ROUND_UP(fragment_state->coefficient_size, 3749 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE)); 3750 3751 const uint32_t pds_temp_size = 3752 DIV_ROUND_UP(fragment_state->temps_count, 3753 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE)); 3754 3755 const uint32_t usc_shared_size = 3756 DIV_ROUND_UP(fragment_state->const_shared_reg_count, 3757 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE)); 3758 3759 const uint32_t max_tiles_in_flight 
= 3760 pvr_calc_fscommon_size_and_tiles_in_flight( 3761 pdevice, 3762 usc_shared_size * 3763 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE), 3764 1); 3765 uint32_t size_info_mask; 3766 uint32_t size_info2; 3767 3768 if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight) 3769 sub_cmd->max_tiles_in_flight = max_tiles_in_flight; 3770 3771 pvr_csb_pack (&ppp_state->pds.pixel_shader_base, 3772 TA_STATE_PDS_SHADERBASE, 3773 shader_base) { 3774 const struct pvr_pds_upload *const pds_upload = 3775 &state->gfx_pipeline->fragment_shader_state.pds_fragment_program; 3776 3777 shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset); 3778 } 3779 3780 if (descriptor_shader_state->pds_code.pvr_bo) { 3781 pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base, 3782 TA_STATE_PDS_TEXUNICODEBASE, 3783 tex_base) { 3784 tex_base.addr = 3785 PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset); 3786 } 3787 } else { 3788 ppp_state->pds.texture_uniform_code_base = 0U; 3789 } 3790 3791 pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) { 3792 info1.pds_uniformsize = pds_uniform_size; 3793 info1.pds_texturestatesize = 0U; 3794 info1.pds_varyingsize = pds_varying_state_size; 3795 info1.usc_varyingsize = usc_varying_size; 3796 info1.pds_tempsize = pds_temp_size; 3797 } 3798 3799 pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) { 3800 mask.pds_tri_merge_disable = true; 3801 } 3802 3803 ppp_state->pds.size_info2 &= size_info_mask; 3804 3805 pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) { 3806 info2.usc_sharedsize = usc_shared_size; 3807 } 3808 3809 ppp_state->pds.size_info2 |= size_info2; 3810 3811 if (pds_coeff_program->pvr_bo) { 3812 state->emit_state.pds_fragment_stateptr1 = true; 3813 3814 pvr_csb_pack (&ppp_state->pds.varying_base, 3815 TA_STATE_PDS_VARYINGBASE, 3816 base) { 3817 base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset); 3818 } 3819 } else { 3820 ppp_state->pds.varying_base = 0U; 3821 } 3822 3823 pvr_csb_pack (&ppp_state->pds.uniform_state_data_base, 3824 TA_STATE_PDS_UNIFORMDATABASE, 3825 base) { 3826 base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset); 3827 } 3828 3829 emit_state->pds_fragment_stateptr0 = true; 3830 emit_state->pds_fragment_stateptr3 = true; 3831} 3832 3833static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer) 3834{ 3835 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 3836 struct pvr_emit_state *const emit_state = &state->emit_state; 3837 struct pvr_ppp_state *const ppp_state = &state->ppp_state; 3838 3839 if (ppp_state->viewport_count != state->dynamic.common.viewport.count) { 3840 ppp_state->viewport_count = state->dynamic.common.viewport.count; 3841 emit_state->viewport = true; 3842 } 3843 3844 if (state->gfx_pipeline->raster_state.discard_enable) { 3845 /* We don't want to emit any viewport data as it'll just get thrown 3846 * away. It's after the previous condition because we still want to 3847 * stash the viewport_count as it's our trigger for when 3848 * rasterizer discard gets disabled. 
3849 */ 3850 emit_state->viewport = false; 3851 return; 3852 } 3853 3854 for (uint32_t i = 0; i < ppp_state->viewport_count; i++) { 3855 VkViewport *viewport = &state->dynamic.common.viewport.viewports[i]; 3856 uint32_t x_scale = fui(viewport->width * 0.5f); 3857 uint32_t y_scale = fui(viewport->height * 0.5f); 3858 uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth); 3859 uint32_t x_center = fui(viewport->x + viewport->width * 0.5f); 3860 uint32_t y_center = fui(viewport->y + viewport->height * 0.5f); 3861 uint32_t z_center = fui(viewport->minDepth); 3862 3863 if (ppp_state->viewports[i].a0 != x_center || 3864 ppp_state->viewports[i].m0 != x_scale || 3865 ppp_state->viewports[i].a1 != y_center || 3866 ppp_state->viewports[i].m1 != y_scale || 3867 ppp_state->viewports[i].a2 != z_center || 3868 ppp_state->viewports[i].m2 != z_scale) { 3869 ppp_state->viewports[i].a0 = x_center; 3870 ppp_state->viewports[i].m0 = x_scale; 3871 ppp_state->viewports[i].a1 = y_center; 3872 ppp_state->viewports[i].m1 = y_scale; 3873 ppp_state->viewports[i].a2 = z_center; 3874 ppp_state->viewports[i].m2 = z_scale; 3875 3876 emit_state->viewport = true; 3877 } 3878 } 3879} 3880 3881static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer) 3882{ 3883 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 3884 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; 3885 struct pvr_emit_state *const emit_state = &state->emit_state; 3886 struct pvr_ppp_state *const ppp_state = &state->ppp_state; 3887 uint32_t ppp_control; 3888 3889 pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) { 3890 const struct pvr_raster_state *raster_state = &gfx_pipeline->raster_state; 3891 VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology; 3892 control.drawclippededges = true; 3893 control.wclampen = true; 3894 3895 if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN) 3896 control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1); 3897 else 3898 control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0); 3899 3900 if (raster_state->depth_clamp_enable) 3901 control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR); 3902 else 3903 control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR); 3904 3905 /* +--- FrontIsCCW? 3906 * | +--- Cull Front? 
3907 * v v 3908 * 0|0 CULLMODE_CULL_CCW, 3909 * 0|1 CULLMODE_CULL_CW, 3910 * 1|0 CULLMODE_CULL_CW, 3911 * 1|1 CULLMODE_CULL_CCW, 3912 */ 3913 switch (raster_state->cull_mode) { 3914 case VK_CULL_MODE_BACK_BIT: 3915 case VK_CULL_MODE_FRONT_BIT: 3916 if ((raster_state->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^ 3917 (raster_state->cull_mode == VK_CULL_MODE_FRONT_BIT)) { 3918 control.cullmode = PVRX(TA_CULLMODE_CULL_CW); 3919 } else { 3920 control.cullmode = PVRX(TA_CULLMODE_CULL_CCW); 3921 } 3922 3923 break; 3924 3925 case VK_CULL_MODE_NONE: 3926 control.cullmode = PVRX(TA_CULLMODE_NO_CULLING); 3927 break; 3928 3929 default: 3930 unreachable("Unsupported cull mode!"); 3931 } 3932 } 3933 3934 if (ppp_control != ppp_state->ppp_control) { 3935 ppp_state->ppp_control = ppp_control; 3936 emit_state->ppp_control = true; 3937 } 3938} 3939 3940/* Largest valid PPP State update in words = 31 3941 * 1 - Header 3942 * 3 - Stream Out Config words 0, 1 and 2 3943 * 1 - PPP Control word 3944 * 3 - Varying Config words 0, 1 and 2 3945 * 1 - Output Select 3946 * 1 - WClamp 3947 * 6 - Viewport Transform words 3948 * 2 - Region Clip words 3949 * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3) 3950 * 4 - PDS State for fragment phase (PDSSTATEPTR0) 3951 * 6 - ISP Control Words 3952 */ 3953#define PVR_MAX_PPP_STATE_DWORDS 31 3954 3955static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer, 3956 struct pvr_sub_cmd_gfx *const sub_cmd) 3957{ 3958 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 3959 struct pvr_emit_state *const emit_state = &state->emit_state; 3960 struct pvr_ppp_state *const ppp_state = &state->ppp_state; 3961 struct pvr_csb *const control_stream = &sub_cmd->control_stream; 3962 uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS]; 3963 uint32_t ppp_state_words_count; 3964 uint32_t ppp_state_header; 3965 bool deferred_secondary; 3966 struct pvr_bo *pvr_bo; 3967 uint32_t *buffer_ptr; 3968 VkResult result; 3969 3970 buffer_ptr = ppp_state_words; 3971 3972 pvr_csb_pack (&ppp_state_header, TA_STATE_HEADER, header) { 3973 header.view_port_count = (ppp_state->viewport_count == 0) 3974 ? 0U 3975 : (ppp_state->viewport_count - 1); 3976 3977 /* Skip over header. */ 3978 buffer_ptr++; 3979 3980 /* Set ISP state. */ 3981 if (emit_state->isp) { 3982 header.pres_ispctl = true; 3983 *buffer_ptr++ = ppp_state->isp.control; 3984 header.pres_ispctl_fa = true; 3985 *buffer_ptr++ = ppp_state->isp.front_a; 3986 3987 if (emit_state->isp_fb) { 3988 header.pres_ispctl_fb = true; 3989 *buffer_ptr++ = ppp_state->isp.front_b; 3990 } 3991 3992 if (emit_state->isp_ba) { 3993 header.pres_ispctl_ba = true; 3994 *buffer_ptr++ = ppp_state->isp.back_a; 3995 } 3996 3997 if (emit_state->isp_bb) { 3998 header.pres_ispctl_bb = true; 3999 *buffer_ptr++ = ppp_state->isp.back_b; 4000 } 4001 } 4002 4003 /* Depth bias / scissor 4004 * If deferred_secondary is true then we do a separate state update 4005 * which gets patched in ExecuteDeferredCommandBuffer. 4006 */ 4007 /* TODO: Update above comment when we port ExecuteDeferredCommandBuffer. 
4008 */ 4009 deferred_secondary = 4010 cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && 4011 cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; 4012 4013 if (emit_state->isp_dbsc && !deferred_secondary) { 4014 header.pres_ispctl_dbsc = true; 4015 4016 pvr_csb_pack (buffer_ptr++, TA_STATE_ISPDBSC, ispdbsc) { 4017 ispdbsc.dbindex = 4018 ppp_state->depthbias_scissor_indices.depthbias_index; 4019 ispdbsc.scindex = 4020 ppp_state->depthbias_scissor_indices.scissor_index; 4021 } 4022 } 4023 4024 /* PDS state. */ 4025 if (emit_state->pds_fragment_stateptr0) { 4026 header.pres_pds_state_ptr0 = true; 4027 4028 *buffer_ptr++ = ppp_state->pds.pixel_shader_base; 4029 *buffer_ptr++ = ppp_state->pds.texture_uniform_code_base; 4030 *buffer_ptr++ = ppp_state->pds.size_info1; 4031 *buffer_ptr++ = ppp_state->pds.size_info2; 4032 } 4033 4034 if (emit_state->pds_fragment_stateptr1) { 4035 header.pres_pds_state_ptr1 = true; 4036 *buffer_ptr++ = ppp_state->pds.varying_base; 4037 } 4038 4039 /* We don't use the pds_fragment_stateptr2 (texture state programs) 4040 * control word, but this doesn't mean we need to set it to 0. This is 4041 * because the hardware runs the texture state program only when the 4042 * pds_texture state field of PDS_SIZEINFO1 is non-zero. 4043 */ 4044 4045 if (emit_state->pds_fragment_stateptr3) { 4046 header.pres_pds_state_ptr3 = true; 4047 *buffer_ptr++ = ppp_state->pds.uniform_state_data_base; 4048 } 4049 4050 /* Region clip. */ 4051 if (emit_state->region_clip) { 4052 header.pres_region_clip = true; 4053 *buffer_ptr++ = ppp_state->region_clipping.word0; 4054 *buffer_ptr++ = ppp_state->region_clipping.word1; 4055 } 4056 4057 /* Viewport. */ 4058 if (emit_state->viewport) { 4059 const uint32_t viewports = MAX2(1, ppp_state->viewport_count); 4060 4061 header.pres_viewport = true; 4062 for (uint32_t i = 0; i < viewports; i++) { 4063 *buffer_ptr++ = ppp_state->viewports[i].a0; 4064 *buffer_ptr++ = ppp_state->viewports[i].m0; 4065 *buffer_ptr++ = ppp_state->viewports[i].a1; 4066 *buffer_ptr++ = ppp_state->viewports[i].m1; 4067 *buffer_ptr++ = ppp_state->viewports[i].a2; 4068 *buffer_ptr++ = ppp_state->viewports[i].m2; 4069 } 4070 } 4071 4072 /* W clamp. */ 4073 if (emit_state->wclamp) { 4074 const float wclamp = 0.00001f; 4075 4076 header.pres_wclamp = true; 4077 *buffer_ptr++ = fui(wclamp); 4078 } 4079 4080 /* Output selects. */ 4081 if (emit_state->output_selects) { 4082 header.pres_outselects = true; 4083 *buffer_ptr++ = ppp_state->output_selects; 4084 } 4085 4086 /* Varying words. */ 4087 if (emit_state->varying_word0) { 4088 header.pres_varying_word0 = true; 4089 *buffer_ptr++ = ppp_state->varying_word[0]; 4090 } 4091 4092 if (emit_state->varying_word1) { 4093 header.pres_varying_word1 = true; 4094 *buffer_ptr++ = ppp_state->varying_word[1]; 4095 } 4096 4097 if (emit_state->varying_word2) { 4098 /* We only emit this on the first draw of a render job to prevent us 4099 * from inheriting a non-zero value set elsewhere. 4100 */ 4101 header.pres_varying_word2 = true; 4102 *buffer_ptr++ = 0; 4103 } 4104 4105 /* PPP control. */ 4106 if (emit_state->ppp_control) { 4107 header.pres_ppp_ctrl = true; 4108 *buffer_ptr++ = ppp_state->ppp_control; 4109 } 4110 4111 if (emit_state->stream_out) { 4112 /* We only emit this on the first draw of a render job to prevent us 4113 * from inheriting a non-zero value set elsewhere. 
4114 */ 4115 header.pres_stream_out_size = true; 4116 *buffer_ptr++ = 0; 4117 } 4118 } 4119 4120 if (!ppp_state_header) 4121 return VK_SUCCESS; 4122 4123 ppp_state_words_count = buffer_ptr - ppp_state_words; 4124 ppp_state_words[0] = ppp_state_header; 4125 4126 result = pvr_cmd_buffer_alloc_mem(cmd_buffer, 4127 cmd_buffer->device->heaps.general_heap, 4128 ppp_state_words_count * sizeof(uint32_t), 4129 PVR_BO_ALLOC_FLAG_CPU_MAPPED, 4130 &pvr_bo); 4131 if (result != VK_SUCCESS) 4132 return result; 4133 4134 memcpy(pvr_bo->bo->map, 4135 ppp_state_words, 4136 ppp_state_words_count * sizeof(uint32_t)); 4137 4138 /* Write the VDM state update into the VDM control stream. */ 4139 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) { 4140 state0.word_count = ppp_state_words_count; 4141 state0.addrmsb = pvr_bo->vma->dev_addr; 4142 } 4143 4144 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) { 4145 state1.addrlsb = pvr_bo->vma->dev_addr; 4146 } 4147 4148 if (emit_state->isp_dbsc && 4149 cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { 4150 pvr_finishme("Unimplemented path!!"); 4151 } 4152 4153 state->emit_state_bits = 0; 4154 4155 return VK_SUCCESS; 4156} 4157 4158static VkResult 4159pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer, 4160 struct pvr_sub_cmd_gfx *const sub_cmd) 4161{ 4162 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 4163 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; 4164 const bool dirty_stencil = state->dirty.compare_mask || 4165 state->dirty.write_mask || state->dirty.reference; 4166 VkResult result; 4167 4168 if (!(dirty_stencil || state->dirty.depth_bias || 4169 state->dirty.fragment_descriptors || state->dirty.line_width || 4170 state->dirty.gfx_pipeline_binding || state->dirty.scissor || 4171 state->dirty.userpass_spawn || state->dirty.viewport || 4172 state->emit_state_bits)) { 4173 return VK_SUCCESS; 4174 } 4175 4176 if (state->dirty.gfx_pipeline_binding) { 4177 struct PVRX(TA_STATE_ISPA) ispa; 4178 4179 pvr_setup_output_select(cmd_buffer); 4180 pvr_setup_isp_faces_and_control(cmd_buffer, &ispa); 4181 pvr_setup_triangle_merging_flag(cmd_buffer, &ispa); 4182 } else if (dirty_stencil || state->dirty.line_width || 4183 state->dirty.userpass_spawn) { 4184 pvr_setup_isp_faces_and_control(cmd_buffer, NULL); 4185 } 4186 4187 if (!gfx_pipeline->raster_state.discard_enable && 4188 state->dirty.fragment_descriptors && 4189 gfx_pipeline->fragment_shader_state.bo) { 4190 pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd); 4191 } 4192 4193 pvr_setup_isp_depth_bias_scissor_state(cmd_buffer); 4194 4195 if (state->dirty.viewport) 4196 pvr_setup_viewport(cmd_buffer); 4197 4198 pvr_setup_ppp_control(cmd_buffer); 4199 4200 if (gfx_pipeline->raster_state.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) { 4201 /* FIXME: Port SetNegativeViewport(). */ 4202 } 4203 4204 result = pvr_emit_ppp_state(cmd_buffer, sub_cmd); 4205 if (result != VK_SUCCESS) 4206 return result; 4207 4208 return VK_SUCCESS; 4209} 4210 4211static void 4212pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info, 4213 const uint32_t vs_output_size, 4214 const bool raster_enable, 4215 uint32_t *const cam_size_out, 4216 uint32_t *const vs_max_instances_out) 4217{ 4218 /* First work out the size of a vertex in the UVS and multiply by 4 for 4219 * column ordering. 
4220 */ 4221 const uint32_t uvs_vertex_vector_size_in_dwords = 4222 (vs_output_size + 1U + raster_enable * 4U) * 4U; 4223 const uint32_t vdm_cam_size = 4224 PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U); 4225 4226 /* This is a proxy for 8XE. */ 4227 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) && 4228 vdm_cam_size < 96U) { 4229 /* Comparisons are based on size including scratch per vertex vector. */ 4230 if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) { 4231 *cam_size_out = MIN2(31U, vdm_cam_size - 1U); 4232 *vs_max_instances_out = 16U; 4233 } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) { 4234 *cam_size_out = 15U; 4235 *vs_max_instances_out = 16U; 4236 } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) { 4237 *cam_size_out = 11U; 4238 *vs_max_instances_out = 12U; 4239 } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) { 4240 *cam_size_out = 7U; 4241 *vs_max_instances_out = 8U; 4242 } else if (PVR_HAS_FEATURE(dev_info, 4243 simple_internal_parameter_format_v2) || 4244 uvs_vertex_vector_size_in_dwords < (64U * 4U)) { 4245 *cam_size_out = 7U; 4246 *vs_max_instances_out = 4U; 4247 } else { 4248 *cam_size_out = 3U; 4249 *vs_max_instances_out = 2U; 4250 } 4251 } else { 4252 /* Comparisons are based on size including scratch per vertex vector. */ 4253 if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) { 4254 /* output size <= 27 + 5 scratch. */ 4255 *cam_size_out = MIN2(95U, vdm_cam_size - 1U); 4256 *vs_max_instances_out = 0U; 4257 } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) { 4258 /* output size <= 43 + 5 scratch */ 4259 *cam_size_out = 63U; 4260 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U) 4261 *vs_max_instances_out = 16U; 4262 else 4263 *vs_max_instances_out = 0U; 4264 } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) { 4265 /* output size <= 59 + 5 scratch. */ 4266 *cam_size_out = 31U; 4267 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U) 4268 *vs_max_instances_out = 16U; 4269 else 4270 *vs_max_instances_out = 0U; 4271 } else { 4272 *cam_size_out = 15U; 4273 *vs_max_instances_out = 16U; 4274 } 4275 } 4276} 4277 4278static void 4279pvr_emit_dirty_vdm_state(const struct pvr_cmd_buffer *const cmd_buffer, 4280 struct pvr_sub_cmd_gfx *const sub_cmd) 4281{ 4282 /* FIXME: Assume all state is dirty for the moment. */ 4283 struct pvr_device_info *const dev_info = 4284 &cmd_buffer->device->pdevice->dev_info; 4285 ASSERTED const uint32_t max_user_vertex_output_components = 4286 pvr_get_max_user_vertex_output_components(dev_info); 4287 struct PVRX(VDMCTRL_VDM_STATE0) 4288 header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) }; 4289 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 4290 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; 4291 struct pvr_csb *const csb = &sub_cmd->control_stream; 4292 uint32_t vs_output_size; 4293 uint32_t max_instances; 4294 uint32_t cam_size; 4295 4296 assert(gfx_pipeline); 4297 4298 /* CAM Calculations and HW state take vertex size aligned to DWORDS. 
*/ 4299 vs_output_size = 4300 DIV_ROUND_UP(gfx_pipeline->vertex_shader_state.vertex_output_size, 4301 PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE)); 4302 4303 assert(vs_output_size <= max_user_vertex_output_components); 4304 4305 pvr_calculate_vertex_cam_size(dev_info, 4306 vs_output_size, 4307 true, 4308 &cam_size, 4309 &max_instances); 4310 4311 pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) { 4312 state0.cam_size = cam_size; 4313 4314 if (gfx_pipeline->input_asm_state.primitive_restart) { 4315 state0.cut_index_enable = true; 4316 state0.cut_index_present = true; 4317 } 4318 4319 switch (gfx_pipeline->input_asm_state.topology) { 4320 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: 4321 state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1); 4322 break; 4323 4324 default: 4325 state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0); 4326 break; 4327 } 4328 4329 /* If we've bound a different vertex buffer, or this draw-call requires 4330 * a different PDS attrib data-section from the last draw call (changed 4331 * base_instance) then we need to specify a new data section. This is 4332 * also the case if we've switched pipeline or attrib program as the 4333 * data-section layout will be different. 4334 */ 4335 state0.vs_data_addr_present = 4336 state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings || 4337 state->dirty.draw_base_instance || state->dirty.draw_variant; 4338 4339 /* Need to specify new PDS Attrib program if we've bound a different 4340 * pipeline or we needed a different PDS Attrib variant for this 4341 * draw-call. 4342 */ 4343 state0.vs_other_present = state->dirty.gfx_pipeline_binding || 4344 state->dirty.draw_variant; 4345 4346 /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when 4347 * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because 4348 * Vulkan doesn't support stream output and the vertex position is 4349 * always emitted to the UVB. 4350 */ 4351 state0.uvs_scratch_size_select = 4352 PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE); 4353 4354 header = state0; 4355 } 4356 4357 if (header.cut_index_present) { 4358 pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) { 4359 switch (state->index_buffer_binding.type) { 4360 case VK_INDEX_TYPE_UINT32: 4361 /* FIXME: Defines for these? These seem to come from the Vulkan 4362 * spec. for VkPipelineInputAssemblyStateCreateInfo 4363 * primitiveRestartEnable. 
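             * The values below are the primitive restart indices the Vulkan
             * spec mandates, i.e. the maximum value representable in the
             * bound index type: 0xFFFFFFFF for VK_INDEX_TYPE_UINT32 and
             * 0xFFFF for VK_INDEX_TYPE_UINT16.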
4364 */ 4365 state1.cut_index = 0xFFFFFFFF; 4366 break; 4367 4368 case VK_INDEX_TYPE_UINT16: 4369 state1.cut_index = 0xFFFF; 4370 break; 4371 4372 default: 4373 unreachable(!"Invalid index type"); 4374 } 4375 } 4376 } 4377 4378 if (header.vs_data_addr_present) { 4379 pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) { 4380 state2.vs_pds_data_base_addr = 4381 PVR_DEV_ADDR(state->pds_vertex_attrib_offset); 4382 } 4383 } 4384 4385 if (header.vs_other_present) { 4386 const uint32_t usc_unified_store_size_in_bytes = 4387 gfx_pipeline->vertex_shader_state.vertex_input_size << 2; 4388 4389 pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) { 4390 state3.vs_pds_code_base_addr = 4391 PVR_DEV_ADDR(state->pds_shader.code_offset); 4392 } 4393 4394 pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) { 4395 state4.vs_output_size = vs_output_size; 4396 } 4397 4398 pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) { 4399 state5.vs_max_instances = max_instances; 4400 state5.vs_usc_common_size = 0U; 4401 state5.vs_usc_unified_size = DIV_ROUND_UP( 4402 usc_unified_store_size_in_bytes, 4403 PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE)); 4404 state5.vs_pds_temp_size = 4405 DIV_ROUND_UP(state->pds_shader.info->temps_required << 2, 4406 PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE)); 4407 state5.vs_pds_data_size = 4408 DIV_ROUND_UP(state->pds_shader.info->data_size_in_dwords << 2, 4409 PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE)); 4410 } 4411 } 4412} 4413 4414static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer) 4415{ 4416 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 4417 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline; 4418 const struct pvr_pipeline_stage_state *const fragment_state = 4419 &gfx_pipeline->fragment_shader_state.stage_state; 4420 struct pvr_sub_cmd_gfx *sub_cmd; 4421 bool fstencil_writemask_zero; 4422 bool bstencil_writemask_zero; 4423 bool push_descriptors_dirty; 4424 bool fstencil_keep; 4425 bool bstencil_keep; 4426 VkResult result; 4427 4428 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS); 4429 4430 sub_cmd = &state->current_sub_cmd->gfx; 4431 sub_cmd->empty_cmd = false; 4432 4433 /* Determine pipeline depth/stencil usage. If a pipeline uses depth or 4434 * stencil testing, those attachments are using their loaded values, and 4435 * the loadOps cannot be optimized out. 4436 */ 4437 /* Pipeline uses depth testing. */ 4438 if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED && 4439 gfx_pipeline->depth_compare_op != VK_COMPARE_OP_ALWAYS) { 4440 sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED; 4441 } 4442 4443 /* Pipeline uses stencil testing. 
   if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
       (gfx_pipeline->stencil_front.compare_op != VK_COMPARE_OP_ALWAYS ||
        gfx_pipeline->stencil_back.compare_op != VK_COMPARE_OP_ALWAYS)) {
      sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
   }

   if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
                       compute_overlap)) {
      uint32_t coefficient_size =
         DIV_ROUND_UP(fragment_state->coefficient_size,
                      PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));

      if (coefficient_size >
          PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE))
         sub_cmd->disable_compute_overlap = true;
   }

   sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops;
   sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects;
   sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw;
   sub_cmd->vertex_uses_texture_rw |=
      gfx_pipeline->vertex_shader_state.stage_state.uses_texture_rw;

   fstencil_keep =
      (gfx_pipeline->stencil_front.fail_op == VK_STENCIL_OP_KEEP) &&
      (gfx_pipeline->stencil_front.pass_op == VK_STENCIL_OP_KEEP);
   bstencil_keep = (gfx_pipeline->stencil_back.fail_op == VK_STENCIL_OP_KEEP) &&
                   (gfx_pipeline->stencil_back.pass_op == VK_STENCIL_OP_KEEP);
   fstencil_writemask_zero = (state->dynamic.common.write_mask.front == 0);
   bstencil_writemask_zero = (state->dynamic.common.write_mask.back == 0);

   /* Set the stencil modified flag unless the bound state cannot write to the
    * stencil buffer, i.e. unless both faces use KEEP for fail_op/pass_op or
    * both faces have a write_mask of zero.
    */
   if (!(fstencil_keep && bstencil_keep) &&
       !(fstencil_writemask_zero && bstencil_writemask_zero)) {
      sub_cmd->modifies_stencil = true;
   }

   /* Set depth modified flag if depth write is enabled. */
   if (!gfx_pipeline->depth_write_disable)
      sub_cmd->modifies_depth = true;

   /* If either the data or code changes for pds vertex attribs, regenerate the
    * data segment.
4490 */ 4491 if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding || 4492 state->dirty.draw_variant || state->dirty.draw_base_instance) { 4493 enum pvr_pds_vertex_attrib_program_type prog_type; 4494 const struct pvr_pds_attrib_program *program; 4495 4496 if (state->draw_state.draw_indirect) 4497 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT; 4498 else if (state->draw_state.base_instance) 4499 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE; 4500 else 4501 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC; 4502 4503 program = 4504 &gfx_pipeline->vertex_shader_state.pds_attrib_programs[prog_type]; 4505 state->pds_shader.info = &program->info; 4506 state->pds_shader.code_offset = program->program.code_offset; 4507 4508 state->max_shared_regs = 4509 MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline)); 4510 4511 pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline); 4512 } 4513 4514 /* TODO: Check for dirty push constants */ 4515 4516 pvr_validate_push_descriptors(cmd_buffer, &push_descriptors_dirty); 4517 4518 state->dirty.vertex_descriptors = push_descriptors_dirty || 4519 state->dirty.gfx_pipeline_binding; 4520 state->dirty.fragment_descriptors = state->dirty.vertex_descriptors; 4521 4522 if (state->dirty.fragment_descriptors) { 4523 result = pvr_setup_descriptor_mappings( 4524 cmd_buffer, 4525 PVR_STAGE_ALLOCATION_FRAGMENT, 4526 &state->gfx_pipeline->fragment_shader_state.descriptor_state, 4527 NULL, 4528 &state->pds_fragment_descriptor_data_offset); 4529 if (result != VK_SUCCESS) { 4530 mesa_loge("Could not setup fragment descriptor mappings."); 4531 return result; 4532 } 4533 } 4534 4535 if (state->dirty.vertex_descriptors) { 4536 uint32_t pds_vertex_descriptor_data_offset; 4537 4538 result = pvr_setup_descriptor_mappings( 4539 cmd_buffer, 4540 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY, 4541 &state->gfx_pipeline->vertex_shader_state.descriptor_state, 4542 NULL, 4543 &pds_vertex_descriptor_data_offset); 4544 if (result != VK_SUCCESS) { 4545 mesa_loge("Could not setup vertex descriptor mappings."); 4546 return result; 4547 } 4548 4549 pvr_emit_dirty_pds_state(cmd_buffer, 4550 sub_cmd, 4551 pds_vertex_descriptor_data_offset); 4552 } 4553 4554 pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd); 4555 pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd); 4556 4557 state->dirty.gfx_desc_dirty = false; 4558 state->dirty.blend_constants = false; 4559 state->dirty.compare_mask = false; 4560 state->dirty.depth_bias = false; 4561 state->dirty.draw_base_instance = false; 4562 state->dirty.draw_variant = false; 4563 state->dirty.fragment_descriptors = false; 4564 state->dirty.line_width = false; 4565 state->dirty.gfx_pipeline_binding = false; 4566 state->dirty.reference = false; 4567 state->dirty.scissor = false; 4568 state->dirty.userpass_spawn = false; 4569 state->dirty.vertex_bindings = false; 4570 state->dirty.viewport = false; 4571 state->dirty.write_mask = false; 4572 4573 return VK_SUCCESS; 4574} 4575 4576static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology) 4577{ 4578 switch (topology) { 4579 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: 4580 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST); 4581 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: 4582 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST); 4583 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: 4584 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP); 4585 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: 4586 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST); 4587 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: 4588 return 
PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP); 4589 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: 4590 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN); 4591 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: 4592 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ); 4593 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: 4594 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ); 4595 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: 4596 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ); 4597 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: 4598 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ); 4599 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: 4600 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST); 4601 default: 4602 unreachable("Undefined primitive topology"); 4603 } 4604} 4605 4606static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer, 4607 struct pvr_sub_cmd_gfx *const sub_cmd, 4608 VkPrimitiveTopology topology, 4609 uint32_t first_vertex, 4610 uint32_t vertex_count, 4611 uint32_t first_index, 4612 uint32_t index_count, 4613 uint32_t instance_count) 4614{ 4615 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 4616 struct pvr_csb *const csb = &sub_cmd->control_stream; 4617 struct PVRX(VDMCTRL_INDEX_LIST0) 4618 list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) }; 4619 pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID; 4620 unsigned int index_stride = 0; 4621 4622 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) { 4623 const bool vertex_shader_has_side_effects = 4624 cmd_buffer->state.gfx_pipeline->vertex_shader_state.stage_state 4625 .has_side_effects; 4626 4627 list0.primitive_topology = pvr_get_hw_primitive_topology(topology); 4628 4629 /* First instance is not handled in the VDM state, it's implemented as 4630 * an addition in the PDS vertex fetch. 
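       * For example, a non-zero firstInstance only affects the PDS vertex
       * attribute data section (the base-instance program variant selected in
       * pvr_validate_draw_state); no word emitted here changes.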
4631 */ 4632 list0.index_count_present = true; 4633 4634 if (instance_count > 1) 4635 list0.index_instance_count_present = true; 4636 4637 if (first_vertex != 0) 4638 list0.index_offset_present = true; 4639 4640 if (state->draw_state.draw_indexed) { 4641 struct pvr_buffer *buffer = state->index_buffer_binding.buffer; 4642 4643 switch (state->index_buffer_binding.type) { 4644 case VK_INDEX_TYPE_UINT32: 4645 list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32); 4646 index_stride = 4; 4647 break; 4648 4649 case VK_INDEX_TYPE_UINT16: 4650 list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16); 4651 index_stride = 2; 4652 break; 4653 4654 default: 4655 unreachable("Invalid index type"); 4656 } 4657 4658 list0.index_addr_present = true; 4659 index_buffer_addr = PVR_DEV_ADDR_OFFSET( 4660 buffer->dev_addr, 4661 state->index_buffer_binding.offset + first_index * index_stride); 4662 list0.index_base_addrmsb = index_buffer_addr; 4663 } 4664 4665 list0.degen_cull_enable = 4666 PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info, 4667 vdm_degenerate_culling) && 4668 !vertex_shader_has_side_effects; 4669 4670 list_hdr = list0; 4671 } 4672 4673 if (list_hdr.index_addr_present) { 4674 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) { 4675 list1.index_base_addrlsb = index_buffer_addr; 4676 } 4677 } 4678 4679 if (list_hdr.index_count_present) { 4680 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) { 4681 list2.index_count = vertex_count | index_count; 4682 } 4683 } 4684 4685 if (list_hdr.index_instance_count_present) { 4686 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) { 4687 list3.instance_count = instance_count - 1; 4688 } 4689 } 4690 4691 if (list_hdr.index_offset_present) { 4692 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) { 4693 list4.index_offset = first_vertex; 4694 } 4695 } 4696 4697 /* TODO: See if we need list_words[5-9]. */ 4698} 4699 4700void pvr_CmdDraw(VkCommandBuffer commandBuffer, 4701 uint32_t vertexCount, 4702 uint32_t instanceCount, 4703 uint32_t firstVertex, 4704 uint32_t firstInstance) 4705{ 4706 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 4707 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 4708 struct pvr_cmd_buffer_draw_state draw_state; 4709 VkResult result; 4710 4711 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 4712 4713 draw_state.base_vertex = firstVertex; 4714 draw_state.base_instance = firstInstance; 4715 draw_state.draw_indirect = false; 4716 draw_state.draw_indexed = false; 4717 pvr_update_draw_state(state, &draw_state); 4718 4719 result = pvr_validate_draw_state(cmd_buffer); 4720 if (result != VK_SUCCESS) 4721 return; 4722 4723 /* Write the VDM control stream for the primitive. 
*/ 4724 pvr_emit_vdm_index_list(cmd_buffer, 4725 &state->current_sub_cmd->gfx, 4726 state->gfx_pipeline->input_asm_state.topology, 4727 firstVertex, 4728 vertexCount, 4729 0U, 4730 0U, 4731 instanceCount); 4732} 4733 4734void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer, 4735 uint32_t indexCount, 4736 uint32_t instanceCount, 4737 uint32_t firstIndex, 4738 int32_t vertexOffset, 4739 uint32_t firstInstance) 4740{ 4741 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 4742 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 4743 struct pvr_cmd_buffer_draw_state draw_state; 4744 VkResult result; 4745 4746 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 4747 4748 draw_state.base_vertex = vertexOffset; 4749 draw_state.base_instance = firstInstance; 4750 draw_state.draw_indirect = false; 4751 draw_state.draw_indexed = true; 4752 pvr_update_draw_state(state, &draw_state); 4753 4754 result = pvr_validate_draw_state(cmd_buffer); 4755 if (result != VK_SUCCESS) 4756 return; 4757 4758 /* Write the VDM control stream for the primitive. */ 4759 pvr_emit_vdm_index_list(cmd_buffer, 4760 &state->current_sub_cmd->gfx, 4761 state->gfx_pipeline->input_asm_state.topology, 4762 vertexOffset, 4763 0, 4764 firstIndex, 4765 indexCount, 4766 instanceCount); 4767} 4768 4769void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, 4770 VkBuffer _buffer, 4771 VkDeviceSize offset, 4772 uint32_t drawCount, 4773 uint32_t stride) 4774{ 4775 assert(!"Unimplemented"); 4776} 4777 4778void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer, 4779 VkBuffer _buffer, 4780 VkDeviceSize offset, 4781 uint32_t drawCount, 4782 uint32_t stride) 4783{ 4784 assert(!"Unimplemented"); 4785} 4786 4787static VkResult 4788pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer) 4789{ 4790 pvr_finishme("Add attachment resolve support!"); 4791 return pvr_cmd_buffer_end_sub_cmd(cmd_buffer); 4792} 4793 4794void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer, 4795 const VkSubpassEndInfo *pSubpassEndInfo) 4796{ 4797 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 4798 struct pvr_cmd_buffer_state *state = &cmd_buffer->state; 4799 struct pvr_image_view **attachments; 4800 VkClearValue *clear_values; 4801 VkResult result; 4802 4803 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 4804 4805 assert(state->render_pass_info.pass); 4806 assert(state->render_pass_info.framebuffer); 4807 4808 /* TODO: Investigate why pvr_cmd_buffer_end_sub_cmd/EndSubCommand is called 4809 * twice in this path, one here and one from 4810 * pvr_resolve_unemitted_resolve_attachments. 4811 */ 4812 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer); 4813 if (result != VK_SUCCESS) 4814 return; 4815 4816 result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer); 4817 if (result != VK_SUCCESS) 4818 return; 4819 4820 /* Save the required fields before clearing render_pass_info struct. 
*/ 4821 attachments = state->render_pass_info.attachments; 4822 clear_values = state->render_pass_info.clear_values; 4823 4824 memset(&state->render_pass_info, 0, sizeof(state->render_pass_info)); 4825 4826 state->render_pass_info.attachments = attachments; 4827 state->render_pass_info.clear_values = clear_values; 4828} 4829 4830void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer, 4831 uint32_t commandBufferCount, 4832 const VkCommandBuffer *pCommandBuffers) 4833{ 4834 assert(!"Unimplemented"); 4835} 4836 4837void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer, 4838 const VkSubpassBeginInfo *pSubpassBeginInfo, 4839 const VkSubpassEndInfo *pSubpassEndInfo) 4840{ 4841 assert(!"Unimplemented"); 4842} 4843 4844/* This is just enough to handle vkCmdPipelineBarrier(). 4845 * TODO: Complete? 4846 */ 4847void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, 4848 const VkDependencyInfo *pDependencyInfo) 4849{ 4850 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); 4851 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; 4852 const struct pvr_render_pass *const render_pass = 4853 state->render_pass_info.pass; 4854 VkPipelineStageFlags vk_src_stage_mask = 0U; 4855 VkPipelineStageFlags vk_dst_stage_mask = 0U; 4856 uint32_t required_stage_mask = 0U; 4857 uint32_t src_stage_mask; 4858 uint32_t dst_stage_mask; 4859 bool is_barrier_needed; 4860 4861 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); 4862 4863 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) { 4864 vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; 4865 vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask; 4866 } 4867 4868 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) { 4869 vk_src_stage_mask |= 4870 pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; 4871 vk_dst_stage_mask |= 4872 pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask; 4873 } 4874 4875 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) { 4876 vk_src_stage_mask |= 4877 pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; 4878 vk_dst_stage_mask |= 4879 pDependencyInfo->pImageMemoryBarriers[i].dstStageMask; 4880 } 4881 4882 src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask); 4883 dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask); 4884 4885 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) { 4886 if (!(dst_stage_mask & BITFIELD_BIT(stage))) 4887 continue; 4888 4889 required_stage_mask |= state->barriers_needed[stage]; 4890 } 4891 4892 src_stage_mask &= required_stage_mask; 4893 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) { 4894 if (!(dst_stage_mask & BITFIELD_BIT(stage))) 4895 continue; 4896 4897 state->barriers_needed[stage] &= ~src_stage_mask; 4898 } 4899 4900 if (src_stage_mask == 0 || dst_stage_mask == 0) { 4901 is_barrier_needed = false; 4902 } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT && 4903 dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) { 4904 /* This is implicit so no need to barrier. 
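       * The fragment phase of a render job cannot start before the geometry
       * phase that feeds it, so a GEOM -> FRAG dependency needs no extra
       * synchronization from us here.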
       */
      is_barrier_needed = false;
   } else if (src_stage_mask == dst_stage_mask &&
              util_bitcount(src_stage_mask) == 1) {
      switch (src_stage_mask) {
      case PVR_PIPELINE_STAGE_FRAG_BIT:
         pvr_finishme("Handle fragment stage pipeline barrier.");
         is_barrier_needed = true;
         break;

      case PVR_PIPELINE_STAGE_COMPUTE_BIT: {
         struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;

         is_barrier_needed = false;

         if (!current_sub_cmd ||
             current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
            break;
         }

         /* Multiple dispatches can be merged into a single job. When back to
          * back dispatches have a sequential dependency (CDM -> CDM pipeline
          * barrier) we need to do the following.
          * - Dispatch a kernel which fences all previous memory writes and
          *   flushes the MADD cache.
          * - Issue a CDM fence which ensures all previous tasks emitted by
          *   the CDM are completed before starting anything new.
          */

         /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
          * for data.
          */
         pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);

         pvr_compute_generate_fence(cmd_buffer,
                                    &current_sub_cmd->compute,
                                    false);
         break;
      }

      default:
         is_barrier_needed = false;
         break;
      };
   } else {
      is_barrier_needed = true;
   }

   if (render_pass) {
      pvr_finishme("Insert mid fragment stage barrier if needed.");
   } else {
      if (is_barrier_needed)
         pvr_finishme("Insert barrier if needed.");
   }
}

void pvr_CmdResetEvent2KHR(VkCommandBuffer commandBuffer,
                           VkEvent _event,
                           VkPipelineStageFlags2 stageMask)
{
   assert(!"Unimplemented");
}

void pvr_CmdSetEvent2KHR(VkCommandBuffer commandBuffer,
                         VkEvent _event,
                         const VkDependencyInfo *pDependencyInfo)
{
   assert(!"Unimplemented");
}

void pvr_CmdWaitEvents2KHR(VkCommandBuffer commandBuffer,
                           uint32_t eventCount,
                           const VkEvent *pEvents,
                           const VkDependencyInfo *pDependencyInfos)
{
   assert(!"Unimplemented");
}

void pvr_CmdWriteTimestamp2KHR(VkCommandBuffer commandBuffer,
                               VkPipelineStageFlags2 stage,
                               VkQueryPool queryPool,
                               uint32_t query)
{
   unreachable("Timestamp queries are not supported.");
}

VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   VkResult result;

   /* From the Vulkan 1.0 spec:
    *
    *    CommandBuffer must be in the recording state.
    */
   assert(cmd_buffer->status == PVR_CMD_BUFFER_STATUS_RECORDING);

   if (state->status != VK_SUCCESS)
      return state->status;

   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      return result;

   cmd_buffer->status = PVR_CMD_BUFFER_STATUS_EXECUTABLE;

   return VK_SUCCESS;
}