/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * based in part on v3dv driver which is:
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "compiler/shader_enums.h"
#include "hwdef/rogue_hw_utils.h"
#include "nir/nir.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_hardcode.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_shader.h"
#include "pvr_types.h"
#include "rogue/rogue.h"
#include "rogue/rogue_build_data.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_util.h"

/*****************************************************************************
   PDS functions
*****************************************************************************/

/* If allocator == NULL, the internal one will be used. */
static VkResult pvr_pds_coeff_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   const uint32_t *fpu_iterators,
   uint32_t fpu_iterators_count,
   const uint32_t *destinations,
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_coeff_loading_program program = {
      .num_fpu_iterators = fpu_iterators_count,
   };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   assert(fpu_iterators_count < PVR_MAXIMUM_ITERATIONS);

   /* Get the size of the program and then allocate that much memory. */
   pvr_pds_coefficient_loading(&program, NULL, PDS_GENERATE_SIZES);

   staging_buffer_size =
      (program.code_size + program.data_size) * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: Should we save pointers when we redesign the pds gen api? */
   typed_memcpy(program.FPU_iterators,
                fpu_iterators,
                program.num_fpu_iterators);

   typed_memcpy(program.destination, destinations, program.num_fpu_iterators);

   /* Generate the program into the staging_buffer. */
   pvr_pds_coefficient_loading(&program,
                               staging_buffer,
                               PDS_GENERATE_CODEDATA_SEGMENTS);

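   /* The data segment is generated at the start of the staging buffer and the
    * code segment directly after it, matching the order in which the two
    * halves are handed to pvr_gpu_upload_pds() below.
    */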
   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[0],
                               program.data_size,
                               16,
                               &staging_buffer[program.data_size],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

/* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
/* If allocator == NULL, the internal one will be used. */
VkResult pvr_pds_fragment_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   const struct pvr_bo *fragment_shader_bo,
   uint32_t fragment_temp_count,
   enum rogue_msaa_mode msaa_mode,
   bool has_phase_rate_change,
   struct pvr_pds_upload *const pds_upload_out)
{
   const enum PVRX(PDSINST_DOUTU_SAMPLE_RATE)
      sample_rate = pvr_pdsinst_doutu_sample_rate_from_rogue(msaa_mode);
   struct pvr_pds_kickusc_program program = { 0 };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   /* FIXME: Should it be passing in the USC offset rather than address here?
    */
   /* Note this is not strictly required to be done before calculating the
    * staging_buffer_size in this particular case. It can also be done after
    * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
    */
   pvr_pds_setup_doutu(&program.usc_task_control,
                       fragment_shader_bo->vma->dev_addr.addr,
                       fragment_temp_count,
                       sample_rate,
                       has_phase_rate_change);

   pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);

   staging_buffer_size =
      (program.code_size + program.data_size) * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_kick_usc(&program,
                    staging_buffer,
                    0,
                    false,
                    PDS_GENERATE_CODEDATA_SEGMENTS);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[0],
                               program.data_size,
                               16,
                               &staging_buffer[program.data_size],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
   const struct pvr_device_info *dev_info,
   bool robust_buffer_access)
{
   /* FIXME: Use more local variables to improve formatting. */

   /* Maximum memory allocation needed for const map entries in
    * pvr_pds_generate_vertex_primary_program().
    * When robustBufferAccess is disabled, it must be >= 410.
    * When robustBufferAccess is enabled, it must be >= 570.
    *
    * 1. Size of entry for base instance
    *       (pvr_const_map_entry_base_instance)
    *
    * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
    *       if (!robustBufferAccess)
    *          size of vertex attribute entry
    *             (pvr_const_map_entry_vertex_attribute_address) +
    *       else
    *          size of robust vertex attribute entry
    *             (pvr_const_map_entry_robust_vertex_attribute_address) +
    *          size of entry for max attribute index
    *             (pvr_const_map_entry_vertex_attribute_max_index) +
    *       fi
    *       size of Unified Store burst entry
    *          (pvr_const_map_entry_literal32) +
    *       size of entry for vertex stride
    *          (pvr_const_map_entry_literal32) +
    *       size of entries for DDMAD control word
    *          (num_ddmad_literals * pvr_const_map_entry_literal32))
    *
    * 3. Size of entry for DOUTW vertex/instance control word
    *       (pvr_const_map_entry_literal32)
    *
    * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
    */

   const size_t attribute_size =
      (!robust_buffer_access)
         ? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
         : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
              sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);

   /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word
    * and is increased by one DWORD to contain the data for the DDMADT's
    * out-of-bounds check.
    */
   const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
      1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);

   return (sizeof(struct pvr_const_map_entry_base_instance) +
           PVR_MAX_VERTEX_INPUT_BINDINGS *
              (attribute_size +
               (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
                  sizeof(struct pvr_const_map_entry_literal32)) +
           sizeof(struct pvr_const_map_entry_literal32) +
           sizeof(struct pvr_const_map_entry_doutu_address));
}

/* This is a const pointer to an array of pvr_pds_vertex_dma structs.
 * The array being pointed to is of PVR_MAX_VERTEX_ATTRIB_DMAS size.
 */
typedef struct pvr_pds_vertex_dma (
   *const
      pvr_pds_attrib_dma_descriptions_array_ptr)[PVR_MAX_VERTEX_ATTRIB_DMAS];

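/* Note: because the pointee type of the typedef above carries the array
 * bound, ARRAY_SIZE() can be applied to the dereferenced pointer, i.e.
 * ARRAY_SIZE(*dma_descriptions_out_ptr) == PVR_MAX_VERTEX_ATTRIB_DMAS, and
 * callers cannot pass an array of a different size without a cast. The same
 * idiom is used for the other *_array_ptr typedefs in this file.
 */
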
/* dma_descriptions_out_ptr is a pointer to the array used as output.
 * The whole array might not be filled so dma_count_out indicates how many
 * elements were used.
 */
static void pvr_pds_vertex_attrib_init_dma_descriptions(
   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
   const struct rogue_vs_build_data *vs_data,
   pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
   uint32_t *const dma_count_out)
{
   struct pvr_pds_vertex_dma *const dma_descriptions =
      *dma_descriptions_out_ptr;
   uint32_t dma_count = 0;

   if (!vertex_input_state) {
      *dma_count_out = 0;
      return;
   }

   for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
        i++) {
      const VkVertexInputAttributeDescription *const attrib_desc =
         &vertex_input_state->pVertexAttributeDescriptions[i];
      const VkVertexInputBindingDescription *binding_desc = NULL;

      /* Finding the matching binding description. */
      for (uint32_t j = 0;
           j < vertex_input_state->vertexBindingDescriptionCount;
           j++) {
         const VkVertexInputBindingDescription *const current_binding_desc =
            &vertex_input_state->pVertexBindingDescriptions[j];

         if (current_binding_desc->binding == attrib_desc->binding) {
            binding_desc = current_binding_desc;
            break;
         }
      }

      /* From the Vulkan 1.2.195 spec for
       * VkPipelineVertexInputStateCreateInfo:
       *
       *    "For every binding specified by each element of
       *    pVertexAttributeDescriptions, a
       *    VkVertexInputBindingDescription must exist in
       *    pVertexBindingDescriptions with the same value of binding"
       *
       * So we don't check if we found the matching binding description
       * or not.
       */

      struct pvr_pds_vertex_dma *const dma_desc = &dma_descriptions[dma_count];

      size_t location = attrib_desc->location;
      assert(location < vs_data->inputs.num_input_vars);

      dma_desc->offset = attrib_desc->offset;
      dma_desc->stride = binding_desc->stride;

      dma_desc->flags = 0;

      if (binding_desc->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
         dma_desc->flags |= PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;

      dma_desc->size_in_dwords = vs_data->inputs.components[location];
      /* TODO: This will be different when other types are supported.
       * Store in vs_data with base and components?
       */
      /* TODO: Use attrib_desc->format. */
      dma_desc->component_size_in_bytes = ROGUE_REG_SIZE_BYTES;
      dma_desc->destination = vs_data->inputs.base[location];
      dma_desc->binding_index = attrib_desc->binding;
      dma_desc->divisor = 1;
      dma_desc->robustness_buffer_offset = 0;

      ++dma_count;
   }

   *dma_count_out = dma_count;
}

static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_pds_vertex_primary_program_input *const input,
   struct pvr_pds_attrib_program *const program_out)
{
   const size_t const_entries_size_in_bytes =
      pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
         &device->pdevice->dev_info,
         device->features.robustBufferAccess);
   struct pvr_pds_upload *const program = &program_out->program;
   struct pvr_pds_info *const info = &program_out->info;
   struct pvr_const_map_entry *entries_buffer;
   ASSERTED uint32_t code_size_in_dwords;
   size_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   memset(info, 0, sizeof(*info));

   entries_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              const_entries_size_in_bytes,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!entries_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   info->entries = entries_buffer;
   info->entries_size_in_bytes = const_entries_size_in_bytes;

   pvr_pds_generate_vertex_primary_program(input,
                                           NULL,
                                           info,
                                           device->features.robustBufferAccess,
                                           &device->pdevice->dev_info);

   code_size_in_dwords = info->code_size_in_dwords;
   staging_buffer_size = info->code_size_in_dwords * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      vk_free2(&device->vk.alloc, allocator, entries_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   /* This also fills in info->entries. */
   pvr_pds_generate_vertex_primary_program(input,
                                           staging_buffer,
                                           info,
                                           device->features.robustBufferAccess,
                                           &device->pdevice->dev_info);

   assert(info->code_size_in_dwords <= code_size_in_dwords);

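   /* Shrink the entries buffer from the worst-case size allocated above to
    * the size that was actually written by the program generation.
    */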
   /* FIXME: Add a vk_realloc2() ? */
   entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
                               entries_buffer,
                               info->entries_written_size_in_bytes,
                               8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!entries_buffer) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   info->entries = entries_buffer;
   info->entries_size_in_bytes = info->entries_written_size_in_bytes;

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               staging_buffer,
                               info->code_size_in_dwords,
                               16,
                               16,
                               program);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, entries_buffer);
      vk_free2(&device->vk.alloc, allocator, staging_buffer);

      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static inline void pvr_pds_vertex_attrib_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_pds_attrib_program *const program)
{
   pvr_bo_free(device, program->program.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, program->info.entries);
}

/* This is a const pointer to an array of pvr_pds_attrib_program structs.
 * The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size.
 */
typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
   [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];

/* Generates and uploads a PDS program for DMAing vertex attribs into USC
 * vertex inputs. This will bake the code segment and create a template of the
 * data segment for the command buffer to fill in.
 */
/* If allocator == NULL, the internal one will be used.
 *
 * programs_out_ptr is a pointer to the array where the outputs will be placed.
 */
static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *const allocator,
   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
   uint32_t usc_temp_count,
   const struct rogue_vs_build_data *vs_data,
   pvr_pds_attrib_programs_array_ptr programs_out_ptr)
{
   struct pvr_pds_vertex_dma dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
   struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
   struct pvr_pds_vertex_primary_program_input input = {
      .dma_list = dma_descriptions,
   };
   VkResult result;

   pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
                                               vs_data,
                                               &dma_descriptions,
                                               &input.dma_count);

   pvr_pds_setup_doutu(&input.usc_task_control,
                       0,
                       usc_temp_count,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   /* TODO: If statements for all the "bRequired"s + ui32ExtraFlags. */

   /* Note: programs_out_ptr is a pointer to an array so this is fine. See the
    * typedef.
    */
   for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
      switch (i) {
      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
         input.flags = 0;
         break;

      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
         input.flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
         break;

      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
         /* We unset INSTANCE and set INDIRECT. */
         input.flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
         break;

      default:
         unreachable("Invalid vertex attrib program type.");
      }

      result =
         pvr_pds_vertex_attrib_program_create_and_upload(device,
                                                         allocator,
                                                         &input,
                                                         &programs_out[i]);
      if (result != VK_SUCCESS) {
         for (uint32_t j = 0; j < i; j++) {
            pvr_pds_vertex_attrib_program_destroy(device,
                                                  allocator,
                                                  &programs_out[j]);
         }

         return result;
      }
   }

   return VK_SUCCESS;
}

static size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
{
   /* Maximum memory allocation needed for const map entries in
    * pvr_pds_generate_descriptor_upload_program().
    * It must be >= 688 bytes. This size is calculated as the sum of:
    *
    * 1. Max. number of descriptor sets (8) * (
    *       size of descriptor entry
    *          (pvr_const_map_entry_descriptor_set) +
    *       size of Common Store burst entry
    *          (pvr_const_map_entry_literal32))
    *
    * 2. Max. number of PDS program buffers (24) * (
    *       size of the largest buffer structure
    *          (pvr_const_map_entry_constant_buffer) +
    *       size of Common Store burst entry
    *          (pvr_const_map_entry_literal32))
    *
    * 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
    */

   /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
    * say that it should be 8.
    * Figure out a define for this, or is the comment wrong?
    */
   return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
                sizeof(struct pvr_const_map_entry_literal32)) +
           PVR_PDS_MAX_BUFFERS *
              (sizeof(struct pvr_const_map_entry_constant_buffer) +
               sizeof(struct pvr_const_map_entry_literal32)) +
           sizeof(struct pvr_const_map_entry_doutu_address));
}

/* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
 * structs.
 */
typedef struct pvr_pds_buffer (
   *const pvr_pds_descriptor_program_buffer_array_ptr)[PVR_PDS_MAX_BUFFERS];

/**
 * \brief Setup buffers for the PDS descriptor program.
 *
 * Sets up buffers required by the PDS gen api based on compiler info.
 *
 * For compile time static constants that need DMAing it uploads them and
 * returns the upload in \p static_consts_pvr_bo_out.
 */
static VkResult pvr_pds_descriptor_program_setup_buffers(
   struct pvr_device *device,
   bool robust_buffer_access,
   const struct rogue_compile_time_consts_data *compile_time_consts_data,
   const struct rogue_ubo_data *ubo_data,
   pvr_pds_descriptor_program_buffer_array_ptr buffers_out_ptr,
   uint32_t *const buffer_count_out,
   struct pvr_bo **const static_consts_pvr_bo_out)
{
   struct pvr_pds_buffer *const buffers = *buffers_out_ptr;
   uint32_t buffer_count = 0;

   for (size_t i = 0; i < ubo_data->num_ubo_entries; i++) {
      struct pvr_pds_buffer *current_buffer = &buffers[buffer_count];

      /* This is fine since buffers_out_ptr is a pointer to an array. */
      assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));

      current_buffer->type = PVR_BUFFER_TYPE_UBO;
      current_buffer->size_in_dwords = ubo_data->size[i];
      current_buffer->destination = ubo_data->dest[i];

      current_buffer->buffer_id = buffer_count;
      current_buffer->desc_set = ubo_data->desc_set[i];
      current_buffer->binding = ubo_data->binding[i];
      /* TODO: Is this always the case?
       * E.g. can multiple UBOs have the same base buffer?
       */
      current_buffer->source_offset = 0;

      buffer_count++;
   }

   if (compile_time_consts_data->static_consts.num > 0) {
      VkResult result;

      assert(compile_time_consts_data->static_consts.num <=
             ARRAY_SIZE(compile_time_consts_data->static_consts.value));

      /* This is fine since buffers_out_ptr is a pointer to an array. */
      assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));

      /* TODO: Is it possible to have multiple static consts buffer where the
       * destination is not adjoining? If so we need to handle that.
       * Currently we're only setting up a single buffer.
       */
      buffers[buffer_count++] = (struct pvr_pds_buffer){
         .type = PVR_BUFFER_TYPES_COMPILE_TIME,
         .size_in_dwords = compile_time_consts_data->static_consts.num,
         .destination = compile_time_consts_data->static_consts.dest,
      };

      result = pvr_gpu_upload(device,
                              device->heaps.general_heap,
                              compile_time_consts_data->static_consts.value,
                              compile_time_consts_data->static_consts.num *
                                 ROGUE_REG_SIZE_BYTES,
                              ROGUE_REG_SIZE_BYTES,
                              static_consts_pvr_bo_out);
      if (result != VK_SUCCESS)
         return result;
   } else {
      *static_consts_pvr_bo_out = NULL;
   }

   *buffer_count_out = buffer_count;

   return VK_SUCCESS;
}

static VkResult pvr_pds_descriptor_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const struct rogue_compile_time_consts_data *const compile_time_consts_data,
   const struct rogue_ubo_data *const ubo_data,
   const struct pvr_explicit_constant_usage *const explicit_const_usage,
   const struct pvr_pipeline_layout *const layout,
   enum pvr_stage_allocation stage,
   struct pvr_stage_allocation_descriptor_state *const descriptor_state)
{
   const size_t const_entries_size_in_bytes =
      pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
   struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
   struct pvr_descriptor_program_input program = { 0 };
   struct pvr_const_map_entry *entries_buffer;
   ASSERTED uint32_t code_size_in_dwords;
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   assert(stage != PVR_STAGE_ALLOCATION_COUNT);

   *pds_info = (struct pvr_pds_info){ 0 };

   result = pvr_pds_descriptor_program_setup_buffers(
      device,
      device->features.robustBufferAccess,
      compile_time_consts_data,
      ubo_data,
      &program.buffers,
      &program.buffer_count,
      &descriptor_state->static_consts);
   if (result != VK_SUCCESS)
      return result;

   if (layout->per_stage_reg_info[stage].primary_dynamic_size_in_dwords)
      assert(!"Unimplemented");

   for (uint32_t set_num = 0; set_num < layout->set_count; set_num++) {
      const struct pvr_descriptor_set_layout_mem_layout *const reg_layout =
         &layout->register_layout_in_dwords_per_stage[stage][set_num];
      const uint32_t start_offset = explicit_const_usage->start_offset;

      /* TODO: Use compiler usage info to optimize this? */

      /* Only dma primaries if they are actually required. */
      if (reg_layout->primary_size) {
         program.descriptor_sets[program.descriptor_set_count++] =
            (struct pvr_pds_descriptor_set){
               .descriptor_set = set_num,
               .size_in_dwords = reg_layout->primary_size,
               .destination = reg_layout->primary_offset + start_offset,
               .primary = true,
            };
      }

      /* Only dma secondaries if they are actually required. */
      if (!reg_layout->secondary_size)
         continue;

      program.descriptor_sets[program.descriptor_set_count++] =
         (struct pvr_pds_descriptor_set){
            .descriptor_set = set_num,
            .size_in_dwords = reg_layout->secondary_size,
            .destination = reg_layout->secondary_offset + start_offset,
         };
   }

   entries_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              const_entries_size_in_bytes,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!entries_buffer) {
      pvr_bo_free(device, descriptor_state->static_consts);

      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   pds_info->entries = entries_buffer;
   pds_info->entries_size_in_bytes = const_entries_size_in_bytes;

   pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);

   code_size_in_dwords = pds_info->code_size_in_dwords;
   staging_buffer_size =
      pds_info->code_size_in_dwords * sizeof(*staging_buffer);

   if (!staging_buffer_size) {
      vk_free2(&device->vk.alloc, allocator, entries_buffer);

      *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };

      return VK_SUCCESS;
   }

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      pvr_bo_free(device, descriptor_state->static_consts);
      vk_free2(&device->vk.alloc, allocator, entries_buffer);

      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   pvr_pds_generate_descriptor_upload_program(&program,
                                              staging_buffer,
                                              pds_info);

   assert(pds_info->code_size_in_dwords <= code_size_in_dwords);

   /* FIXME: use vk_realloc2() ? */
   entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
                               entries_buffer,
                               pds_info->entries_written_size_in_bytes,
                               8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!entries_buffer) {
      pvr_bo_free(device, descriptor_state->static_consts);
      vk_free2(&device->vk.alloc, allocator, staging_buffer);

      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   pds_info->entries = entries_buffer;
   pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               staging_buffer,
                               pds_info->code_size_in_dwords,
                               16,
                               16,
                               &descriptor_state->pds_code);
   if (result != VK_SUCCESS) {
      pvr_bo_free(device, descriptor_state->static_consts);
      vk_free2(&device->vk.alloc, allocator, entries_buffer);
      vk_free2(&device->vk.alloc, allocator, staging_buffer);

      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static void pvr_pds_descriptor_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_stage_allocation_descriptor_state *const descriptor_state)
{
   pvr_bo_free(device, descriptor_state->pds_code.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
   pvr_bo_free(device, descriptor_state->static_consts);
}

static void pvr_pds_compute_program_setup(
   const struct pvr_device_info *dev_info,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   bool add_base_workgroup,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_compute_shader_program *const program)
{
   *program = (struct pvr_pds_compute_shader_program){
      /* clang-format off */
      .local_input_regs = {
         local_input_regs[0],
         local_input_regs[1],
         local_input_regs[2]
      },
      .work_group_input_regs = {
         work_group_input_regs[0],
         work_group_input_regs[1],
         work_group_input_regs[2]
      },
      .global_input_regs = {
         [0 ... (PVR_WORKGROUP_DIMENSIONS - 1)] =
            PVR_PDS_COMPUTE_INPUT_REG_UNUSED
      },
      /* clang-format on */
      .barrier_coefficient = barrier_coefficient,
      .flattened_work_groups = true,
      .clear_pds_barrier = false,
      .add_base_workgroup = add_base_workgroup,
      .kick_usc = true,
   };

   STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);
   STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);
   STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);

   pvr_pds_setup_doutu(&program->usc_task_control,
                       usc_shader_dev_addr.addr,
                       usc_temps,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
}

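/* Note: pvr_pds_compute_program_setup() above only fills in the program
 * struct and queries the segment sizes (PDS_GENERATE_SIZES); the callers
 * below are responsible for generating the actual code and data segments.
 */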
/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
 */
static VkResult pvr_pds_compute_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_upload *const pds_upload_out,
   struct pvr_pds_info *const pds_info_out)
{
   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_compute_shader_program program;
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   pvr_pds_compute_program_setup(dev_info,
                                 local_input_regs,
                                 work_group_input_regs,
                                 barrier_coefficient,
                                 false,
                                 usc_temps,
                                 usc_shader_dev_addr,
                                 &program);

   /* FIXME: According to pvr_device_init_compute_pds_program() the code size
    * is in bytes. Investigate this.
    */
   staging_buffer_size =
      (program.code_size + program.data_size) * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: pvr_pds_compute_shader doesn't implement
    * PDS_GENERATE_CODEDATA_SEGMENTS.
    */
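   /* The code segment is emitted at the start of the staging buffer and the
    * data segment directly after it; note this is the reverse of the
    * data-then-code layout used for the coefficient and fragment programs
    * above.
    */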
   pvr_pds_compute_shader(&program,
                          &staging_buffer[0],
                          PDS_GENERATE_CODE_SEGMENT,
                          dev_info);

   pvr_pds_compute_shader(&program,
                          &staging_buffer[program.code_size],
                          PDS_GENERATE_DATA_SEGMENT,
                          dev_info);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[program.code_size],
                               program.data_size,
                               16,
                               &staging_buffer[0],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   *pds_info_out = (struct pvr_pds_info){
      .temps_required = program.highest_temp,
      .code_size_in_dwords = program.code_size,
      .data_size_in_dwords = program.data_size,
   };

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static void pvr_pds_compute_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_pds_upload *const pds_program,
   struct pvr_pds_info *const pds_info)
{
   /* We don't allocate an entries buffer so we don't need to free it. */
   pvr_bo_free(device, pds_program->pvr_bo);
}

/* This only uploads the code segment. The data segment will need to be patched
 * with the base workgroup before uploading.
 */
static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_base_workgroup_program *program_out)
{
   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_compute_shader_program program;
   uint32_t buffer_size;
   uint32_t *buffer;
   VkResult result;

   pvr_pds_compute_program_setup(dev_info,
                                 local_input_regs,
                                 work_group_input_regs,
                                 barrier_coefficient,
                                 true,
                                 usc_temps,
                                 usc_shader_dev_addr,
                                 &program);

   /* FIXME: According to pvr_device_init_compute_pds_program() the code size
    * is in bytes. Investigate this.
    */
   buffer_size = MAX2(program.code_size, program.data_size) * sizeof(*buffer);

   buffer = vk_alloc2(&device->vk.alloc,
                      allocator,
                      buffer_size,
                      8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_compute_shader(&program,
                          &buffer[0],
                          PDS_GENERATE_CODE_SEGMENT,
                          dev_info);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               buffer,
                               program.code_size,
                               16,
                               16,
                               &program_out->code_upload);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, buffer);
      return result;
   }

   pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);

   program_out->data_section = buffer;

   /* We'll need to patch the base workgroup in the PDS data section before
    * dispatch so we save the offsets at which to patch. We only need to save
    * the offset for the first workgroup id since the workgroup ids are stored
    * contiguously in the data segment.
    */
   program_out->base_workgroup_data_patching_offset =
      program.base_workgroup_constant_offset_in_dwords[0];

   program_out->info = (struct pvr_pds_info){
      .temps_required = program.highest_temp,
      .code_size_in_dwords = program.code_size,
      .data_size_in_dwords = program.data_size,
   };

   return VK_SUCCESS;
}
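
/* Illustrative note (not code): at dispatch time the command buffer logic is
 * expected to write the base workgroup IDs into its copy of data_section,
 * starting at base_workgroup_data_patching_offset, and then upload the
 * patched data segment alongside code_upload.
 */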

static void pvr_pds_compute_base_workgroup_variant_program_finish(
   struct pvr_device *device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_pds_base_workgroup_program *const state)
{
   pvr_bo_free(device, state->code_upload.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, state->data_section);
}

/******************************************************************************
   Generic pipeline functions
 ******************************************************************************/

static void pvr_pipeline_init(struct pvr_device *device,
                              enum pvr_pipeline_type type,
                              struct pvr_pipeline *const pipeline)
{
   assert(!pipeline->layout);

   vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);

   pipeline->type = type;
}

static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
{
   vk_object_base_finish(&pipeline->base);
}

/******************************************************************************
   Compute pipeline functions
 ******************************************************************************/

/* Compiles and uploads shaders and PDS programs. */
static VkResult pvr_compute_pipeline_compile(
   struct pvr_device *const device,
   struct pvr_pipeline_cache *pipeline_cache,
   const VkComputePipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *const allocator,
   struct pvr_compute_pipeline *const compute_pipeline)
{
   struct rogue_compile_time_consts_data compile_time_consts_data;
   uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
   struct pvr_explicit_constant_usage explicit_const_usage;
   uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
   struct rogue_ubo_data ubo_data;
   uint32_t barrier_coefficient;
   uint32_t usc_temps;
   VkResult result;

   if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) {
      struct pvr_hard_code_compute_build_info build_info;

      result = pvr_hard_code_compute_pipeline(device,
                                              &compute_pipeline->state.shader,
                                              &build_info);
      if (result != VK_SUCCESS)
         return result;

      ubo_data = build_info.ubo_data;
      compile_time_consts_data = build_info.compile_time_consts_data;

      /* We make sure that the compiler's unused reg value is compatible with
       * the pds api.
       */
      STATIC_ASSERT(ROGUE_REG_UNUSED == PVR_PDS_COMPUTE_INPUT_REG_UNUSED);

      barrier_coefficient = build_info.barrier_reg;

      /* TODO: Maybe change the pds api to use pointers so we avoid the copy. */
      local_input_regs[0] = build_info.local_invocation_regs[0];
      local_input_regs[1] = build_info.local_invocation_regs[1];
      /* This is not a mistake. We want to assign element 1 to 2. */
      local_input_regs[2] = build_info.local_invocation_regs[1];

      STATIC_ASSERT(
         __same_type(work_group_input_regs, build_info.work_group_regs));
      typed_memcpy(work_group_input_regs,
                   build_info.work_group_regs,
                   PVR_WORKGROUP_DIMENSIONS);

      usc_temps = build_info.usc_temps;

      explicit_const_usage = build_info.explicit_conts_usage;

   } else {
      /* FIXME: Compile and upload the shader. */
      /* FIXME: Initialize the shader state and setup build info. */
      abort();
   }

   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      &compile_time_consts_data,
      &ubo_data,
      &explicit_const_usage,
      compute_pipeline->base.layout,
      PVR_STAGE_ALLOCATION_COMPUTE,
      &compute_pipeline->state.descriptor);
   if (result != VK_SUCCESS)
      goto err_free_shader;

   result = pvr_pds_compute_program_create_and_upload(
      device,
      allocator,
      local_input_regs,
      work_group_input_regs,
      barrier_coefficient,
      usc_temps,
      compute_pipeline->state.shader.bo->vma->dev_addr,
      &compute_pipeline->state.primary_program,
      &compute_pipeline->state.primary_program_info);
   if (result != VK_SUCCESS)
      goto err_free_descriptor_program;

   /* If the workgroup ID is required, then we require the base workgroup
    * variant of the PDS compute program as well.
    */
   compute_pipeline->state.flags.base_workgroup =
      work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
      work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
      work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED;

   if (compute_pipeline->state.flags.base_workgroup) {
      result = pvr_pds_compute_base_workgroup_variant_program_init(
         device,
         allocator,
         local_input_regs,
         work_group_input_regs,
         barrier_coefficient,
         usc_temps,
         compute_pipeline->state.shader.bo->vma->dev_addr,
         &compute_pipeline->state.primary_base_workgroup_variant_program);
      if (result != VK_SUCCESS)
         goto err_destroy_compute_program;
   }

   return VK_SUCCESS;

err_destroy_compute_program:
   pvr_pds_compute_program_destroy(
      device,
      allocator,
      &compute_pipeline->state.primary_program,
      &compute_pipeline->state.primary_program_info);

err_free_descriptor_program:
   pvr_bo_free(device, compute_pipeline->state.descriptor.pds_code.pvr_bo);

err_free_shader:
   pvr_bo_free(device, compute_pipeline->state.shader.bo);

   return result;
}

static VkResult
pvr_compute_pipeline_init(struct pvr_device *device,
                          struct pvr_pipeline_cache *pipeline_cache,
                          const VkComputePipelineCreateInfo *pCreateInfo,
                          const VkAllocationCallbacks *allocator,
                          struct pvr_compute_pipeline *compute_pipeline)
{
   VkResult result;

   pvr_pipeline_init(device,
                     PVR_PIPELINE_TYPE_COMPUTE,
                     &compute_pipeline->base);

   compute_pipeline->base.layout =
      pvr_pipeline_layout_from_handle(pCreateInfo->layout);

   result = pvr_compute_pipeline_compile(device,
                                         pipeline_cache,
                                         pCreateInfo,
                                         allocator,
                                         compute_pipeline);
   if (result != VK_SUCCESS) {
      pvr_pipeline_finish(&compute_pipeline->base);
      return result;
   }

   return VK_SUCCESS;
}

static VkResult
pvr_compute_pipeline_create(struct pvr_device *device,
                            struct pvr_pipeline_cache *pipeline_cache,
                            const VkComputePipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *allocator,
                            VkPipeline *const pipeline_out)
{
   struct pvr_compute_pipeline *compute_pipeline;
   VkResult result;

   compute_pipeline = vk_zalloc2(&device->vk.alloc,
                                 allocator,
                                 sizeof(*compute_pipeline),
                                 8,
                                 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!compute_pipeline)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Compiles and uploads shaders and PDS programs. */
   result = pvr_compute_pipeline_init(device,
                                      pipeline_cache,
                                      pCreateInfo,
                                      allocator,
                                      compute_pipeline);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, compute_pipeline);
      return result;
   }

   *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);

   return VK_SUCCESS;
}

static void pvr_compute_pipeline_destroy(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_compute_pipeline *const compute_pipeline)
{
   if (compute_pipeline->state.flags.base_workgroup) {
      pvr_pds_compute_base_workgroup_variant_program_finish(
         device,
         allocator,
         &compute_pipeline->state.primary_base_workgroup_variant_program);
   }

   pvr_pds_compute_program_destroy(
      device,
      allocator,
      &compute_pipeline->state.primary_program,
      &compute_pipeline->state.primary_program_info);
   pvr_pds_descriptor_program_destroy(device,
                                      allocator,
                                      &compute_pipeline->state.descriptor);
   pvr_bo_free(device, compute_pipeline->state.shader.bo);

   pvr_pipeline_finish(&compute_pipeline->base);

   vk_free2(&device->vk.alloc, allocator, compute_pipeline);
}

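/* Note: creation continues for the remaining create infos even when one of
 * them fails; failed entries are returned as VK_NULL_HANDLE and the last
 * failing VkResult is reported to the caller.
 */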
VkResult
pvr_CreateComputePipelines(VkDevice _device,
                           VkPipelineCache pipelineCache,
                           uint32_t createInfoCount,
                           const VkComputePipelineCreateInfo *pCreateInfos,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipelines)
{
   PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;

   for (uint32_t i = 0; i < createInfoCount; i++) {
      const VkResult local_result =
         pvr_compute_pipeline_create(device,
                                     pipeline_cache,
                                     &pCreateInfos[i],
                                     pAllocator,
                                     &pPipelines[i]);
      if (local_result != VK_SUCCESS) {
         result = local_result;
         pPipelines[i] = VK_NULL_HANDLE;
      }
   }

   return result;
}

/******************************************************************************
   Graphics pipeline functions
 ******************************************************************************/

static inline uint32_t pvr_dynamic_state_bit_from_vk(VkDynamicState state)
{
   switch (state) {
   case VK_DYNAMIC_STATE_VIEWPORT:
      return PVR_DYNAMIC_STATE_BIT_VIEWPORT;
   case VK_DYNAMIC_STATE_SCISSOR:
      return PVR_DYNAMIC_STATE_BIT_SCISSOR;
   case VK_DYNAMIC_STATE_LINE_WIDTH:
      return PVR_DYNAMIC_STATE_BIT_LINE_WIDTH;
   case VK_DYNAMIC_STATE_DEPTH_BIAS:
      return PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS;
   case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
      return PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS;
   case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
      return PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
      return PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
      return PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE;
   default:
      unreachable("Unsupported state.");
   }
}

static void
pvr_graphics_pipeline_destroy(struct pvr_device *const device,
                              const VkAllocationCallbacks *const allocator,
                              struct pvr_graphics_pipeline *const gfx_pipeline)
{
   const uint32_t num_vertex_attrib_programs =
      ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs);

   pvr_pds_descriptor_program_destroy(
      device,
      allocator,
      &gfx_pipeline->fragment_shader_state.descriptor_state);

   pvr_pds_descriptor_program_destroy(
      device,
      allocator,
      &gfx_pipeline->vertex_shader_state.descriptor_state);

   for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
      struct pvr_pds_attrib_program *const attrib_program =
         &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i];

      pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
   }

   pvr_bo_free(device,
               gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo);
   pvr_bo_free(device,
               gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo);

   pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo);
   pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo);

   pvr_pipeline_finish(&gfx_pipeline->base);

   vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
}

static void
pvr_vertex_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
                      const struct rogue_common_build_data *common_data,
                      const struct rogue_vs_build_data *vs_data)
{
   struct pvr_vertex_shader_state *vertex_state =
      &gfx_pipeline->vertex_shader_state;

   /* TODO: Hard coding these for now. These should be populated based on the
    * information returned by the compiler.
    */
   vertex_state->stage_state.const_shared_reg_count = common_data->shareds;
   vertex_state->stage_state.const_shared_reg_offset = 0;
   vertex_state->stage_state.temps_count = common_data->temps;
   vertex_state->stage_state.coefficient_size = common_data->coeffs;
   vertex_state->stage_state.uses_atomic_ops = false;
   vertex_state->stage_state.uses_texture_rw = false;
   vertex_state->stage_state.uses_barrier = false;
   vertex_state->stage_state.has_side_effects = false;
   vertex_state->stage_state.empty_program = false;

   vertex_state->vertex_input_size = vs_data->num_vertex_input_regs;
   vertex_state->vertex_output_size =
      vs_data->num_vertex_outputs * ROGUE_REG_SIZE_BYTES;
   vertex_state->user_clip_planes_mask = 0;
   vertex_state->entry_offset = 0;

   /* TODO: The number of varyings should be checked against the fragment
    * shader inputs and assigned in the place where that happens.
    * There will also be an opportunity to cull unused fs inputs/vs outputs.
    */
   pvr_csb_pack (&gfx_pipeline->vertex_shader_state.varying[0],
                 TA_STATE_VARYING0,
                 varying0) {
      varying0.f32_linear = vs_data->num_varyings;
      varying0.f32_flat = 0;
      varying0.f32_npc = 0;
   }

   pvr_csb_pack (&gfx_pipeline->vertex_shader_state.varying[1],
                 TA_STATE_VARYING1,
                 varying1) {
      varying1.f16_linear = 0;
      varying1.f16_flat = 0;
      varying1.f16_npc = 0;
   }
}

static void
pvr_fragment_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
                        const struct rogue_common_build_data *common_data)
{
   struct pvr_fragment_shader_state *fragment_state =
      &gfx_pipeline->fragment_shader_state;

   /* TODO: Hard coding these for now. These should be populated based on the
    * information returned by the compiler.
    */
   fragment_state->stage_state.const_shared_reg_count = 0;
   fragment_state->stage_state.const_shared_reg_offset = 0;
   fragment_state->stage_state.temps_count = common_data->temps;
   fragment_state->stage_state.coefficient_size = common_data->coeffs;
   fragment_state->stage_state.uses_atomic_ops = false;
   fragment_state->stage_state.uses_texture_rw = false;
   fragment_state->stage_state.uses_barrier = false;
   fragment_state->stage_state.has_side_effects = false;
   fragment_state->stage_state.empty_program = false;

   fragment_state->pass_type = 0;
   fragment_state->entry_offset = 0;
}

/* Compiles and uploads shaders and PDS programs. */
static VkResult
pvr_graphics_pipeline_compile(struct pvr_device *const device,
                              struct pvr_pipeline_cache *pipeline_cache,
                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
                              const VkAllocationCallbacks *const allocator,
                              struct pvr_graphics_pipeline *const gfx_pipeline)
{
   /* FIXME: Remove this hard coding. */
   struct pvr_explicit_constant_usage vert_explicit_const_usage = {
      .start_offset = 16,
   };
   struct pvr_explicit_constant_usage frag_explicit_const_usage = {
      .start_offset = 0,
   };
   static uint32_t hard_code_pipeline_n = 0;

   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
      pCreateInfo->pVertexInputState;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct rogue_compiler *compiler = device->pdevice->compiler;
   struct rogue_build_ctx *ctx;
   VkResult result;

   /* Setup shared build context. */
   ctx = rogue_create_build_context(compiler);
   if (!ctx)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* NIR middle-end translation. */
   for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
        stage--) {
      const VkPipelineShaderStageCreateInfo *create_info;
      size_t stage_index = gfx_pipeline->stage_indices[stage];

      if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) {
         if (pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
             BITFIELD_BIT(stage)) {
            continue;
         }
      }

      /* Skip unused/inactive stages. */
      if (stage_index == ~0)
         continue;

      create_info = &pCreateInfo->pStages[stage_index];

      /* SPIR-V to NIR. */
      ctx->nir[stage] = pvr_spirv_to_nir(ctx, stage, create_info);
      if (!ctx->nir[stage]) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   }

   /* Pre-back-end analysis and optimization, driver data extraction. */
   /* TODO: Analyze and cull unused I/O between stages. */
   /* TODO: Allocate UBOs between stages;
    * pipeline->layout->set_{count,layout}.
    */

   /* Back-end translation. */
   for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
        stage--) {
      if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
          pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
             BITFIELD_BIT(stage)) {
         const struct pvr_device_info *const dev_info =
            &device->pdevice->dev_info;
         struct pvr_explicit_constant_usage *explicit_const_usage;

         switch (stage) {
         case MESA_SHADER_VERTEX:
            explicit_const_usage = &vert_explicit_const_usage;
            break;

         case MESA_SHADER_FRAGMENT:
            explicit_const_usage = &frag_explicit_const_usage;
            break;

         default:
            unreachable("Unsupported stage.");
         }

         pvr_hard_code_graphics_shader(dev_info,
                                       hard_code_pipeline_n,
                                       stage,
                                       &ctx->binary[stage]);

         pvr_hard_code_graphics_get_build_info(dev_info,
                                               hard_code_pipeline_n,
                                               stage,
                                               &ctx->common_data[stage],
                                               &ctx->stage_data,
                                               explicit_const_usage);

         continue;
      }

      if (!ctx->nir[stage])
         continue;

      ctx->rogue[stage] = pvr_nir_to_rogue(ctx, ctx->nir[stage]);
      if (!ctx->rogue[stage]) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      ctx->binary[stage] = pvr_rogue_to_binary(ctx, ctx->rogue[stage]);
      if (!ctx->binary[stage]) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   }

   if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
       pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
          BITFIELD_BIT(MESA_SHADER_VERTEX)) {
      pvr_hard_code_graphics_vertex_state(&device->pdevice->dev_info,
                                          hard_code_pipeline_n,
                                          &gfx_pipeline->vertex_shader_state);
   } else {
      pvr_vertex_state_init(gfx_pipeline,
                            &ctx->common_data[MESA_SHADER_VERTEX],
                            &ctx->stage_data.vs);
   }

   result = pvr_gpu_upload_usc(device,
                               ctx->binary[MESA_SHADER_VERTEX]->data,
                               ctx->binary[MESA_SHADER_VERTEX]->size,
                               cache_line_size,
                               &gfx_pipeline->vertex_shader_state.bo);
   if (result != VK_SUCCESS)
      goto err_free_build_context;

   if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
       pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
          BITFIELD_BIT(MESA_SHADER_FRAGMENT)) {
      pvr_hard_code_graphics_fragment_state(
         &device->pdevice->dev_info,
         hard_code_pipeline_n,
         &gfx_pipeline->fragment_shader_state);
   } else {
      pvr_fragment_state_init(gfx_pipeline,
                              &ctx->common_data[MESA_SHADER_FRAGMENT]);
   }

   result = pvr_gpu_upload_usc(device,
                               ctx->binary[MESA_SHADER_FRAGMENT]->data,
                               ctx->binary[MESA_SHADER_FRAGMENT]->size,
                               cache_line_size,
                               &gfx_pipeline->fragment_shader_state.bo);
   if (result != VK_SUCCESS)
      goto err_free_vertex_bo;

   /* TODO: powervr has an optimization where it attempts to recompile shaders.
    * See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented since in our
    * case the optimization doesn't happen.
    */

   /* TODO: The programs we use are hard coded for now, but these should be
    * selected dynamically.
    */

   result = pvr_pds_coeff_program_create_and_upload(
      device,
      allocator,
      ctx->stage_data.fs.iterator_args.fpu_iterators,
      ctx->stage_data.fs.iterator_args.num_fpu_iterators,
      ctx->stage_data.fs.iterator_args.destination,
      &gfx_pipeline->fragment_shader_state.pds_coeff_program);
   if (result != VK_SUCCESS)
      goto err_free_fragment_bo;

   result = pvr_pds_fragment_program_create_and_upload(
      device,
      allocator,
      gfx_pipeline->fragment_shader_state.bo,
      ctx->common_data[MESA_SHADER_FRAGMENT].temps,
      ctx->stage_data.fs.msaa_mode,
      ctx->stage_data.fs.phas,
      &gfx_pipeline->fragment_shader_state.pds_fragment_program);
   if (result != VK_SUCCESS)
      goto err_free_coeff_program;

   result = pvr_pds_vertex_attrib_programs_create_and_upload(
      device,
      allocator,
      vertex_input_state,
      ctx->common_data[MESA_SHADER_VERTEX].temps,
      &ctx->stage_data.vs,
      &gfx_pipeline->vertex_shader_state.pds_attrib_programs);
   if (result != VK_SUCCESS)
      goto err_free_frag_program;

   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      &ctx->common_data[MESA_SHADER_VERTEX].compile_time_consts_data,
      &ctx->common_data[MESA_SHADER_VERTEX].ubo_data,
      &vert_explicit_const_usage,
      gfx_pipeline->base.layout,
      PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
      &gfx_pipeline->vertex_shader_state.descriptor_state);
   if (result != VK_SUCCESS)
      goto err_free_vertex_attrib_program;

   /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
    * scratch buffer for both vertex and fragment stage.
    * Figure out the best place to do this.
    */
   /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
   /* TODO: Implement spilling with the above. */

   /* TODO: Call pvr_pds_program_program_create_and_upload in a loop. */
   /* FIXME: For now we pass in the same explicit_const_usage since it contains
    * all invalid entries. Fix this by hooking it up to the compiler.
    */
   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      &ctx->common_data[MESA_SHADER_FRAGMENT].compile_time_consts_data,
      &ctx->common_data[MESA_SHADER_FRAGMENT].ubo_data,
      &frag_explicit_const_usage,
      gfx_pipeline->base.layout,
      PVR_STAGE_ALLOCATION_FRAGMENT,
      &gfx_pipeline->fragment_shader_state.descriptor_state);
   if (result != VK_SUCCESS)
      goto err_free_vertex_descriptor_program;

   ralloc_free(ctx);

   hard_code_pipeline_n++;

   return VK_SUCCESS;

err_free_vertex_descriptor_program:
   pvr_pds_descriptor_program_destroy(
      device,
      allocator,
      &gfx_pipeline->vertex_shader_state.descriptor_state);
err_free_vertex_attrib_program:
   for (uint32_t i = 0;
        i < ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs);
        i++) {
      struct pvr_pds_attrib_program *const attrib_program =
         &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i];

      pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
   }
err_free_frag_program:
   pvr_bo_free(device,
               gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo);
err_free_coeff_program:
   pvr_bo_free(device,
               gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo);
err_free_fragment_bo:
   pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo);
err_free_vertex_bo:
   pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo);
err_free_build_context:
   ralloc_free(ctx);
   return result;
}

static void pvr_graphics_pipeline_init_depth_and_stencil_state(
   struct pvr_graphics_pipeline *gfx_pipeline,
   const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state)
{
   const VkStencilOpState *front;
   const VkStencilOpState *back;

   if (!depth_stencil_state)
      return;

   front = &depth_stencil_state->front;
   back = &depth_stencil_state->back;

   if (depth_stencil_state->depthTestEnable) {
      gfx_pipeline->depth_compare_op = depth_stencil_state->depthCompareOp;
      gfx_pipeline->depth_write_disable =
         !depth_stencil_state->depthWriteEnable;
   } else {
      gfx_pipeline->depth_compare_op = VK_COMPARE_OP_ALWAYS;
      gfx_pipeline->depth_write_disable = true;
   }

   if (depth_stencil_state->stencilTestEnable) {
      gfx_pipeline->stencil_front.compare_op = front->compareOp;
      gfx_pipeline->stencil_front.fail_op = front->failOp;
      gfx_pipeline->stencil_front.depth_fail_op = front->depthFailOp;
      gfx_pipeline->stencil_front.pass_op = front->passOp;

      gfx_pipeline->stencil_back.compare_op = back->compareOp;
      gfx_pipeline->stencil_back.fail_op = back->failOp;
      gfx_pipeline->stencil_back.depth_fail_op = back->depthFailOp;
      gfx_pipeline->stencil_back.pass_op = back->passOp;
   } else {
      gfx_pipeline->stencil_front.compare_op = VK_COMPARE_OP_ALWAYS;
      gfx_pipeline->stencil_front.fail_op = VK_STENCIL_OP_KEEP;
      gfx_pipeline->stencil_front.depth_fail_op = VK_STENCIL_OP_KEEP;
      gfx_pipeline->stencil_front.pass_op = VK_STENCIL_OP_KEEP;

      gfx_pipeline->stencil_back = gfx_pipeline->stencil_front;
   }
}

static void pvr_graphics_pipeline_init_dynamic_state(
   struct pvr_graphics_pipeline *gfx_pipeline,
   const VkPipelineDynamicStateCreateInfo *dynamic_state,
   const VkPipelineViewportStateCreateInfo *viewport_state,
   const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state,
static void pvr_graphics_pipeline_init_dynamic_state(
   struct pvr_graphics_pipeline *gfx_pipeline,
   const VkPipelineDynamicStateCreateInfo *dynamic_state,
   const VkPipelineViewportStateCreateInfo *viewport_state,
   const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state,
   const VkPipelineColorBlendStateCreateInfo *color_blend_state,
   const VkPipelineRasterizationStateCreateInfo *rasterization_state)
{
   struct pvr_dynamic_state *const internal_dynamic_state =
      &gfx_pipeline->dynamic_state;
   uint32_t dynamic_states = 0;

   if (dynamic_state) {
      for (uint32_t i = 0; i < dynamic_state->dynamicStateCount; i++) {
         dynamic_states |=
            pvr_dynamic_state_bit_from_vk(dynamic_state->pDynamicStates[i]);
      }
   }

   /* TODO: Verify this.
    * We don't zero out the pipeline's state when it is dynamic, since it
    * should be set later on in the command buffer.
    */

   /* TODO: Handle rasterizerDiscardEnable. */

   if (rasterization_state) {
      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH))
         internal_dynamic_state->line_width = rasterization_state->lineWidth;

      /* TODO: Do we need the depthBiasEnable check? */
      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) {
         internal_dynamic_state->depth_bias.constant_factor =
            rasterization_state->depthBiasConstantFactor;
         internal_dynamic_state->depth_bias.clamp =
            rasterization_state->depthBiasClamp;
         internal_dynamic_state->depth_bias.slope_factor =
            rasterization_state->depthBiasSlopeFactor;
      }
   }

   /* TODO: Handle viewport state flags. */

   /* TODO: Handle static viewport state. */
   /* We assume the viewport state to be dynamic for now. */

   /* TODO: Handle static scissor state. */
   /* We assume the scissor state to be dynamic for now. */

   if (depth_stencil_state) {
      const VkStencilOpState *const front = &depth_stencil_state->front;
      const VkStencilOpState *const back = &depth_stencil_state->back;

      /* VkPhysicalDeviceFeatures->depthBounds is false. */
      assert(depth_stencil_state->depthBoundsTestEnable == VK_FALSE);

      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) {
         internal_dynamic_state->compare_mask.front = front->compareMask;
         internal_dynamic_state->compare_mask.back = back->compareMask;
      }

      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) {
         internal_dynamic_state->write_mask.front = front->writeMask;
         internal_dynamic_state->write_mask.back = back->writeMask;
      }

      if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) {
         internal_dynamic_state->reference.front = front->reference;
         internal_dynamic_state->reference.back = back->reference;
      }
   }

   if (color_blend_state &&
       !(dynamic_states & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) {
      STATIC_ASSERT(__same_type(internal_dynamic_state->blend_constants,
                                color_blend_state->blendConstants));

      typed_memcpy(internal_dynamic_state->blend_constants,
                   color_blend_state->blendConstants,
                   ARRAY_SIZE(internal_dynamic_state->blend_constants));
   }

   /* TODO: Handle STATIC_STATE_DEPTH_BOUNDS? */

   internal_dynamic_state->mask = dynamic_states;
}

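/* Fills in the non-dynamic pipeline state from pCreateInfo and then compiles
 * and uploads the shaders and PDS programs via
 * pvr_graphics_pipeline_compile(). On failure the pipeline base object is
 * finished again, so the caller only has to free the pipeline allocation
 * itself.
 */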
static VkResult
pvr_graphics_pipeline_init(struct pvr_device *device,
                           struct pvr_pipeline_cache *pipeline_cache,
                           const VkGraphicsPipelineCreateInfo *pCreateInfo,
                           const VkAllocationCallbacks *allocator,
                           struct pvr_graphics_pipeline *gfx_pipeline)
{
   /* If rasterization is not enabled, various CreateInfo structs must be
    * ignored.
    */
   const bool raster_discard_enabled =
      pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
   const VkPipelineViewportStateCreateInfo *vs_info =
      !raster_discard_enabled ? pCreateInfo->pViewportState : NULL;
   const VkPipelineDepthStencilStateCreateInfo *dss_info =
      !raster_discard_enabled ? pCreateInfo->pDepthStencilState : NULL;
   const VkPipelineRasterizationStateCreateInfo *rs_info =
      !raster_discard_enabled ? pCreateInfo->pRasterizationState : NULL;
   const VkPipelineColorBlendStateCreateInfo *cbs_info =
      !raster_discard_enabled ? pCreateInfo->pColorBlendState : NULL;
   const VkPipelineMultisampleStateCreateInfo *ms_info =
      !raster_discard_enabled ? pCreateInfo->pMultisampleState : NULL;
   VkResult result;

   pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);

   pvr_finishme("ignoring pCreateInfo flags.");
   pvr_finishme("ignoring pipeline cache.");

   gfx_pipeline->raster_state.discard_enable = raster_discard_enabled;
   gfx_pipeline->raster_state.cull_mode =
      pCreateInfo->pRasterizationState->cullMode;
   gfx_pipeline->raster_state.front_face =
      pCreateInfo->pRasterizationState->frontFace;
   gfx_pipeline->raster_state.depth_bias_enable =
      pCreateInfo->pRasterizationState->depthBiasEnable;
   gfx_pipeline->raster_state.depth_clamp_enable =
      pCreateInfo->pRasterizationState->depthClampEnable;

   /* FIXME: Handle depthClampEnable. */

   pvr_graphics_pipeline_init_depth_and_stencil_state(gfx_pipeline, dss_info);
   pvr_graphics_pipeline_init_dynamic_state(gfx_pipeline,
                                            pCreateInfo->pDynamicState,
                                            vs_info,
                                            dss_info,
                                            cbs_info,
                                            rs_info);

   if (pCreateInfo->pInputAssemblyState) {
      gfx_pipeline->input_asm_state.topology =
         pCreateInfo->pInputAssemblyState->topology;
      gfx_pipeline->input_asm_state.primitive_restart =
         pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
   }

   memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));

   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
      gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
      /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
       *
       *    "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
       *    or VK_SHADER_STAGE_ALL."
       *
       * So we don't handle that.
       *
       * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
       * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
       * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
       * structure returned by the driver.
       */
      switch (pCreateInfo->pStages[i].stage) {
      case VK_SHADER_STAGE_VERTEX_BIT:
      case VK_SHADER_STAGE_FRAGMENT_BIT:
         gfx_pipeline->stage_indices[gl_stage] = i;
         break;
      default:
         unreachable("Unsupported stage.");
      }
   }

   gfx_pipeline->base.layout =
      pvr_pipeline_layout_from_handle(pCreateInfo->layout);

   if (ms_info) {
      gfx_pipeline->rasterization_samples = ms_info->rasterizationSamples;
      gfx_pipeline->sample_mask =
         (ms_info->pSampleMask) ? ms_info->pSampleMask[0] : 0xFFFFFFFF;
   } else {
      gfx_pipeline->rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
      gfx_pipeline->sample_mask = 0xFFFFFFFF;
   }

   /* Compiles and uploads shaders and PDS programs. */
   result = pvr_graphics_pipeline_compile(device,
                                          pipeline_cache,
                                          pCreateInfo,
                                          allocator,
                                          gfx_pipeline);
   if (result != VK_SUCCESS) {
      pvr_pipeline_finish(&gfx_pipeline->base);
      return result;
   }

   return VK_SUCCESS;
}

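/* Allocates and initializes a pvr_graphics_pipeline object. If
 * pvr_graphics_pipeline_init() fails, the allocation is freed again so a
 * partially initialized pipeline handle is never returned to the caller.
 */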
/* If allocator == NULL, the internal one will be used. */
static VkResult
pvr_graphics_pipeline_create(struct pvr_device *device,
                             struct pvr_pipeline_cache *pipeline_cache,
                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
                             const VkAllocationCallbacks *allocator,
                             VkPipeline *const pipeline_out)
{
   struct pvr_graphics_pipeline *gfx_pipeline;
   VkResult result;

   gfx_pipeline = vk_zalloc2(&device->vk.alloc,
                             allocator,
                             sizeof(*gfx_pipeline),
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!gfx_pipeline)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Compiles and uploads shaders and PDS programs too. */
   result = pvr_graphics_pipeline_init(device,
                                       pipeline_cache,
                                       pCreateInfo,
                                       allocator,
                                       gfx_pipeline);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
      return result;
   }

   *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);

   return VK_SUCCESS;
}

VkResult
pvr_CreateGraphicsPipelines(VkDevice _device,
                            VkPipelineCache pipelineCache,
                            uint32_t createInfoCount,
                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipelines)
{
   PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;

   for (uint32_t i = 0; i < createInfoCount; i++) {
      const VkResult local_result =
         pvr_graphics_pipeline_create(device,
                                      pipeline_cache,
                                      &pCreateInfos[i],
                                      pAllocator,
                                      &pPipelines[i]);
      if (local_result != VK_SUCCESS) {
         result = local_result;
         pPipelines[i] = VK_NULL_HANDLE;
      }
   }

   return result;
}

/*****************************************************************************
   Other functions
*****************************************************************************/

void pvr_DestroyPipeline(VkDevice _device,
                         VkPipeline _pipeline,
                         const VkAllocationCallbacks *pAllocator)
{
   PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
   PVR_FROM_HANDLE(pvr_device, device, _device);

   if (!pipeline)
      return;

   switch (pipeline->type) {
   case PVR_PIPELINE_TYPE_GRAPHICS: {
      struct pvr_graphics_pipeline *const gfx_pipeline =
         to_pvr_graphics_pipeline(pipeline);

      pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
      break;
   }

   case PVR_PIPELINE_TYPE_COMPUTE: {
      struct pvr_compute_pipeline *const compute_pipeline =
         to_pvr_compute_pipeline(pipeline);

      pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
      break;
   }

   default:
      unreachable("Unknown pipeline type.");
   }
}