1/* 2 * Copyright © 2016 Red Hat. 3 * Copyright © 2016 Bas Nieuwenhuizen 4 * SPDX-License-Identifier: MIT 5 * 6 * based in part on anv driver which is: 7 * Copyright © 2015 Intel Corporation 8 */ 9 10#include "tu_pipeline.h" 11 12#include "common/freedreno_guardband.h" 13 14#include "ir3/ir3_nir.h" 15#include "main/menums.h" 16#include "nir/nir.h" 17#include "nir/nir_builder.h" 18#include "spirv/nir_spirv.h" 19#include "util/debug.h" 20#include "util/mesa-sha1.h" 21#include "vk_pipeline.h" 22#include "vk_render_pass.h" 23#include "vk_util.h" 24 25#include "tu_cmd_buffer.h" 26#include "tu_cs.h" 27#include "tu_device.h" 28#include "tu_formats.h" 29#include "tu_lrz.h" 30#include "tu_pass.h" 31 32/* Emit IB that preloads the descriptors that the shader uses */ 33 34static void 35emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st, 36 enum a6xx_state_block sb, unsigned base, unsigned offset, 37 unsigned count) 38{ 39 /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not 40 * clear if emitting more packets will even help anything. Presumably the 41 * descriptor cache is relatively small, and these packets stop doing 42 * anything when there are too many descriptors. 43 */ 44 tu_cs_emit_pkt7(cs, opcode, 3); 45 tu_cs_emit(cs, 46 CP_LOAD_STATE6_0_STATE_TYPE(st) | 47 CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) | 48 CP_LOAD_STATE6_0_STATE_BLOCK(sb) | 49 CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1))); 50 tu_cs_emit_qw(cs, offset | (base << 28)); 51} 52 53static unsigned 54tu6_load_state_size(struct tu_pipeline *pipeline, 55 struct tu_pipeline_layout *layout, bool compute) 56{ 57 const unsigned load_state_size = 4; 58 unsigned size = 0; 59 for (unsigned i = 0; i < layout->num_sets; i++) { 60 if (!(pipeline->active_desc_sets & (1u << i))) 61 continue; 62 63 struct tu_descriptor_set_layout *set_layout = layout->set[i].layout; 64 for (unsigned j = 0; j < set_layout->binding_count; j++) { 65 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; 66 unsigned count = 0; 67 /* Note: some users, like amber for example, pass in 68 * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so 69 * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly. 70 */ 71 VkShaderStageFlags stages = compute ? 72 binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT : 73 binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; 74 unsigned stage_count = util_bitcount(stages); 75 76 if (!binding->array_size) 77 continue; 78 79 switch (binding->type) { 80 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 81 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: 82 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: 83 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: 84 /* IBO-backed resources only need one packet for all graphics stages */ 85 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) 86 count += 1; 87 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) 88 count += 1; 89 break; 90 case VK_DESCRIPTOR_TYPE_SAMPLER: 91 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: 92 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: 93 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: 94 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 95 /* Textures and UBO's needs a packet for each stage */ 96 count = stage_count; 97 break; 98 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: 99 /* Because of how we pack combined images and samplers, we 100 * currently can't use one packet for the whole array. 
101 */ 102 count = stage_count * binding->array_size * 2; 103 break; 104 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: 105 case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE: 106 break; 107 default: 108 unreachable("bad descriptor type"); 109 } 110 size += count * load_state_size; 111 } 112 } 113 return size; 114} 115 116static void 117tu6_emit_load_state(struct tu_pipeline *pipeline, 118 struct tu_pipeline_layout *layout, bool compute) 119{ 120 unsigned size = tu6_load_state_size(pipeline, layout, compute); 121 if (size == 0) 122 return; 123 124 struct tu_cs cs; 125 tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); 126 127 for (unsigned i = 0; i < layout->num_sets; i++) { 128 /* From 13.2.7. Descriptor Set Binding: 129 * 130 * A compatible descriptor set must be bound for all set numbers that 131 * any shaders in a pipeline access, at the time that a draw or 132 * dispatch command is recorded to execute using that pipeline. 133 * However, if none of the shaders in a pipeline statically use any 134 * bindings with a particular set number, then no descriptor set need 135 * be bound for that set number, even if the pipeline layout includes 136 * a non-trivial descriptor set layout for that set number. 137 * 138 * This means that descriptor sets unused by the pipeline may have a 139 * garbage or 0 BINDLESS_BASE register, which will cause context faults 140 * when prefetching descriptors from these sets. Skip prefetching for 141 * descriptors from them to avoid this. This is also an optimization, 142 * since these prefetches would be useless. 143 */ 144 if (!(pipeline->active_desc_sets & (1u << i))) 145 continue; 146 147 struct tu_descriptor_set_layout *set_layout = layout->set[i].layout; 148 for (unsigned j = 0; j < set_layout->binding_count; j++) { 149 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; 150 unsigned base = i; 151 unsigned offset = binding->offset / 4; 152 /* Note: some users, like amber for example, pass in 153 * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so 154 * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly. 155 */ 156 VkShaderStageFlags stages = compute ? 157 binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT : 158 binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS; 159 unsigned count = binding->array_size; 160 161 /* If this is a variable-count descriptor, then the array_size is an 162 * upper bound on the size, but we don't know how many descriptors 163 * will actually be used. Therefore we can't pre-load them here. 
164 */ 165 if (j == set_layout->binding_count - 1 && 166 set_layout->has_variable_descriptors) 167 continue; 168 169 if (count == 0 || stages == 0) 170 continue; 171 switch (binding->type) { 172 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: 173 base = MAX_SETS; 174 offset = (layout->set[i].dynamic_offset_start + 175 binding->dynamic_offset_offset) / 4; 176 FALLTHROUGH; 177 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 178 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: 179 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: { 180 unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4); 181 /* IBO-backed resources only need one packet for all graphics stages */ 182 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) { 183 emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO, 184 base, offset, count * mul); 185 } 186 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { 187 emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER, 188 base, offset, count * mul); 189 } 190 break; 191 } 192 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: 193 case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE: 194 /* nothing - input attachment doesn't use bindless */ 195 break; 196 case VK_DESCRIPTOR_TYPE_SAMPLER: 197 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: 198 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { 199 tu_foreach_stage(stage, stages) { 200 emit_load_state(&cs, tu6_stage2opcode(stage), 201 binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ? 202 ST6_SHADER : ST6_CONSTANTS, 203 tu6_stage2texsb(stage), base, offset, count); 204 } 205 break; 206 } 207 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 208 base = MAX_SETS; 209 offset = (layout->set[i].dynamic_offset_start + 210 binding->dynamic_offset_offset) / 4; 211 FALLTHROUGH; 212 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { 213 tu_foreach_stage(stage, stages) { 214 emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO, 215 tu6_stage2shadersb(stage), base, offset, count); 216 } 217 break; 218 } 219 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: { 220 tu_foreach_stage(stage, stages) { 221 /* TODO: We could emit less CP_LOAD_STATE6 if we used 222 * struct-of-arrays instead of array-of-structs. 
223 */ 224 for (unsigned i = 0; i < count; i++) { 225 unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS; 226 unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS; 227 emit_load_state(&cs, tu6_stage2opcode(stage), 228 ST6_CONSTANTS, tu6_stage2texsb(stage), 229 base, tex_offset, 1); 230 emit_load_state(&cs, tu6_stage2opcode(stage), 231 ST6_SHADER, tu6_stage2texsb(stage), 232 base, sam_offset, 1); 233 } 234 } 235 break; 236 } 237 default: 238 unreachable("bad descriptor type"); 239 } 240 } 241 } 242 243 pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs); 244} 245 246struct tu_pipeline_builder 247{ 248 struct tu_device *device; 249 void *mem_ctx; 250 struct vk_pipeline_cache *cache; 251 struct tu_pipeline_layout *layout; 252 const VkAllocationCallbacks *alloc; 253 const VkGraphicsPipelineCreateInfo *create_info; 254 255 struct tu_compiled_shaders *shaders; 256 struct ir3_shader_variant *binning_variant; 257 uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1]; 258 uint64_t binning_vs_iova; 259 260 uint32_t additional_cs_reserve_size; 261 262 struct tu_pvtmem_config pvtmem; 263 264 bool rasterizer_discard; 265 /* these states are affectd by rasterizer_discard */ 266 bool emit_msaa_state; 267 bool depth_clip_disable; 268 VkSampleCountFlagBits samples; 269 bool use_color_attachments; 270 bool use_dual_src_blend; 271 bool alpha_to_coverage; 272 uint32_t color_attachment_count; 273 VkFormat color_attachment_formats[MAX_RTS]; 274 VkFormat depth_attachment_format; 275 uint32_t render_components; 276 uint32_t multiview_mask; 277 278 bool subpass_raster_order_attachment_access; 279 bool subpass_feedback_loop_color; 280 bool subpass_feedback_loop_ds; 281}; 282 283static bool 284tu_logic_op_reads_dst(VkLogicOp op) 285{ 286 switch (op) { 287 case VK_LOGIC_OP_CLEAR: 288 case VK_LOGIC_OP_COPY: 289 case VK_LOGIC_OP_COPY_INVERTED: 290 case VK_LOGIC_OP_SET: 291 return false; 292 default: 293 return true; 294 } 295} 296 297static VkBlendFactor 298tu_blend_factor_no_dst_alpha(VkBlendFactor factor) 299{ 300 /* treat dst alpha as 1.0 and avoid reading it */ 301 switch (factor) { 302 case VK_BLEND_FACTOR_DST_ALPHA: 303 return VK_BLEND_FACTOR_ONE; 304 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: 305 return VK_BLEND_FACTOR_ZERO; 306 default: 307 return factor; 308 } 309} 310 311static bool tu_blend_factor_is_dual_src(VkBlendFactor factor) 312{ 313 switch (factor) { 314 case VK_BLEND_FACTOR_SRC1_COLOR: 315 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: 316 case VK_BLEND_FACTOR_SRC1_ALPHA: 317 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: 318 return true; 319 default: 320 return false; 321 } 322} 323 324static bool 325tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info) 326{ 327 if (!info) 328 return false; 329 330 for (unsigned i = 0; i < info->attachmentCount; i++) { 331 const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i]; 332 if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) || 333 tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) || 334 tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) || 335 tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor)) 336 return true; 337 } 338 339 return false; 340} 341 342static const struct xs_config { 343 uint16_t reg_sp_xs_ctrl; 344 uint16_t reg_sp_xs_config; 345 uint16_t reg_sp_xs_instrlen; 346 uint16_t reg_hlsq_xs_ctrl; 347 uint16_t reg_sp_xs_first_exec_offset; 348 uint16_t reg_sp_xs_pvt_mem_hw_stack_offset; 349} xs_config[] = { 350 [MESA_SHADER_VERTEX] = { 351 REG_A6XX_SP_VS_CTRL_REG0, 352 
REG_A6XX_SP_VS_CONFIG, 353 REG_A6XX_SP_VS_INSTRLEN, 354 REG_A6XX_HLSQ_VS_CNTL, 355 REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET, 356 REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET, 357 }, 358 [MESA_SHADER_TESS_CTRL] = { 359 REG_A6XX_SP_HS_CTRL_REG0, 360 REG_A6XX_SP_HS_CONFIG, 361 REG_A6XX_SP_HS_INSTRLEN, 362 REG_A6XX_HLSQ_HS_CNTL, 363 REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET, 364 REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET, 365 }, 366 [MESA_SHADER_TESS_EVAL] = { 367 REG_A6XX_SP_DS_CTRL_REG0, 368 REG_A6XX_SP_DS_CONFIG, 369 REG_A6XX_SP_DS_INSTRLEN, 370 REG_A6XX_HLSQ_DS_CNTL, 371 REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET, 372 REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET, 373 }, 374 [MESA_SHADER_GEOMETRY] = { 375 REG_A6XX_SP_GS_CTRL_REG0, 376 REG_A6XX_SP_GS_CONFIG, 377 REG_A6XX_SP_GS_INSTRLEN, 378 REG_A6XX_HLSQ_GS_CNTL, 379 REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET, 380 REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET, 381 }, 382 [MESA_SHADER_FRAGMENT] = { 383 REG_A6XX_SP_FS_CTRL_REG0, 384 REG_A6XX_SP_FS_CONFIG, 385 REG_A6XX_SP_FS_INSTRLEN, 386 REG_A6XX_HLSQ_FS_CNTL, 387 REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET, 388 REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET, 389 }, 390 [MESA_SHADER_COMPUTE] = { 391 REG_A6XX_SP_CS_CTRL_REG0, 392 REG_A6XX_SP_CS_CONFIG, 393 REG_A6XX_SP_CS_INSTRLEN, 394 REG_A6XX_HLSQ_CS_CNTL, 395 REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET, 396 REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, 397 }, 398}; 399 400static uint32_t 401tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs) 402{ 403 const struct ir3_const_state *const_state = ir3_const_state(xs); 404 uint32_t base = const_state->offsets.immediate; 405 int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4); 406 407 /* truncate size to avoid writing constants that shader 408 * does not use: 409 */ 410 size = MIN2(size + base, xs->constlen) - base; 411 412 return MAX2(size, 0) * 4; 413} 414 415/* We allocate fixed-length substreams for shader state, however some 416 * parts of the state may have unbound length. Their additional space 417 * requirements should be calculated here. 418 */ 419static uint32_t 420tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs) 421{ 422 const struct ir3_const_state *const_state = ir3_const_state(xs); 423 424 uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs); 425 426 /* Variable number of UBO upload ranges. 
*/ 427 size += 4 * const_state->ubo_state.num_enabled; 428 429 /* Variable number of dwords for the primitive map */ 430 size += xs->input_size; 431 432 size += xs->constant_data_size / 4; 433 434 return size; 435} 436 437void 438tu6_emit_xs_config(struct tu_cs *cs, 439 gl_shader_stage stage, /* xs->type, but xs may be NULL */ 440 const struct ir3_shader_variant *xs) 441{ 442 const struct xs_config *cfg = &xs_config[stage]; 443 444 if (!xs) { 445 /* shader stage disabled */ 446 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1); 447 tu_cs_emit(cs, 0); 448 449 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); 450 tu_cs_emit(cs, 0); 451 return; 452 } 453 454 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1); 455 tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED | 456 COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) | 457 COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) | 458 COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) | 459 COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) | 460 A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) | 461 A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp)); 462 463 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); 464 tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) | 465 A6XX_HLSQ_VS_CNTL_ENABLED); 466} 467 468void 469tu6_emit_xs(struct tu_cs *cs, 470 gl_shader_stage stage, /* xs->type, but xs may be NULL */ 471 const struct ir3_shader_variant *xs, 472 const struct tu_pvtmem_config *pvtmem, 473 uint64_t binary_iova) 474{ 475 const struct xs_config *cfg = &xs_config[stage]; 476 477 if (!xs) { 478 /* shader stage disabled */ 479 return; 480 } 481 482 enum a6xx_threadsize thrsz = 483 xs->info.double_threadsize ? THREAD128 : THREAD64; 484 switch (stage) { 485 case MESA_SHADER_VERTEX: 486 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0( 487 .fullregfootprint = xs->info.max_reg + 1, 488 .halfregfootprint = xs->info.max_half_reg + 1, 489 .branchstack = ir3_shader_branchstack_hw(xs), 490 .mergedregs = xs->mergedregs, 491 )); 492 break; 493 case MESA_SHADER_TESS_CTRL: 494 tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0( 495 .fullregfootprint = xs->info.max_reg + 1, 496 .halfregfootprint = xs->info.max_half_reg + 1, 497 .branchstack = ir3_shader_branchstack_hw(xs), 498 )); 499 break; 500 case MESA_SHADER_TESS_EVAL: 501 tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0( 502 .fullregfootprint = xs->info.max_reg + 1, 503 .halfregfootprint = xs->info.max_half_reg + 1, 504 .branchstack = ir3_shader_branchstack_hw(xs), 505 )); 506 break; 507 case MESA_SHADER_GEOMETRY: 508 tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0( 509 .fullregfootprint = xs->info.max_reg + 1, 510 .halfregfootprint = xs->info.max_half_reg + 1, 511 .branchstack = ir3_shader_branchstack_hw(xs), 512 )); 513 break; 514 case MESA_SHADER_FRAGMENT: 515 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0( 516 .fullregfootprint = xs->info.max_reg + 1, 517 .halfregfootprint = xs->info.max_half_reg + 1, 518 .branchstack = ir3_shader_branchstack_hw(xs), 519 .mergedregs = xs->mergedregs, 520 .threadsize = thrsz, 521 .pixlodenable = xs->need_pixlod, 522 .diff_fine = xs->need_fine_derivatives, 523 .varying = xs->total_in != 0, 524 /* unknown bit, seems unnecessary */ 525 .unk24 = true, 526 )); 527 break; 528 case MESA_SHADER_COMPUTE: 529 tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0( 530 .fullregfootprint = xs->info.max_reg + 1, 531 .halfregfootprint = xs->info.max_half_reg + 1, 532 .branchstack = ir3_shader_branchstack_hw(xs), 533 .mergedregs = xs->mergedregs, 534 .threadsize = thrsz, 535 )); 536 break; 537 default: 538 unreachable("bad shader stage"); 539 } 540 541 
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1); 542 tu_cs_emit(cs, xs->instrlen); 543 544 /* emit program binary & private memory layout 545 * binary_iova should be aligned to 1 instrlen unit (128 bytes) 546 */ 547 548 assert((binary_iova & 0x7f) == 0); 549 assert((pvtmem->iova & 0x1f) == 0); 550 551 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7); 552 tu_cs_emit(cs, 0); 553 tu_cs_emit_qw(cs, binary_iova); 554 tu_cs_emit(cs, 555 A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size)); 556 tu_cs_emit_qw(cs, pvtmem->iova); 557 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) | 558 COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); 559 560 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1); 561 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size)); 562 563 uint32_t shader_preload_size = 564 MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size); 565 566 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); 567 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | 568 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | 569 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 570 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 571 CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); 572 tu_cs_emit_qw(cs, binary_iova); 573 574 /* emit immediates */ 575 576 const struct ir3_const_state *const_state = ir3_const_state(xs); 577 uint32_t base = const_state->offsets.immediate; 578 unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs); 579 580 if (immediate_size > 0) { 581 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size); 582 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | 583 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 584 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 585 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 586 CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4)); 587 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 588 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 589 590 tu_cs_emit_array(cs, const_state->immediates, immediate_size); 591 } 592 593 if (const_state->constant_data_ubo != -1) { 594 uint64_t iova = binary_iova + xs->info.constant_data_offset; 595 596 /* Upload UBO state for the constant data. */ 597 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5); 598 tu_cs_emit(cs, 599 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) | 600 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)| 601 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 602 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 603 CP_LOAD_STATE6_0_NUM_UNIT(1)); 604 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 605 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 606 int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16); 607 tu_cs_emit_qw(cs, 608 iova | 609 (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32); 610 611 /* Upload the constant data to the const file if needed. 
*/ 612 const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state; 613 614 for (int i = 0; i < ubo_state->num_enabled; i++) { 615 if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo || 616 ubo_state->range[i].ubo.bindless) { 617 continue; 618 } 619 620 uint32_t start = ubo_state->range[i].start; 621 uint32_t end = ubo_state->range[i].end; 622 uint32_t size = MIN2(end - start, 623 (16 * xs->constlen) - ubo_state->range[i].offset); 624 625 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); 626 tu_cs_emit(cs, 627 CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) | 628 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 629 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 630 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 631 CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); 632 tu_cs_emit_qw(cs, iova + start); 633 } 634 } 635 636 /* emit FS driver param */ 637 if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) { 638 uint32_t base = const_state->offsets.driver_param; 639 int32_t size = DIV_ROUND_UP(const_state->num_driver_params, 4); 640 size = MAX2(MIN2(size + base, xs->constlen) - base, 0); 641 642 if (size > 0) { 643 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4); 644 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | 645 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 646 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 647 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | 648 CP_LOAD_STATE6_0_NUM_UNIT(size)); 649 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 650 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 651 652 assert(size == 1); 653 tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64); 654 tu_cs_emit(cs, 0); 655 tu_cs_emit(cs, 0); 656 tu_cs_emit(cs, 0); 657 } 658 } 659} 660 661static void 662tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable) 663{ 664 /* Enable/disable shared constants */ 665 tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable)); 666 tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true, 667 .isammode = ISAMMODE_GL, 668 .shared_consts_enable = enable)); 669} 670 671static void 672tu6_emit_cs_config(struct tu_cs *cs, 673 const struct ir3_shader_variant *v, 674 const struct tu_pvtmem_config *pvtmem, 675 uint64_t binary_iova) 676{ 677 bool shared_consts_enable = ir3_const_state(v)->shared_consts_enable; 678 tu6_emit_shared_consts_enable(cs, shared_consts_enable); 679 680 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( 681 .cs_state = true, 682 .cs_ibo = true, 683 .cs_shared_const = shared_consts_enable)); 684 685 tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v); 686 tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova); 687 688 uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); 689 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); 690 tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | 691 A6XX_SP_CS_UNKNOWN_A9B1_UNK6); 692 693 if (cs->device->physical_device->info->a6xx.has_lpac) { 694 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1); 695 tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) | 696 A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6); 697 } 698 699 uint32_t local_invocation_id = 700 ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); 701 uint32_t work_group_id = 702 ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID); 703 704 enum a6xx_threadsize thrsz = v->info.double_threadsize ? 
THREAD128 : THREAD64; 705 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2); 706 tu_cs_emit(cs, 707 A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | 708 A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | 709 A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | 710 A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); 711 tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | 712 A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz)); 713 714 if (cs->device->physical_device->info->a6xx.has_lpac) { 715 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2); 716 tu_cs_emit(cs, 717 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) | 718 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | 719 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | 720 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); 721 tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | 722 A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); 723 } 724} 725 726static void 727tu6_emit_vs_system_values(struct tu_cs *cs, 728 const struct ir3_shader_variant *vs, 729 const struct ir3_shader_variant *hs, 730 const struct ir3_shader_variant *ds, 731 const struct ir3_shader_variant *gs, 732 bool primid_passthru) 733{ 734 const uint32_t vertexid_regid = 735 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); 736 const uint32_t instanceid_regid = 737 ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); 738 const uint32_t tess_coord_x_regid = hs ? 739 ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) : 740 regid(63, 0); 741 const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ? 742 tess_coord_x_regid + 1 : 743 regid(63, 0); 744 const uint32_t hs_rel_patch_regid = hs ? 745 ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) : 746 regid(63, 0); 747 const uint32_t ds_rel_patch_regid = hs ? 748 ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) : 749 regid(63, 0); 750 const uint32_t hs_invocation_regid = hs ? 751 ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) : 752 regid(63, 0); 753 const uint32_t gs_primitiveid_regid = gs ? 754 ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) : 755 regid(63, 0); 756 const uint32_t vs_primitiveid_regid = hs ? 757 ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) : 758 gs_primitiveid_regid; 759 const uint32_t ds_primitiveid_regid = ds ? 760 ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) : 761 regid(63, 0); 762 const uint32_t gsheader_regid = gs ? 763 ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) : 764 regid(63, 0); 765 766 /* Note: we currently don't support multiview with tess or GS. If we did, 767 * and the HW actually works, then we'd have to somehow share this across 768 * stages. Note that the blob doesn't support this either. 
769 */ 770 const uint32_t viewid_regid = 771 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX); 772 773 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6); 774 tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) | 775 A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) | 776 A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) | 777 A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid)); 778 tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) | 779 A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); 780 tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) | 781 A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | 782 A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | 783 A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid)); 784 tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */ 785 tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) | 786 0xfc00); /* VFD_CONTROL_5 */ 787 tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */ 788} 789 790static void 791tu6_setup_streamout(struct tu_cs *cs, 792 const struct ir3_shader_variant *v, 793 struct ir3_shader_linkage *l) 794{ 795 const struct ir3_stream_output_info *info = &v->stream_output; 796 /* Note: 64 here comes from the HW layout of the program RAM. The program 797 * for stream N is at DWORD 64 * N. 798 */ 799#define A6XX_SO_PROG_DWORDS 64 800 uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {}; 801 BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0}; 802 803 /* TODO: streamout state should be in a non-GMEM draw state */ 804 805 /* no streamout: */ 806 if (info->num_outputs == 0) { 807 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4); 808 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); 809 tu_cs_emit(cs, 0); 810 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL); 811 tu_cs_emit(cs, 0); 812 return; 813 } 814 815 for (unsigned i = 0; i < info->num_outputs; i++) { 816 const struct ir3_stream_output *out = &info->output[i]; 817 unsigned k = out->register_index; 818 unsigned idx; 819 820 /* Skip it, if it's an output that was never assigned a register. */ 821 if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG) 822 continue; 823 824 /* linkage map sorted by order frag shader wants things, so 825 * a bit less ideal here.. 
826 */ 827 for (idx = 0; idx < l->cnt; idx++) 828 if (l->var[idx].slot == v->outputs[k].slot) 829 break; 830 831 assert(idx < l->cnt); 832 833 for (unsigned j = 0; j < out->num_components; j++) { 834 unsigned c = j + out->start_component; 835 unsigned loc = l->var[idx].loc + c; 836 unsigned off = j + out->dst_offset; /* in dwords */ 837 838 assert(loc < A6XX_SO_PROG_DWORDS * 2); 839 unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2; 840 if (loc & 1) { 841 prog[dword] |= A6XX_VPC_SO_PROG_B_EN | 842 A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | 843 A6XX_VPC_SO_PROG_B_OFF(off * 4); 844 } else { 845 prog[dword] |= A6XX_VPC_SO_PROG_A_EN | 846 A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | 847 A6XX_VPC_SO_PROG_A_OFF(off * 4); 848 } 849 BITSET_SET(valid_dwords, dword); 850 } 851 } 852 853 unsigned prog_count = 0; 854 unsigned start, end; 855 BITSET_FOREACH_RANGE(start, end, valid_dwords, 856 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { 857 prog_count += end - start + 1; 858 } 859 860 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count); 861 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL); 862 tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) | 863 COND(info->stride[0] > 0, 864 A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) | 865 COND(info->stride[1] > 0, 866 A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) | 867 COND(info->stride[2] > 0, 868 A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) | 869 COND(info->stride[3] > 0, 870 A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3]))); 871 for (uint32_t i = 0; i < 4; i++) { 872 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i)); 873 tu_cs_emit(cs, info->stride[i]); 874 } 875 bool first = true; 876 BITSET_FOREACH_RANGE(start, end, valid_dwords, 877 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { 878 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); 879 tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) | 880 A6XX_VPC_SO_CNTL_ADDR(start)); 881 for (unsigned i = start; i < end; i++) { 882 tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG); 883 tu_cs_emit(cs, prog[i]); 884 } 885 first = false; 886 } 887} 888 889static void 890tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base, 891 enum a6xx_state_block block, uint32_t offset, 892 uint32_t size, const uint32_t *dwords) { 893 assert(size % 4 == 0); 894 895 tu_cs_emit_pkt7(cs, opcode, 3 + size); 896 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | 897 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | 898 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | 899 CP_LOAD_STATE6_0_STATE_BLOCK(block) | 900 CP_LOAD_STATE6_0_NUM_UNIT(size / 4)); 901 902 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 903 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 904 dwords = (uint32_t *)&((uint8_t *)dwords)[offset]; 905 906 tu_cs_emit_array(cs, dwords, size); 907} 908 909static void 910tu6_emit_link_map(struct tu_cs *cs, 911 const struct ir3_shader_variant *producer, 912 const struct ir3_shader_variant *consumer, 913 enum a6xx_state_block sb) 914{ 915 const struct ir3_const_state *const_state = ir3_const_state(consumer); 916 uint32_t base = const_state->offsets.primitive_map; 917 int size = DIV_ROUND_UP(consumer->input_size, 4); 918 919 size = (MIN2(size + base, consumer->constlen) - base) * 4; 920 if (size <= 0) 921 return; 922 923 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size, 924 producer->output_loc); 925} 926 927static uint16_t 928primitive_to_tess(enum shader_prim primitive) { 929 switch (primitive) { 930 case SHADER_PRIM_POINTS: 
931 return TESS_POINTS; 932 case SHADER_PRIM_LINE_STRIP: 933 return TESS_LINES; 934 case SHADER_PRIM_TRIANGLE_STRIP: 935 return TESS_CW_TRIS; 936 default: 937 unreachable(""); 938 } 939} 940 941void 942tu6_emit_vpc(struct tu_cs *cs, 943 const struct ir3_shader_variant *vs, 944 const struct ir3_shader_variant *hs, 945 const struct ir3_shader_variant *ds, 946 const struct ir3_shader_variant *gs, 947 const struct ir3_shader_variant *fs, 948 uint32_t patch_control_points) 949{ 950 /* note: doesn't compile as static because of the array regs.. */ 951 const struct reg_config { 952 uint16_t reg_sp_xs_out_reg; 953 uint16_t reg_sp_xs_vpc_dst_reg; 954 uint16_t reg_vpc_xs_pack; 955 uint16_t reg_vpc_xs_clip_cntl; 956 uint16_t reg_gras_xs_cl_cntl; 957 uint16_t reg_pc_xs_out_cntl; 958 uint16_t reg_sp_xs_primitive_cntl; 959 uint16_t reg_vpc_xs_layer_cntl; 960 uint16_t reg_gras_xs_layer_cntl; 961 } reg_config[] = { 962 [MESA_SHADER_VERTEX] = { 963 REG_A6XX_SP_VS_OUT_REG(0), 964 REG_A6XX_SP_VS_VPC_DST_REG(0), 965 REG_A6XX_VPC_VS_PACK, 966 REG_A6XX_VPC_VS_CLIP_CNTL, 967 REG_A6XX_GRAS_VS_CL_CNTL, 968 REG_A6XX_PC_VS_OUT_CNTL, 969 REG_A6XX_SP_VS_PRIMITIVE_CNTL, 970 REG_A6XX_VPC_VS_LAYER_CNTL, 971 REG_A6XX_GRAS_VS_LAYER_CNTL 972 }, 973 [MESA_SHADER_TESS_CTRL] = { 974 0, 975 0, 976 0, 977 0, 978 0, 979 REG_A6XX_PC_HS_OUT_CNTL, 980 0, 981 0, 982 0 983 }, 984 [MESA_SHADER_TESS_EVAL] = { 985 REG_A6XX_SP_DS_OUT_REG(0), 986 REG_A6XX_SP_DS_VPC_DST_REG(0), 987 REG_A6XX_VPC_DS_PACK, 988 REG_A6XX_VPC_DS_CLIP_CNTL, 989 REG_A6XX_GRAS_DS_CL_CNTL, 990 REG_A6XX_PC_DS_OUT_CNTL, 991 REG_A6XX_SP_DS_PRIMITIVE_CNTL, 992 REG_A6XX_VPC_DS_LAYER_CNTL, 993 REG_A6XX_GRAS_DS_LAYER_CNTL 994 }, 995 [MESA_SHADER_GEOMETRY] = { 996 REG_A6XX_SP_GS_OUT_REG(0), 997 REG_A6XX_SP_GS_VPC_DST_REG(0), 998 REG_A6XX_VPC_GS_PACK, 999 REG_A6XX_VPC_GS_CLIP_CNTL, 1000 REG_A6XX_GRAS_GS_CL_CNTL, 1001 REG_A6XX_PC_GS_OUT_CNTL, 1002 REG_A6XX_SP_GS_PRIMITIVE_CNTL, 1003 REG_A6XX_VPC_GS_LAYER_CNTL, 1004 REG_A6XX_GRAS_GS_LAYER_CNTL 1005 }, 1006 }; 1007 1008 const struct ir3_shader_variant *last_shader; 1009 if (gs) { 1010 last_shader = gs; 1011 } else if (hs) { 1012 last_shader = ds; 1013 } else { 1014 last_shader = vs; 1015 } 1016 1017 const struct reg_config *cfg = ®_config[last_shader->type]; 1018 1019 struct ir3_shader_linkage linkage = { 1020 .primid_loc = 0xff, 1021 .clip0_loc = 0xff, 1022 .clip1_loc = 0xff, 1023 }; 1024 if (fs) 1025 ir3_link_shaders(&linkage, last_shader, fs, true); 1026 1027 if (last_shader->stream_output.num_outputs) 1028 ir3_link_stream_out(&linkage, last_shader); 1029 1030 /* We do this after linking shaders in order to know whether PrimID 1031 * passthrough needs to be enabled. 
1032 */ 1033 bool primid_passthru = linkage.primid_loc != 0xff; 1034 tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru); 1035 1036 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4); 1037 tu_cs_emit(cs, ~linkage.varmask[0]); 1038 tu_cs_emit(cs, ~linkage.varmask[1]); 1039 tu_cs_emit(cs, ~linkage.varmask[2]); 1040 tu_cs_emit(cs, ~linkage.varmask[3]); 1041 1042 /* a6xx finds position/pointsize at the end */ 1043 const uint32_t pointsize_regid = 1044 ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ); 1045 const uint32_t layer_regid = 1046 ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER); 1047 const uint32_t view_regid = 1048 ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT); 1049 const uint32_t clip0_regid = 1050 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0); 1051 const uint32_t clip1_regid = 1052 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1); 1053 uint32_t flags_regid = gs ? 1054 ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0; 1055 1056 uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff; 1057 1058 if (layer_regid != regid(63, 0)) { 1059 layer_loc = linkage.max_loc; 1060 ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc); 1061 } 1062 1063 if (view_regid != regid(63, 0)) { 1064 view_loc = linkage.max_loc; 1065 ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc); 1066 } 1067 1068 unsigned extra_pos = 0; 1069 1070 for (unsigned i = 0; i < last_shader->outputs_count; i++) { 1071 if (last_shader->outputs[i].slot != VARYING_SLOT_POS) 1072 continue; 1073 1074 if (position_loc == 0xff) 1075 position_loc = linkage.max_loc; 1076 1077 ir3_link_add(&linkage, last_shader->outputs[i].slot, 1078 last_shader->outputs[i].regid, 1079 0xf, position_loc + 4 * last_shader->outputs[i].view); 1080 extra_pos = MAX2(extra_pos, last_shader->outputs[i].view); 1081 } 1082 1083 if (pointsize_regid != regid(63, 0)) { 1084 pointsize_loc = linkage.max_loc; 1085 ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc); 1086 } 1087 1088 uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask; 1089 1090 /* Handle the case where clip/cull distances aren't read by the FS */ 1091 uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc; 1092 if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) { 1093 clip0_loc = linkage.max_loc; 1094 ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid, 1095 clip_cull_mask & 0xf, linkage.max_loc); 1096 } 1097 if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) { 1098 clip1_loc = linkage.max_loc; 1099 ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid, 1100 clip_cull_mask >> 4, linkage.max_loc); 1101 } 1102 1103 tu6_setup_streamout(cs, last_shader, &linkage); 1104 1105 /* The GPU hangs on some models when there are no outputs (xs_pack::CNT), 1106 * at least when a DS is the last stage, so add a dummy output to keep it 1107 * happy if there aren't any. We do this late in order to avoid emitting 1108 * any unused code and make sure that optimizations don't remove it. 
1109 */ 1110 if (linkage.cnt == 0) 1111 ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc); 1112 1113 /* map outputs of the last shader to VPC */ 1114 assert(linkage.cnt <= 32); 1115 const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2); 1116 const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4); 1117 uint32_t sp_out[16] = {0}; 1118 uint32_t sp_vpc_dst[8] = {0}; 1119 for (uint32_t i = 0; i < linkage.cnt; i++) { 1120 ((uint16_t *) sp_out)[i] = 1121 A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) | 1122 A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask); 1123 ((uint8_t *) sp_vpc_dst)[i] = 1124 A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc); 1125 } 1126 1127 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count); 1128 tu_cs_emit_array(cs, sp_out, sp_out_count); 1129 1130 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count); 1131 tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count); 1132 1133 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1); 1134 tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) | 1135 A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) | 1136 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) | 1137 A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos)); 1138 1139 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1); 1140 tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | 1141 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | 1142 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); 1143 1144 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1); 1145 tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) | 1146 A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask)); 1147 1148 const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs }; 1149 1150 for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) { 1151 const struct ir3_shader_variant *shader = geom_shaders[i]; 1152 if (!shader) 1153 continue; 1154 1155 bool primid = shader->type != MESA_SHADER_VERTEX && 1156 VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID)); 1157 1158 tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1); 1159 if (shader == last_shader) { 1160 tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) | 1161 CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | 1162 CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | 1163 CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) | 1164 COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) | 1165 A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); 1166 } else { 1167 tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID)); 1168 } 1169 } 1170 1171 /* if vertex_flags somehow gets optimized out, your gonna have a bad time: */ 1172 if (gs) 1173 assert(flags_regid != INVALID_REG); 1174 1175 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1); 1176 tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) | 1177 A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); 1178 1179 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1); 1180 tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | 1181 A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc)); 1182 1183 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1); 1184 tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) | 1185 CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW)); 1186 1187 tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru)); 1188 1189 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1); 1190 tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? 
fs->total_in : 0) | 1191 COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) | 1192 A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) | 1193 A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc)); 1194 1195 if (hs) { 1196 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1); 1197 tu_cs_emit(cs, hs->tess.tcs_vertices_out); 1198 1199 /* Total attribute slots in HS incoming patch. */ 1200 tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1); 1201 tu_cs_emit(cs, patch_control_points * vs->output_size / 4); 1202 1203 const uint32_t wavesize = 64; 1204 const uint32_t max_wave_input_size = 64; 1205 1206 /* note: if HS is really just the VS extended, then this 1207 * should be by MAX2(patch_control_points, hs->tess.tcs_vertices_out) 1208 * however that doesn't match the blob, and fails some dEQP tests. 1209 */ 1210 uint32_t prims_per_wave = wavesize / hs->tess.tcs_vertices_out; 1211 uint32_t max_prims_per_wave = 1212 max_wave_input_size * wavesize / (vs->output_size * patch_control_points); 1213 prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); 1214 1215 uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave; 1216 uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); 1217 1218 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 1219 tu_cs_emit(cs, wave_input_size); 1220 1221 /* In SPIR-V generated from GLSL, the tessellation primitive params are 1222 * are specified in the tess eval shader, but in SPIR-V generated from 1223 * HLSL, they are specified in the tess control shader. */ 1224 const struct ir3_shader_variant *tess = 1225 ds->tess.spacing == TESS_SPACING_UNSPECIFIED ? hs : ds; 1226 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1); 1227 uint32_t output; 1228 if (tess->tess.point_mode) 1229 output = TESS_POINTS; 1230 else if (tess->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) 1231 output = TESS_LINES; 1232 else if (tess->tess.ccw) 1233 output = TESS_CCW_TRIS; 1234 else 1235 output = TESS_CW_TRIS; 1236 1237 enum a6xx_tess_spacing spacing; 1238 switch (tess->tess.spacing) { 1239 case TESS_SPACING_EQUAL: 1240 spacing = TESS_EQUAL; 1241 break; 1242 case TESS_SPACING_FRACTIONAL_ODD: 1243 spacing = TESS_FRACTIONAL_ODD; 1244 break; 1245 case TESS_SPACING_FRACTIONAL_EVEN: 1246 spacing = TESS_FRACTIONAL_EVEN; 1247 break; 1248 case TESS_SPACING_UNSPECIFIED: 1249 default: 1250 unreachable("invalid tess spacing"); 1251 } 1252 tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) | 1253 A6XX_PC_TESS_CNTL_OUTPUT(output)); 1254 1255 tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER); 1256 tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER); 1257 } 1258 1259 1260 if (gs) { 1261 uint32_t vertices_out, invocations, output, vec4_size; 1262 uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size; 1263 1264 if (hs) { 1265 tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER); 1266 } else { 1267 tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER); 1268 } 1269 vertices_out = gs->gs.vertices_out - 1; 1270 output = primitive_to_tess(gs->gs.output_primitive); 1271 invocations = gs->gs.invocations - 1; 1272 /* Size of per-primitive alloction in ldlw memory in vec4s. 
*/ 1273 vec4_size = gs->gs.vertices_in * 1274 DIV_ROUND_UP(prev_stage_output_size, 4); 1275 1276 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); 1277 tu_cs_emit(cs, 1278 A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) | 1279 A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | 1280 A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations)); 1281 1282 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1); 1283 tu_cs_emit(cs, 0xff); 1284 1285 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); 1286 tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); 1287 1288 uint32_t prim_size = prev_stage_output_size; 1289 if (prim_size > 64) 1290 prim_size = 64; 1291 else if (prim_size == 64) 1292 prim_size = 63; 1293 tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1); 1294 tu_cs_emit(cs, prim_size); 1295 } 1296} 1297 1298static int 1299tu6_vpc_varying_mode(const struct ir3_shader_variant *fs, 1300 uint32_t index, 1301 uint8_t *interp_mode, 1302 uint8_t *ps_repl_mode) 1303{ 1304 enum 1305 { 1306 INTERP_SMOOTH = 0, 1307 INTERP_FLAT = 1, 1308 INTERP_ZERO = 2, 1309 INTERP_ONE = 3, 1310 }; 1311 enum 1312 { 1313 PS_REPL_NONE = 0, 1314 PS_REPL_S = 1, 1315 PS_REPL_T = 2, 1316 PS_REPL_ONE_MINUS_T = 3, 1317 }; 1318 1319 const uint32_t compmask = fs->inputs[index].compmask; 1320 1321 /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and 1322 * fourth component occupy three consecutive varying slots 1323 */ 1324 int shift = 0; 1325 *interp_mode = 0; 1326 *ps_repl_mode = 0; 1327 if (fs->inputs[index].slot == VARYING_SLOT_PNTC) { 1328 if (compmask & 0x1) { 1329 *ps_repl_mode |= PS_REPL_S << shift; 1330 shift += 2; 1331 } 1332 if (compmask & 0x2) { 1333 *ps_repl_mode |= PS_REPL_T << shift; 1334 shift += 2; 1335 } 1336 if (compmask & 0x4) { 1337 *interp_mode |= INTERP_ZERO << shift; 1338 shift += 2; 1339 } 1340 if (compmask & 0x8) { 1341 *interp_mode |= INTERP_ONE << 6; 1342 shift += 2; 1343 } 1344 } else if (fs->inputs[index].flat) { 1345 for (int i = 0; i < 4; i++) { 1346 if (compmask & (1 << i)) { 1347 *interp_mode |= INTERP_FLAT << shift; 1348 shift += 2; 1349 } 1350 } 1351 } 1352 1353 return shift; 1354} 1355 1356static void 1357tu6_emit_vpc_varying_modes(struct tu_cs *cs, 1358 const struct ir3_shader_variant *fs) 1359{ 1360 uint32_t interp_modes[8] = { 0 }; 1361 uint32_t ps_repl_modes[8] = { 0 }; 1362 1363 if (fs) { 1364 for (int i = -1; 1365 (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) { 1366 1367 /* get the mode for input i */ 1368 uint8_t interp_mode; 1369 uint8_t ps_repl_mode; 1370 const int bits = 1371 tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode); 1372 1373 /* OR the mode into the array */ 1374 const uint32_t inloc = fs->inputs[i].inloc * 2; 1375 uint32_t n = inloc / 32; 1376 uint32_t shift = inloc % 32; 1377 interp_modes[n] |= interp_mode << shift; 1378 ps_repl_modes[n] |= ps_repl_mode << shift; 1379 if (shift + bits > 32) { 1380 n++; 1381 shift = 32 - shift; 1382 1383 interp_modes[n] |= interp_mode >> shift; 1384 ps_repl_modes[n] |= ps_repl_mode >> shift; 1385 } 1386 } 1387 } 1388 1389 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); 1390 tu_cs_emit_array(cs, interp_modes, 8); 1391 1392 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); 1393 tu_cs_emit_array(cs, ps_repl_modes, 8); 1394} 1395 1396void 1397tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) 1398{ 1399 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; 1400 uint32_t ij_regid[IJ_COUNT]; 1401 uint32_t smask_in_regid; 1402 1403 bool 
sample_shading = fs->per_samp | fs->key.sample_shading; 1404 bool enable_varyings = fs->total_in > 0; 1405 1406 samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); 1407 smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); 1408 face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); 1409 coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); 1410 zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0); 1411 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) 1412 ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); 1413 1414 if (fs->num_sampler_prefetch > 0) { 1415 assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); 1416 /* also, it seems like ij_pix is *required* to be r0.x */ 1417 assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); 1418 } 1419 1420 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); 1421 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | 1422 A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | 1423 0x7000); // XXX); 1424 for (int i = 0; i < fs->num_sampler_prefetch; i++) { 1425 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; 1426 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | 1427 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | 1428 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | 1429 A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | 1430 A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | 1431 COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | 1432 A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); 1433 } 1434 1435 if (fs->num_sampler_prefetch > 0) { 1436 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch); 1437 for (int i = 0; i < fs->num_sampler_prefetch; i++) { 1438 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; 1439 tu_cs_emit(cs, 1440 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) | 1441 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id)); 1442 } 1443 } 1444 1445 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); 1446 tu_cs_emit(cs, 0x7); 1447 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | 1448 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | 1449 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | 1450 A6XX_HLSQ_CONTROL_2_REG_CENTERRHW(ij_regid[IJ_PERSP_CENTER_RHW])); 1451 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | 1452 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | 1453 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) | 1454 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID])); 1455 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | 1456 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | 1457 A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | 1458 A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); 1459 tu_cs_emit(cs, 0xfcfc); 1460 1461 enum a6xx_threadsize thrsz = fs->info.double_threadsize ? 
THREAD128 : THREAD64; 1462 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1); 1463 tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) | 1464 COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); 1465 1466 bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; 1467 bool need_size_persamp = false; 1468 if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) { 1469 if (sample_shading) 1470 need_size_persamp = true; 1471 else 1472 need_size = true; 1473 } 1474 1475 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1); 1476 tu_cs_emit(cs, 1477 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | 1478 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | 1479 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | 1480 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 1481 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | 1482 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 1483 COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 1484 COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 1485 COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); 1486 1487 tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2); 1488 tu_cs_emit(cs, 1489 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | 1490 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | 1491 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | 1492 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 1493 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | 1494 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 1495 COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 1496 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | 1497 COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 1498 COND(fs->fragcoord_compmask != 0, 1499 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); 1500 tu_cs_emit(cs, 1501 A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE( 1502 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) | 1503 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | 1504 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | 1505 CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) | 1506 COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); 1507 1508 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1); 1509 tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); 1510 1511 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1); 1512 tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) | 1513 A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE( 1514 sample_shading ? 
FRAGCOORD_SAMPLE : FRAGCOORD_CENTER)); 1515 1516 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1); 1517 tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); 1518} 1519 1520static void 1521tu6_emit_fs_outputs(struct tu_cs *cs, 1522 const struct ir3_shader_variant *fs, 1523 uint32_t mrt_count, bool dual_src_blend, 1524 uint32_t render_components, 1525 bool no_earlyz, 1526 struct tu_pipeline *pipeline) 1527{ 1528 uint32_t smask_regid, posz_regid, stencilref_regid; 1529 1530 posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); 1531 smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); 1532 stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); 1533 1534 uint32_t fragdata_regid[8]; 1535 if (fs->color0_mrt) { 1536 fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR); 1537 for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++) 1538 fragdata_regid[i] = fragdata_regid[0]; 1539 } else { 1540 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) 1541 fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i); 1542 } 1543 1544 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); 1545 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | 1546 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | 1547 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | 1548 COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); 1549 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count)); 1550 1551 uint32_t fs_render_components = 0; 1552 1553 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); 1554 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) { 1555 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) | 1556 (COND(fragdata_regid[i] & HALF_REG_ID, 1557 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION))); 1558 1559 if (VALIDREG(fragdata_regid[i])) { 1560 fs_render_components |= 0xf << (i * 4); 1561 } 1562 } 1563 1564 /* dual source blending has an extra fs output in the 2nd slot */ 1565 if (dual_src_blend) { 1566 fs_render_components |= 0xf << 4; 1567 } 1568 1569 /* There is no point in having component enabled which is not written 1570 * by the shader. Per VK spec it is an UB, however a few apps depend on 1571 * attachment not being changed if FS doesn't have corresponding output. 
1572 */ 1573 fs_render_components &= render_components; 1574 1575 tu_cs_emit_regs(cs, 1576 A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components)); 1577 1578 tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2); 1579 tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) | 1580 COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) | 1581 COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) | 1582 COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); 1583 tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count)); 1584 1585 tu_cs_emit_regs(cs, 1586 A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components)); 1587 1588 if (pipeline) { 1589 pipeline->lrz.fs_has_kill = fs->has_kill; 1590 pipeline->lrz.early_fragment_tests = fs->fs.early_fragment_tests; 1591 1592 if (!fs->fs.early_fragment_tests && 1593 (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) { 1594 pipeline->lrz.force_late_z = true; 1595 } 1596 } 1597} 1598 1599static void 1600tu6_emit_geom_tess_consts(struct tu_cs *cs, 1601 const struct ir3_shader_variant *vs, 1602 const struct ir3_shader_variant *hs, 1603 const struct ir3_shader_variant *ds, 1604 const struct ir3_shader_variant *gs, 1605 uint32_t cps_per_patch) 1606{ 1607 struct tu_device *dev = cs->device; 1608 1609 uint32_t num_vertices = 1610 hs ? cps_per_patch : gs->gs.vertices_in; 1611 1612 uint32_t vs_params[4] = { 1613 vs->output_size * num_vertices * 4, /* vs primitive stride */ 1614 vs->output_size * 4, /* vs vertex stride */ 1615 0, 1616 0, 1617 }; 1618 uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param; 1619 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0, 1620 ARRAY_SIZE(vs_params), vs_params); 1621 1622 if (hs) { 1623 assert(ds->type != MESA_SHADER_NONE); 1624 1625 /* Create the shared tess factor BO the first time tess is used on the device. 
*/ 1626 mtx_lock(&dev->mutex); 1627 if (!dev->tess_bo) 1628 tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS); 1629 mtx_unlock(&dev->mutex); 1630 1631 uint64_t tess_factor_iova = dev->tess_bo->iova; 1632 uint64_t tess_param_iova = tess_factor_iova + TU_TESS_FACTOR_SIZE; 1633 1634 uint32_t hs_params[8] = { 1635 vs->output_size * num_vertices * 4, /* hs primitive stride */ 1636 vs->output_size * 4, /* hs vertex stride */ 1637 hs->output_size, 1638 cps_per_patch, 1639 tess_param_iova, 1640 tess_param_iova >> 32, 1641 tess_factor_iova, 1642 tess_factor_iova >> 32, 1643 }; 1644 1645 uint32_t hs_base = hs->const_state->offsets.primitive_param; 1646 uint32_t hs_param_dwords = MIN2((hs->constlen - hs_base) * 4, ARRAY_SIZE(hs_params)); 1647 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0, 1648 hs_param_dwords, hs_params); 1649 if (gs) 1650 num_vertices = gs->gs.vertices_in; 1651 1652 uint32_t ds_params[8] = { 1653 ds->output_size * num_vertices * 4, /* ds primitive stride */ 1654 ds->output_size * 4, /* ds vertex stride */ 1655 hs->output_size, /* hs vertex stride (dwords) */ 1656 hs->tess.tcs_vertices_out, 1657 tess_param_iova, 1658 tess_param_iova >> 32, 1659 tess_factor_iova, 1660 tess_factor_iova >> 32, 1661 }; 1662 1663 uint32_t ds_base = ds->const_state->offsets.primitive_param; 1664 uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params)); 1665 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0, 1666 ds_param_dwords, ds_params); 1667 } 1668 1669 if (gs) { 1670 const struct ir3_shader_variant *prev = ds ? ds : vs; 1671 uint32_t gs_params[4] = { 1672 prev->output_size * num_vertices * 4, /* gs primitive stride */ 1673 prev->output_size * 4, /* gs vertex stride */ 1674 0, 1675 0, 1676 }; 1677 uint32_t gs_base = gs->const_state->offsets.primitive_param; 1678 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0, 1679 ARRAY_SIZE(gs_params), gs_params); 1680 } 1681} 1682 1683static void 1684tu6_emit_program_config(struct tu_cs *cs, 1685 struct tu_pipeline_builder *builder) 1686{ 1687 gl_shader_stage stage = MESA_SHADER_VERTEX; 1688 1689 STATIC_ASSERT(MESA_SHADER_VERTEX == 0); 1690 1691 bool shared_consts_enable = tu6_shared_constants_enable(builder->layout, 1692 builder->device->compiler); 1693 tu6_emit_shared_consts_enable(cs, shared_consts_enable); 1694 1695 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( 1696 .vs_state = true, 1697 .hs_state = true, 1698 .ds_state = true, 1699 .gs_state = true, 1700 .fs_state = true, 1701 .gfx_ibo = true, 1702 .gfx_shared_const = shared_consts_enable)); 1703 for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) { 1704 tu6_emit_xs_config(cs, stage, builder->shaders->variants[stage]); 1705 } 1706} 1707 1708static void 1709tu6_emit_program(struct tu_cs *cs, 1710 struct tu_pipeline_builder *builder, 1711 bool binning_pass, 1712 struct tu_pipeline *pipeline) 1713{ 1714 const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX]; 1715 const struct ir3_shader_variant *bs = builder->binning_variant; 1716 const struct ir3_shader_variant *hs = builder->shaders->variants[MESA_SHADER_TESS_CTRL]; 1717 const struct ir3_shader_variant *ds = builder->shaders->variants[MESA_SHADER_TESS_EVAL]; 1718 const struct ir3_shader_variant *gs = builder->shaders->variants[MESA_SHADER_GEOMETRY]; 1719 const struct ir3_shader_variant *fs = builder->shaders->variants[MESA_SHADER_FRAGMENT]; 1720 gl_shader_stage stage = MESA_SHADER_VERTEX; 1721 uint32_t cps_per_patch = 
builder->create_info->pTessellationState ? 1722 builder->create_info->pTessellationState->patchControlPoints : 0; 1723 bool multi_pos_output = builder->shaders->multi_pos_output; 1724 1725 /* Don't use the binning pass variant when GS is present because we don't 1726 * support compiling correct binning pass variants with GS. 1727 */ 1728 if (binning_pass && !gs) { 1729 vs = bs; 1730 tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova); 1731 stage++; 1732 } 1733 1734 for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) { 1735 const struct ir3_shader_variant *xs = builder->shaders->variants[stage]; 1736 1737 if (stage == MESA_SHADER_FRAGMENT && binning_pass) 1738 fs = xs = NULL; 1739 1740 tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]); 1741 } 1742 1743 uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1; 1744 uint32_t multiview_cntl = builder->multiview_mask ? 1745 A6XX_PC_MULTIVIEW_CNTL_ENABLE | 1746 A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) | 1747 COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS) 1748 : 0; 1749 1750 /* Copy what the blob does here. This will emit an extra 0x3f 1751 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what 1752 * this is working around yet. 1753 */ 1754 if (builder->device->physical_device->info->a6xx.has_cp_reg_write) { 1755 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3); 1756 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE)); 1757 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL); 1758 } else { 1759 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1); 1760 } 1761 tu_cs_emit(cs, multiview_cntl); 1762 1763 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1); 1764 tu_cs_emit(cs, multiview_cntl); 1765 1766 if (multiview_cntl && 1767 builder->device->physical_device->info->a6xx.supports_multiview_mask) { 1768 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1); 1769 tu_cs_emit(cs, builder->multiview_mask); 1770 } 1771 1772 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 1773 tu_cs_emit(cs, 0); 1774 1775 tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch); 1776 tu6_emit_vpc_varying_modes(cs, fs); 1777 1778 bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT; 1779 uint32_t mrt_count = builder->color_attachment_count; 1780 uint32_t render_components = builder->render_components; 1781 1782 if (builder->alpha_to_coverage) { 1783 /* alpha to coverage can behave like a discard */ 1784 no_earlyz = true; 1785 /* alpha value comes from first mrt */ 1786 render_components |= 0xf; 1787 if (!mrt_count) { 1788 mrt_count = 1; 1789 /* Disable memory write for dummy mrt because it doesn't get set otherwise */ 1790 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = 0)); 1791 } 1792 } 1793 1794 if (fs) { 1795 tu6_emit_fs_inputs(cs, fs); 1796 tu6_emit_fs_outputs(cs, fs, mrt_count, 1797 builder->use_dual_src_blend, 1798 render_components, 1799 no_earlyz, 1800 pipeline); 1801 } else { 1802 /* TODO: check if these can be skipped if fs is disabled */ 1803 struct ir3_shader_variant dummy_variant = {}; 1804 tu6_emit_fs_inputs(cs, &dummy_variant); 1805 tu6_emit_fs_outputs(cs, &dummy_variant, mrt_count, 1806 builder->use_dual_src_blend, 1807 render_components, 1808 no_earlyz, 1809 NULL); 1810 } 1811 1812 if (gs || hs) { 1813 tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch); 1814 } 1815} 1816 1817#define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 5 + 4) 1818 1819static void 1820tu6_emit_vertex_input(struct tu_pipeline *pipeline, 1821 struct 
tu_draw_state *vi_state, 1822 const struct ir3_shader_variant *vs, 1823 const VkPipelineVertexInputStateCreateInfo *info) 1824{ 1825 uint32_t binding_instanced = 0; /* bitmask of instanced bindings */ 1826 uint32_t step_rate[MAX_VBS]; 1827 1828 struct tu_cs cs; 1829 tu_cs_begin_sub_stream(&pipeline->cs, 1830 TU6_EMIT_VERTEX_INPUT_MAX_DWORDS, &cs); 1831 1832 for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) { 1833 const VkVertexInputBindingDescription *binding = 1834 &info->pVertexBindingDescriptions[i]; 1835 1836 if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) { 1837 tu_cs_emit_regs(&cs, 1838 A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride)); 1839 } 1840 1841 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) 1842 binding_instanced |= 1 << binding->binding; 1843 1844 step_rate[binding->binding] = 1; 1845 } 1846 1847 const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state = 1848 vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT); 1849 if (div_state) { 1850 for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) { 1851 const VkVertexInputBindingDivisorDescriptionEXT *desc = 1852 &div_state->pVertexBindingDivisors[i]; 1853 step_rate[desc->binding] = desc->divisor; 1854 } 1855 } 1856 1857 int32_t input_for_attr[MAX_VERTEX_ATTRIBS]; 1858 uint32_t used_attrs_count = 0; 1859 1860 for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) { 1861 input_for_attr[attr_idx] = -1; 1862 for (uint32_t input_idx = 0; input_idx < vs->inputs_count; input_idx++) { 1863 if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) == 1864 info->pVertexAttributeDescriptions[attr_idx].location) { 1865 input_for_attr[attr_idx] = input_idx; 1866 used_attrs_count++; 1867 break; 1868 } 1869 } 1870 } 1871 1872 if (used_attrs_count) 1873 tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DECODE_INSTR(0), used_attrs_count * 2); 1874 1875 for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) { 1876 const VkVertexInputAttributeDescription *attr = 1877 &info->pVertexAttributeDescriptions[attr_idx]; 1878 1879 if (input_for_attr[attr_idx] == -1) 1880 continue; 1881 1882 const struct tu_native_format format = tu6_format_vtx(attr->format); 1883 tu_cs_emit(&cs, A6XX_VFD_DECODE_INSTR(0, 1884 .idx = attr->binding, 1885 .offset = attr->offset, 1886 .instanced = binding_instanced & (1 << attr->binding), 1887 .format = format.fmt, 1888 .swap = format.swap, 1889 .unk30 = 1, 1890 ._float = !vk_format_is_int(attr->format)).value); 1891 tu_cs_emit(&cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value); 1892 } 1893 1894 if (used_attrs_count) 1895 tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), used_attrs_count); 1896 1897 for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) { 1898 int32_t input_idx = input_for_attr[attr_idx]; 1899 if (input_idx == -1) 1900 continue; 1901 1902 tu_cs_emit(&cs, A6XX_VFD_DEST_CNTL_INSTR(0, 1903 .writemask = vs->inputs[input_idx].compmask, 1904 .regid = vs->inputs[input_idx].regid).value); 1905 } 1906 1907 tu_cs_emit_regs(&cs, 1908 A6XX_VFD_CONTROL_0( 1909 .fetch_cnt = used_attrs_count, /* decode_cnt for binning pass ? 
*/ 1910 .decode_cnt = used_attrs_count)); 1911 1912 *vi_state = tu_cs_end_draw_state(&pipeline->cs, &cs); 1913} 1914 1915void 1916tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport, 1917 bool z_negative_one_to_one) 1918{ 1919 VkExtent2D guardband = {511, 511}; 1920 1921 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6); 1922 for (uint32_t i = 0; i < num_viewport; i++) { 1923 const VkViewport *viewport = &viewports[i]; 1924 float offsets[3]; 1925 float scales[3]; 1926 scales[0] = viewport->width / 2.0f; 1927 scales[1] = viewport->height / 2.0f; 1928 if (z_negative_one_to_one) { 1929 scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth); 1930 } else { 1931 scales[2] = viewport->maxDepth - viewport->minDepth; 1932 } 1933 1934 offsets[0] = viewport->x + scales[0]; 1935 offsets[1] = viewport->y + scales[1]; 1936 if (z_negative_one_to_one) { 1937 offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth); 1938 } else { 1939 offsets[2] = viewport->minDepth; 1940 } 1941 1942 for (uint32_t j = 0; j < 3; j++) { 1943 tu_cs_emit(cs, fui(offsets[j])); 1944 tu_cs_emit(cs, fui(scales[j])); 1945 } 1946 1947 guardband.width = 1948 MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false)); 1949 guardband.height = 1950 MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false)); 1951 } 1952 1953 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2); 1954 for (uint32_t i = 0; i < num_viewport; i++) { 1955 const VkViewport *viewport = &viewports[i]; 1956 VkOffset2D min; 1957 VkOffset2D max; 1958 min.x = (int32_t) viewport->x; 1959 max.x = (int32_t) ceilf(viewport->x + viewport->width); 1960 if (viewport->height >= 0.0f) { 1961 min.y = (int32_t) viewport->y; 1962 max.y = (int32_t) ceilf(viewport->y + viewport->height); 1963 } else { 1964 min.y = (int32_t)(viewport->y + viewport->height); 1965 max.y = (int32_t) ceilf(viewport->y); 1966 } 1967 /* the spec allows viewport->height to be 0.0f */ 1968 if (min.y == max.y) 1969 max.y++; 1970 /* allow viewport->width = 0.0f for un-initialized viewports: */ 1971 if (min.x == max.x) 1972 max.x++; 1973 1974 min.x = MAX2(min.x, 0); 1975 min.y = MAX2(min.y, 0); 1976 max.x = MAX2(max.x, 1); 1977 max.y = MAX2(max.y, 1); 1978 1979 assert(min.x < max.x); 1980 assert(min.y < max.y); 1981 1982 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) | 1983 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y)); 1984 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) | 1985 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1)); 1986 } 1987 1988 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2); 1989 for (uint32_t i = 0; i < num_viewport; i++) { 1990 const VkViewport *viewport = &viewports[i]; 1991 tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth))); 1992 tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth))); 1993 } 1994 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1); 1995 tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) | 1996 A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height)); 1997 1998 /* TODO: what to do about this and multi viewport ? */ 1999 float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0; 2000 float z_clamp_max = num_viewport ? 
MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0; 2001 2002 tu_cs_emit_regs(cs, 2003 A6XX_RB_Z_CLAMP_MIN(z_clamp_min), 2004 A6XX_RB_Z_CLAMP_MAX(z_clamp_max)); 2005} 2006 2007void 2008tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count) 2009{ 2010 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2); 2011 2012 for (uint32_t i = 0; i < scissor_count; i++) { 2013 const VkRect2D *scissor = &scissors[i]; 2014 2015 uint32_t min_x = scissor->offset.x; 2016 uint32_t min_y = scissor->offset.y; 2017 uint32_t max_x = min_x + scissor->extent.width - 1; 2018 uint32_t max_y = min_y + scissor->extent.height - 1; 2019 2020 if (!scissor->extent.width || !scissor->extent.height) { 2021 min_x = min_y = 1; 2022 max_x = max_y = 0; 2023 } else { 2024 /* avoid overflow */ 2025 uint32_t scissor_max = BITFIELD_MASK(15); 2026 min_x = MIN2(scissor_max, min_x); 2027 min_y = MIN2(scissor_max, min_y); 2028 max_x = MIN2(scissor_max, max_x); 2029 max_y = MIN2(scissor_max, max_y); 2030 } 2031 2032 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) | 2033 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y)); 2034 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) | 2035 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y)); 2036 } 2037} 2038 2039void 2040tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc) 2041{ 2042 if (!samp_loc) { 2043 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1); 2044 tu_cs_emit(cs, 0); 2045 2046 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1); 2047 tu_cs_emit(cs, 0); 2048 2049 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1); 2050 tu_cs_emit(cs, 0); 2051 return; 2052 } 2053 2054 assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount); 2055 assert(samp_loc->sampleLocationGridSize.width == 1); 2056 assert(samp_loc->sampleLocationGridSize.height == 1); 2057 2058 uint32_t sample_config = 2059 A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE; 2060 uint32_t sample_locations = 0; 2061 for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) { 2062 sample_locations |= 2063 (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) | 2064 A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8; 2065 } 2066 2067 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2); 2068 tu_cs_emit(cs, sample_config); 2069 tu_cs_emit(cs, sample_locations); 2070 2071 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2); 2072 tu_cs_emit(cs, sample_config); 2073 tu_cs_emit(cs, sample_locations); 2074 2075 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2); 2076 tu_cs_emit(cs, sample_config); 2077 tu_cs_emit(cs, sample_locations); 2078} 2079 2080static uint32_t 2081tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info, 2082 enum a5xx_line_mode line_mode, 2083 bool multiview) 2084{ 2085 uint32_t gras_su_cntl = 0; 2086 2087 if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT) 2088 gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT; 2089 if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT) 2090 gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK; 2091 2092 if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE) 2093 gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW; 2094 2095 gras_su_cntl |= 2096 A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f); 2097 2098 if (rast_info->depthBiasEnable) 2099 gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET; 2100 2101 gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode); 2102 2103 if (multiview) { 2104 gras_su_cntl |= 2105 A6XX_GRAS_SU_CNTL_UNK17 | 2106 
A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE; 2107 } 2108 2109 return gras_su_cntl; 2110} 2111 2112void 2113tu6_emit_depth_bias(struct tu_cs *cs, 2114 float constant_factor, 2115 float clamp, 2116 float slope_factor) 2117{ 2118 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3); 2119 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value); 2120 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value); 2121 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value); 2122} 2123 2124static uint32_t 2125tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att, 2126 bool has_alpha) 2127{ 2128 const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp); 2129 const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor( 2130 has_alpha ? att->srcColorBlendFactor 2131 : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor)); 2132 const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor( 2133 has_alpha ? att->dstColorBlendFactor 2134 : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor)); 2135 const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp); 2136 const enum adreno_rb_blend_factor src_alpha_factor = 2137 tu6_blend_factor(att->srcAlphaBlendFactor); 2138 const enum adreno_rb_blend_factor dst_alpha_factor = 2139 tu6_blend_factor(att->dstAlphaBlendFactor); 2140 2141 return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) | 2142 A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) | 2143 A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) | 2144 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) | 2145 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) | 2146 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor); 2147} 2148 2149static uint32_t 2150tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att, 2151 uint32_t rb_mrt_control_rop, 2152 bool has_alpha) 2153{ 2154 uint32_t rb_mrt_control = 2155 A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask); 2156 2157 rb_mrt_control |= rb_mrt_control_rop; 2158 2159 if (att->blendEnable) { 2160 rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND; 2161 2162 if (has_alpha) 2163 rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2; 2164 } 2165 2166 return rb_mrt_control; 2167} 2168 2169uint32_t 2170tu6_rb_mrt_control_rop(VkLogicOp op, bool *rop_reads_dst) 2171{ 2172 *rop_reads_dst = tu_logic_op_reads_dst(op); 2173 return A6XX_RB_MRT_CONTROL_ROP_ENABLE | 2174 A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(op)); 2175} 2176 2177static void 2178tu6_emit_rb_mrt_controls(struct tu_pipeline *pipeline, 2179 const VkPipelineColorBlendStateCreateInfo *blend_info, 2180 const VkFormat attachment_formats[MAX_RTS], 2181 bool *rop_reads_dst, 2182 uint32_t *color_bandwidth_per_sample) 2183{ 2184 const VkPipelineColorWriteCreateInfoEXT *color_info = 2185 vk_find_struct_const(blend_info->pNext, 2186 PIPELINE_COLOR_WRITE_CREATE_INFO_EXT); 2187 2188 /* The static state is ignored if it's dynamic. In that case assume 2189 * everything is enabled and then the appropriate registers will be zero'd 2190 * dynamically. 
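 * Concretely, color_info is dropped just below when
 * TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE is dynamic, so every attachment with a
 * defined format gets its RB_MRT_CONTROL/RB_MRT_BLEND_CONTROL computed here.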
2191 */ 2192 if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) 2193 color_info = NULL; 2194 2195 *rop_reads_dst = false; 2196 *color_bandwidth_per_sample = 0; 2197 2198 uint32_t rb_mrt_control_rop = 0; 2199 if (blend_info->logicOpEnable) { 2200 pipeline->logic_op_enabled = true; 2201 rb_mrt_control_rop = tu6_rb_mrt_control_rop(blend_info->logicOp, 2202 rop_reads_dst); 2203 } 2204 2205 uint32_t total_bpp = 0; 2206 pipeline->num_rts = blend_info->attachmentCount; 2207 for (uint32_t i = 0; i < blend_info->attachmentCount; i++) { 2208 const VkPipelineColorBlendAttachmentState *att = 2209 &blend_info->pAttachments[i]; 2210 const VkFormat format = attachment_formats[i]; 2211 2212 uint32_t rb_mrt_control = 0; 2213 uint32_t rb_mrt_blend_control = 0; 2214 if (format != VK_FORMAT_UNDEFINED && 2215 (!color_info || color_info->pColorWriteEnables[i])) { 2216 const bool has_alpha = vk_format_has_alpha(format); 2217 2218 rb_mrt_control = 2219 tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha); 2220 rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha); 2221 2222 /* calculate bpp based on format and write mask */ 2223 uint32_t write_bpp = 0; 2224 if (att->colorWriteMask == 0xf) { 2225 write_bpp = vk_format_get_blocksizebits(format); 2226 } else { 2227 const enum pipe_format pipe_format = vk_format_to_pipe_format(format); 2228 for (uint32_t i = 0; i < 4; i++) { 2229 if (att->colorWriteMask & (1 << i)) { 2230 write_bpp += util_format_get_component_bits(pipe_format, 2231 UTIL_FORMAT_COLORSPACE_RGB, i); 2232 } 2233 } 2234 } 2235 total_bpp += write_bpp; 2236 2237 pipeline->color_write_enable |= BIT(i); 2238 if (att->blendEnable) 2239 pipeline->blend_enable |= BIT(i); 2240 2241 if (att->blendEnable || *rop_reads_dst) { 2242 total_bpp += write_bpp; 2243 } 2244 } 2245 2246 pipeline->rb_mrt_control[i] = rb_mrt_control & pipeline->rb_mrt_control_mask; 2247 pipeline->rb_mrt_blend_control[i] = rb_mrt_blend_control; 2248 } 2249 2250 *color_bandwidth_per_sample = total_bpp / 8; 2251} 2252 2253static void 2254tu6_emit_blend_control(struct tu_pipeline *pipeline, 2255 uint32_t blend_enable_mask, 2256 bool dual_src_blend, 2257 const VkPipelineMultisampleStateCreateInfo *msaa_info) 2258{ 2259 const uint32_t sample_mask = 2260 msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff) 2261 : ((1 << msaa_info->rasterizationSamples) - 1); 2262 2263 2264 pipeline->sp_blend_cntl = 2265 A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask, 2266 .dual_color_in_enable = dual_src_blend, 2267 .alpha_to_coverage = msaa_info->alphaToCoverageEnable, 2268 .unk8 = true).value & pipeline->sp_blend_cntl_mask; 2269 2270 /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? 
*/ 2271 pipeline->rb_blend_cntl = 2272 A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask, 2273 .independent_blend = true, 2274 .sample_mask = sample_mask, 2275 .dual_color_in_enable = dual_src_blend, 2276 .alpha_to_coverage = msaa_info->alphaToCoverageEnable, 2277 .alpha_to_one = msaa_info->alphaToOneEnable).value & 2278 pipeline->rb_blend_cntl_mask; 2279} 2280 2281static void 2282tu6_emit_blend(struct tu_cs *cs, 2283 struct tu_pipeline *pipeline) 2284{ 2285 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.dword = pipeline->sp_blend_cntl)); 2286 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.dword = pipeline->rb_blend_cntl)); 2287 2288 for (unsigned i = 0; i < pipeline->num_rts; i++) { 2289 tu_cs_emit_regs(cs, 2290 A6XX_RB_MRT_CONTROL(i, .dword = pipeline->rb_mrt_control[i]), 2291 A6XX_RB_MRT_BLEND_CONTROL(i, .dword = pipeline->rb_mrt_blend_control[i])); 2292 } 2293} 2294 2295static uint32_t 2296calc_pvtmem_size(struct tu_device *dev, struct tu_pvtmem_config *config, 2297 uint32_t pvtmem_bytes) 2298{ 2299 uint32_t per_fiber_size = ALIGN(pvtmem_bytes, 512); 2300 uint32_t per_sp_size = 2301 ALIGN(per_fiber_size * dev->physical_device->info->a6xx.fibers_per_sp, 1 << 12); 2302 2303 if (config) { 2304 config->per_fiber_size = per_fiber_size; 2305 config->per_sp_size = per_sp_size; 2306 } 2307 2308 return dev->physical_device->info->num_sp_cores * per_sp_size; 2309} 2310 2311static VkResult 2312tu_setup_pvtmem(struct tu_device *dev, 2313 struct tu_pipeline *pipeline, 2314 struct tu_pvtmem_config *config, 2315 uint32_t pvtmem_bytes, bool per_wave) 2316{ 2317 if (!pvtmem_bytes) { 2318 memset(config, 0, sizeof(*config)); 2319 return VK_SUCCESS; 2320 } 2321 2322 uint32_t total_size = calc_pvtmem_size(dev, config, pvtmem_bytes); 2323 config->per_wave = per_wave; 2324 2325 VkResult result = 2326 tu_bo_init_new(dev, &pipeline->pvtmem_bo, total_size, 2327 TU_BO_ALLOC_NO_FLAGS); 2328 if (result != VK_SUCCESS) 2329 return result; 2330 2331 config->iova = pipeline->pvtmem_bo->iova; 2332 2333 return result; 2334} 2335 2336 2337static VkResult 2338tu_pipeline_allocate_cs(struct tu_device *dev, 2339 struct tu_pipeline *pipeline, 2340 struct tu_pipeline_layout *layout, 2341 struct tu_pipeline_builder *builder, 2342 struct ir3_shader_variant *compute) 2343{ 2344 uint32_t size = 1024 + tu6_load_state_size(pipeline, layout, compute); 2345 2346 /* graphics case: */ 2347 if (builder) { 2348 size += 2 * TU6_EMIT_VERTEX_INPUT_MAX_DWORDS; 2349 2350 for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) { 2351 if (builder->shaders->variants[i]) { 2352 size += builder->shaders->variants[i]->info.size / 4; 2353 } 2354 } 2355 2356 size += builder->binning_variant->info.size / 4; 2357 2358 builder->additional_cs_reserve_size = 0; 2359 for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) { 2360 struct ir3_shader_variant *variant = builder->shaders->variants[i]; 2361 if (variant) { 2362 builder->additional_cs_reserve_size += 2363 tu_xs_get_additional_cs_size_dwords(variant); 2364 2365 if (variant->binning) { 2366 builder->additional_cs_reserve_size += 2367 tu_xs_get_additional_cs_size_dwords(variant->binning); 2368 } 2369 } 2370 } 2371 2372 /* The additional size is used twice, once per tu6_emit_program() call. */ 2373 size += builder->additional_cs_reserve_size * 2; 2374 } else { 2375 size += compute->info.size / 4; 2376 2377 size += tu_xs_get_additional_cs_size_dwords(compute); 2378 } 2379 2380 /* Allocate the space for the pipeline out of the device's RO suballocator. 
2381 * 2382 * Sub-allocating BOs saves memory and also kernel overhead in refcounting of 2383 * BOs at exec time. 2384 * 2385 * The pipeline cache would seem like a natural place to stick the 2386 * suballocator, except that it is not guaranteed to outlive the pipelines 2387 * created from it, so you can't store any long-lived state there, and you 2388 * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because 2389 * pipeline destroy isn't synchronized by the cache. 2390 */ 2391 pthread_mutex_lock(&dev->pipeline_mutex); 2392 VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc, 2393 size * 4, 128); 2394 pthread_mutex_unlock(&dev->pipeline_mutex); 2395 if (result != VK_SUCCESS) 2396 return result; 2397 2398 tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo); 2399 2400 return VK_SUCCESS; 2401} 2402 2403static void 2404tu_pipeline_shader_key_init(struct ir3_shader_key *key, 2405 const struct tu_pipeline *pipeline, 2406 const VkGraphicsPipelineCreateInfo *pipeline_info) 2407{ 2408 for (uint32_t i = 0; i < pipeline_info->stageCount; i++) { 2409 if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) { 2410 key->has_gs = true; 2411 break; 2412 } 2413 } 2414 2415 if (pipeline_info->pRasterizationState->rasterizerDiscardEnable && 2416 !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD))) 2417 return; 2418 2419 const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState; 2420 const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations = 2421 vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT); 2422 if (msaa_info->rasterizationSamples > 1 || 2423 /* also set msaa key when sample location is not the default 2424 * since this affects varying interpolation */ 2425 (sample_locations && sample_locations->sampleLocationsEnable)) { 2426 key->msaa = true; 2427 } 2428 2429 /* The 1.3.215 spec says: 2430 * 2431 * Sample shading can be used to specify a minimum number of unique 2432 * samples to process for each fragment. If sample shading is enabled, 2433 * an implementation must provide a minimum of 2434 * 2435 * max(ceil(minSampleShadingFactor * totalSamples), 1) 2436 * 2437 * unique associated data for each fragment, where 2438 * minSampleShadingFactor is the minimum fraction of sample shading. 2439 * 2440 * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING. 2441 * They both require unique associated data. 2442 * 2443 * There are discussions to change the definition, such that 2444 * sampleShadingEnable does not imply unique associated data. Before the 2445 * discussions are settled and before apps (i.e., ANGLE) are fixed to 2446 * follow the new and incompatible definition, we should stick to the 2447 * current definition. 2448 * 2449 * Note that ir3_shader_key::sample_shading is not actually used by ir3, 2450 * just checked in tu6_emit_fs_inputs. We will also copy the value to 2451 * tu_shader_key::force_sample_interp in a bit. 
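 *
 * For example, minSampleShading = 0.5 with 4x MSAA gives 0.5 * 4 = 2.0 > 1.0
 * in the check below, so sample_shading is set; 0.25 with 4x MSAA gives
 * exactly 1.0 and is not treated as sample shading.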
2452 */ 2453 if (msaa_info->sampleShadingEnable && 2454 (msaa_info->minSampleShading * msaa_info->rasterizationSamples) > 1.0f) 2455 key->sample_shading = true; 2456 2457 /* We set this after we compile to NIR because we need the prim mode */ 2458 key->tessellation = IR3_TESS_NONE; 2459} 2460 2461static uint32_t 2462tu6_get_tessmode(struct tu_shader* shader) 2463{ 2464 enum tess_primitive_mode primitive_mode = shader->ir3_shader->nir->info.tess._primitive_mode; 2465 switch (primitive_mode) { 2466 case TESS_PRIMITIVE_ISOLINES: 2467 return IR3_TESS_ISOLINES; 2468 case TESS_PRIMITIVE_TRIANGLES: 2469 return IR3_TESS_TRIANGLES; 2470 case TESS_PRIMITIVE_QUADS: 2471 return IR3_TESS_QUADS; 2472 case TESS_PRIMITIVE_UNSPECIFIED: 2473 return IR3_TESS_NONE; 2474 default: 2475 unreachable("bad tessmode"); 2476 } 2477} 2478 2479static uint64_t 2480tu_upload_variant(struct tu_pipeline *pipeline, 2481 const struct ir3_shader_variant *variant) 2482{ 2483 struct tu_cs_memory memory; 2484 2485 if (!variant) 2486 return 0; 2487 2488 /* this expects to get enough alignment because shaders are allocated first 2489 * and total size is always aligned correctly 2490 * note: an assert in tu6_emit_xs_config validates the alignment 2491 */ 2492 tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory); 2493 2494 memcpy(memory.map, variant->bin, variant->info.size); 2495 return memory.iova; 2496} 2497 2498static void 2499tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant, 2500 char *nir_from_spirv) 2501{ 2502 struct tu_pipeline_executable exe = { 2503 .stage = variant->type, 2504 .nir_from_spirv = nir_from_spirv, 2505 .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir), 2506 .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm), 2507 .stats = variant->info, 2508 .is_binning = variant->binning_pass, 2509 }; 2510 2511 util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe); 2512} 2513 2514static void 2515tu_link_shaders(struct tu_pipeline_builder *builder, 2516 nir_shader **shaders, unsigned shaders_count) 2517{ 2518 nir_shader *consumer = NULL; 2519 for (gl_shader_stage stage = shaders_count - 1; 2520 stage >= MESA_SHADER_VERTEX; stage--) { 2521 if (!shaders[stage]) 2522 continue; 2523 2524 nir_shader *producer = shaders[stage]; 2525 if (!consumer) { 2526 consumer = producer; 2527 continue; 2528 } 2529 2530 if (nir_link_opt_varyings(producer, consumer)) { 2531 NIR_PASS_V(consumer, nir_opt_constant_folding); 2532 NIR_PASS_V(consumer, nir_opt_algebraic); 2533 NIR_PASS_V(consumer, nir_opt_dce); 2534 } 2535 2536 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL); 2537 NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); 2538 2539 bool progress = nir_remove_unused_varyings(producer, consumer); 2540 2541 nir_compact_varyings(producer, consumer, true); 2542 if (progress) { 2543 if (nir_lower_global_vars_to_local(producer)) { 2544 /* Remove dead writes, which can remove input loads */ 2545 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL); 2546 NIR_PASS_V(producer, nir_opt_dce); 2547 } 2548 nir_lower_global_vars_to_local(consumer); 2549 } 2550 2551 consumer = producer; 2552 } 2553} 2554 2555static void 2556tu_shader_key_init(struct tu_shader_key *key, 2557 const VkPipelineShaderStageCreateInfo *stage_info, 2558 struct tu_device *dev) 2559{ 2560 enum ir3_wavesize_option api_wavesize, real_wavesize; 2561 2562 if (stage_info) { 2563 if 
(stage_info->flags & 2564 VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) { 2565 api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE; 2566 } else { 2567 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *size_info = 2568 vk_find_struct_const(stage_info->pNext, 2569 PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO); 2570 2571 if (size_info) { 2572 if (size_info->requiredSubgroupSize == dev->compiler->threadsize_base) { 2573 api_wavesize = IR3_SINGLE_ONLY; 2574 } else { 2575 assert(size_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2); 2576 api_wavesize = IR3_DOUBLE_ONLY; 2577 } 2578 } else { 2579 /* Match the exposed subgroupSize. */ 2580 api_wavesize = IR3_DOUBLE_ONLY; 2581 } 2582 2583 if (stage_info->flags & 2584 VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT) 2585 real_wavesize = api_wavesize; 2586 else if (api_wavesize == IR3_SINGLE_ONLY) 2587 real_wavesize = IR3_SINGLE_ONLY; 2588 else 2589 real_wavesize = IR3_SINGLE_OR_DOUBLE; 2590 } 2591 } else { 2592 api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE; 2593 } 2594 2595 key->api_wavesize = api_wavesize; 2596 key->real_wavesize = real_wavesize; 2597} 2598 2599static void 2600tu_hash_stage(struct mesa_sha1 *ctx, 2601 const VkPipelineShaderStageCreateInfo *stage, 2602 const struct tu_shader_key *key) 2603{ 2604 unsigned char stage_hash[SHA1_DIGEST_LENGTH]; 2605 2606 vk_pipeline_hash_shader_stage(stage, stage_hash); 2607 _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash)); 2608 _mesa_sha1_update(ctx, key, sizeof(*key)); 2609} 2610 2611/* Hash flags which can affect ir3 shader compilation which aren't known until 2612 * logical device creation. 2613 */ 2614static void 2615tu_hash_compiler(struct mesa_sha1 *ctx, const struct ir3_compiler *compiler) 2616{ 2617 _mesa_sha1_update(ctx, &compiler->robust_buffer_access2, 2618 sizeof(compiler->robust_buffer_access2)); 2619 _mesa_sha1_update(ctx, &ir3_shader_debug, sizeof(ir3_shader_debug)); 2620} 2621 2622static void 2623tu_hash_shaders(unsigned char *hash, 2624 const VkPipelineShaderStageCreateInfo **stages, 2625 const struct tu_pipeline_layout *layout, 2626 const struct tu_shader_key *keys, 2627 const struct ir3_shader_key *ir3_key, 2628 const struct ir3_compiler *compiler) 2629{ 2630 struct mesa_sha1 ctx; 2631 2632 _mesa_sha1_init(&ctx); 2633 2634 if (layout) 2635 _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); 2636 2637 _mesa_sha1_update(&ctx, ir3_key, sizeof(*ir3_key)); 2638 2639 for (int i = 0; i < MESA_SHADER_STAGES; ++i) { 2640 if (stages[i]) { 2641 tu_hash_stage(&ctx, stages[i], &keys[i]); 2642 } 2643 } 2644 tu_hash_compiler(&ctx, compiler); 2645 _mesa_sha1_final(&ctx, hash); 2646} 2647 2648static void 2649tu_hash_compute(unsigned char *hash, 2650 const VkPipelineShaderStageCreateInfo *stage, 2651 const struct tu_pipeline_layout *layout, 2652 const struct tu_shader_key *key, 2653 const struct ir3_compiler *compiler) 2654{ 2655 struct mesa_sha1 ctx; 2656 2657 _mesa_sha1_init(&ctx); 2658 2659 if (layout) 2660 _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); 2661 2662 tu_hash_stage(&ctx, stage, key); 2663 2664 tu_hash_compiler(&ctx, compiler); 2665 _mesa_sha1_final(&ctx, hash); 2666} 2667 2668static bool 2669tu_shaders_serialize(struct vk_pipeline_cache_object *object, 2670 struct blob *blob); 2671 2672static struct vk_pipeline_cache_object * 2673tu_shaders_deserialize(struct vk_device *device, 2674 const void *key_data, size_t key_size, 2675 struct blob_reader *blob); 2676 2677static void 
2678tu_shaders_destroy(struct vk_pipeline_cache_object *object) 2679{ 2680 struct tu_compiled_shaders *shaders = 2681 container_of(object, struct tu_compiled_shaders, base); 2682 2683 for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) 2684 ralloc_free(shaders->variants[i]); 2685 2686 vk_pipeline_cache_object_finish(&shaders->base); 2687 vk_free(&object->device->alloc, shaders); 2688} 2689 2690const struct vk_pipeline_cache_object_ops tu_shaders_ops = { 2691 .serialize = tu_shaders_serialize, 2692 .deserialize = tu_shaders_deserialize, 2693 .destroy = tu_shaders_destroy, 2694}; 2695 2696static struct tu_compiled_shaders * 2697tu_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size) 2698{ 2699 VK_MULTIALLOC(ma); 2700 VK_MULTIALLOC_DECL(&ma, struct tu_compiled_shaders, shaders, 1); 2701 VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size); 2702 2703 if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc, 2704 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) 2705 return NULL; 2706 2707 memcpy(obj_key_data, key_data, key_size); 2708 vk_pipeline_cache_object_init(&dev->vk, &shaders->base, 2709 &tu_shaders_ops, obj_key_data, key_size); 2710 2711 return shaders; 2712} 2713 2714static bool 2715tu_shaders_serialize(struct vk_pipeline_cache_object *object, 2716 struct blob *blob) 2717{ 2718 struct tu_compiled_shaders *shaders = 2719 container_of(object, struct tu_compiled_shaders, base); 2720 2721 blob_write_bytes(blob, shaders->push_consts, sizeof(shaders->push_consts)); 2722 blob_write_uint8(blob, shaders->active_desc_sets); 2723 blob_write_uint8(blob, shaders->multi_pos_output); 2724 2725 for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) { 2726 if (shaders->variants[i]) { 2727 blob_write_uint8(blob, 1); 2728 ir3_store_variant(blob, shaders->variants[i]); 2729 } else { 2730 blob_write_uint8(blob, 0); 2731 } 2732 } 2733 2734 return true; 2735} 2736 2737static struct vk_pipeline_cache_object * 2738tu_shaders_deserialize(struct vk_device *_device, 2739 const void *key_data, size_t key_size, 2740 struct blob_reader *blob) 2741{ 2742 struct tu_device *dev = container_of(_device, struct tu_device, vk); 2743 struct tu_compiled_shaders *shaders = 2744 tu_shaders_init(dev, key_data, key_size); 2745 2746 if (!shaders) 2747 return NULL; 2748 2749 blob_copy_bytes(blob, shaders->push_consts, sizeof(shaders->push_consts)); 2750 shaders->active_desc_sets = blob_read_uint8(blob); 2751 shaders->multi_pos_output = blob_read_uint8(blob); 2752 2753 for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) { 2754 bool has_shader = blob_read_uint8(blob); 2755 if (has_shader) { 2756 shaders->variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL); 2757 } 2758 } 2759 2760 return &shaders->base; 2761} 2762 2763static struct tu_compiled_shaders * 2764tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache, 2765 const void *key_data, size_t key_size, 2766 bool *application_cache_hit) 2767{ 2768 struct vk_pipeline_cache_object *object = 2769 vk_pipeline_cache_lookup_object(cache, key_data, key_size, 2770 &tu_shaders_ops, application_cache_hit); 2771 if (object) 2772 return container_of(object, struct tu_compiled_shaders, base); 2773 else 2774 return NULL; 2775} 2776 2777static struct tu_compiled_shaders * 2778tu_pipeline_cache_insert(struct vk_pipeline_cache *cache, 2779 struct tu_compiled_shaders *shaders) 2780{ 2781 struct vk_pipeline_cache_object *object = 2782 vk_pipeline_cache_add_object(cache, &shaders->base); 2783 return container_of(object, struct tu_compiled_shaders, base); 2784} 
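/* Compile (or look up) all shader variants for a graphics pipeline: the stage
 * infos, tu_shader_keys, ir3_shader_key, layout and compiler flags are hashed
 * into pipeline_sha1 and looked up in the pipeline cache; on a miss the
 * SPIR-V is lowered to NIR, linked, and turned into ir3 variants (recompiling
 * with safe_constlen where ir3_trim_constlen requires it) before the result
 * is inserted back into the cache.
 */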
2785 2786static VkResult 2787tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, 2788 struct tu_pipeline *pipeline) 2789{ 2790 VkResult result = VK_SUCCESS; 2791 const struct ir3_compiler *compiler = builder->device->compiler; 2792 const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = { 2793 NULL 2794 }; 2795 VkPipelineCreationFeedback pipeline_feedback = { 2796 .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, 2797 }; 2798 VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 }; 2799 2800 int64_t pipeline_start = os_time_get_nano(); 2801 2802 const VkPipelineCreationFeedbackCreateInfo *creation_feedback = 2803 vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); 2804 2805 for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { 2806 gl_shader_stage stage = 2807 vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage); 2808 stage_infos[stage] = &builder->create_info->pStages[i]; 2809 } 2810 2811 if (tu6_shared_constants_enable(builder->layout, builder->device->compiler)) { 2812 pipeline->shared_consts = (struct tu_push_constant_range) { 2813 .lo = 0, 2814 .dwords = builder->layout->push_constant_size / 4, 2815 }; 2816 } 2817 2818 struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { }; 2819 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2820 stage < ARRAY_SIZE(keys); stage++) { 2821 tu_shader_key_init(&keys[stage], stage_infos[stage], builder->device); 2822 } 2823 2824 struct ir3_shader_key ir3_key = {}; 2825 tu_pipeline_shader_key_init(&ir3_key, pipeline, builder->create_info); 2826 2827 keys[MESA_SHADER_VERTEX].multiview_mask = builder->multiview_mask; 2828 keys[MESA_SHADER_FRAGMENT].multiview_mask = builder->multiview_mask; 2829 keys[MESA_SHADER_FRAGMENT].force_sample_interp = ir3_key.sample_shading; 2830 2831 unsigned char pipeline_sha1[20]; 2832 tu_hash_shaders(pipeline_sha1, stage_infos, builder->layout, keys, &ir3_key, compiler); 2833 2834 const bool executable_info = builder->create_info->flags & 2835 VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; 2836 2837 char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL }; 2838 2839 struct tu_compiled_shaders *compiled_shaders; 2840 2841 if (!executable_info) { 2842 bool application_cache_hit = false; 2843 2844 compiled_shaders = 2845 tu_pipeline_cache_lookup(builder->cache, &pipeline_sha1, 2846 sizeof(pipeline_sha1), 2847 &application_cache_hit); 2848 2849 if (application_cache_hit && builder->cache != builder->device->mem_cache) { 2850 pipeline_feedback.flags |= 2851 VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; 2852 } 2853 2854 if (compiled_shaders) 2855 goto done; 2856 } 2857 2858 if (builder->create_info->flags & 2859 VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) { 2860 return VK_PIPELINE_COMPILE_REQUIRED; 2861 } 2862 2863 nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL }; 2864 2865 struct tu_shader *shaders[ARRAY_SIZE(nir)] = { NULL }; 2866 2867 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2868 stage < ARRAY_SIZE(nir); stage++) { 2869 const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage]; 2870 if (!stage_info) 2871 continue; 2872 2873 int64_t stage_start = os_time_get_nano(); 2874 2875 nir[stage] = tu_spirv_to_nir(builder->device, builder->mem_ctx, stage_info, stage); 2876 if (!nir[stage]) { 2877 result = VK_ERROR_OUT_OF_HOST_MEMORY; 2878 goto fail; 2879 } 2880 2881 stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; 2882 
stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; 2883 } 2884 2885 if (!nir[MESA_SHADER_FRAGMENT]) { 2886 const nir_shader_compiler_options *nir_options = 2887 ir3_get_compiler_options(builder->device->compiler); 2888 nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, 2889 nir_options, 2890 "noop_fs"); 2891 nir[MESA_SHADER_FRAGMENT] = fs_b.shader; 2892 } 2893 2894 if (executable_info) { 2895 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2896 stage < ARRAY_SIZE(nir); stage++) { 2897 if (!nir[stage]) 2898 continue; 2899 2900 nir_initial_disasm[stage] = 2901 nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx); 2902 } 2903 } 2904 2905 tu_link_shaders(builder, nir, ARRAY_SIZE(nir)); 2906 2907 uint32_t desc_sets = 0; 2908 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2909 stage < ARRAY_SIZE(nir); stage++) { 2910 if (!nir[stage]) 2911 continue; 2912 2913 int64_t stage_start = os_time_get_nano(); 2914 2915 struct tu_shader *shader = 2916 tu_shader_create(builder->device, nir[stage], &keys[stage], 2917 builder->layout, builder->alloc); 2918 if (!shader) { 2919 result = VK_ERROR_OUT_OF_HOST_MEMORY; 2920 goto fail; 2921 } 2922 2923 /* In SPIR-V generated from GLSL, the primitive mode is specified in the 2924 * tessellation evaluation shader, but in SPIR-V generated from HLSL, 2925 * the mode is specified in the tessellation control shader. */ 2926 if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) && 2927 ir3_key.tessellation == IR3_TESS_NONE) { 2928 ir3_key.tessellation = tu6_get_tessmode(shader); 2929 } 2930 2931 if (stage > MESA_SHADER_TESS_CTRL) { 2932 if (stage == MESA_SHADER_FRAGMENT) { 2933 ir3_key.tcs_store_primid = ir3_key.tcs_store_primid || 2934 (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID)); 2935 } else { 2936 ir3_key.tcs_store_primid = ir3_key.tcs_store_primid || 2937 BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); 2938 } 2939 } 2940 2941 /* Keep track of the status of each shader's active descriptor sets, 2942 * which is set in tu_lower_io. 
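 * The per-stage masks are OR'd together into desc_sets and end up in
 * compiled_shaders->active_desc_sets and pipeline->active_desc_sets.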
*/ 2943 desc_sets |= shader->active_desc_sets; 2944 2945 shaders[stage] = shader; 2946 2947 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; 2948 } 2949 2950 struct tu_shader *last_shader = shaders[MESA_SHADER_GEOMETRY]; 2951 if (!last_shader) 2952 last_shader = shaders[MESA_SHADER_TESS_EVAL]; 2953 if (!last_shader) 2954 last_shader = shaders[MESA_SHADER_VERTEX]; 2955 2956 uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written; 2957 2958 ir3_key.layer_zero = !(outputs_written & VARYING_BIT_LAYER); 2959 ir3_key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT); 2960 2961 compiled_shaders = 2962 tu_shaders_init(builder->device, &pipeline_sha1, sizeof(pipeline_sha1)); 2963 2964 if (!compiled_shaders) { 2965 result = VK_ERROR_OUT_OF_HOST_MEMORY; 2966 goto fail; 2967 } 2968 2969 compiled_shaders->active_desc_sets = desc_sets; 2970 compiled_shaders->multi_pos_output = 2971 shaders[MESA_SHADER_VERTEX]->multi_pos_output; 2972 2973 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2974 stage < ARRAY_SIZE(shaders); stage++) { 2975 if (!shaders[stage]) 2976 continue; 2977 2978 int64_t stage_start = os_time_get_nano(); 2979 2980 compiled_shaders->variants[stage] = 2981 ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key, 2982 executable_info); 2983 if (!compiled_shaders->variants[stage]) 2984 return VK_ERROR_OUT_OF_HOST_MEMORY; 2985 2986 compiled_shaders->push_consts[stage] = shaders[stage]->push_consts; 2987 2988 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; 2989 } 2990 2991 uint32_t safe_constlens = ir3_trim_constlen(compiled_shaders->variants, compiler); 2992 2993 ir3_key.safe_constlen = true; 2994 2995 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 2996 stage < ARRAY_SIZE(shaders); stage++) { 2997 if (!shaders[stage]) 2998 continue; 2999 3000 if (safe_constlens & (1 << stage)) { 3001 int64_t stage_start = os_time_get_nano(); 3002 3003 ralloc_free(compiled_shaders->variants[stage]); 3004 compiled_shaders->variants[stage] = 3005 ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key, 3006 executable_info); 3007 if (!compiled_shaders->variants[stage]) { 3008 result = VK_ERROR_OUT_OF_HOST_MEMORY; 3009 goto fail; 3010 } 3011 3012 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; 3013 } 3014 } 3015 3016 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 3017 stage < ARRAY_SIZE(nir); stage++) { 3018 if (shaders[stage]) { 3019 tu_shader_destroy(builder->device, shaders[stage], builder->alloc); 3020 } 3021 } 3022 3023 compiled_shaders = 3024 tu_pipeline_cache_insert(builder->cache, compiled_shaders); 3025 3026done: 3027 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 3028 stage < ARRAY_SIZE(nir); stage++) { 3029 if (compiled_shaders->variants[stage]) { 3030 tu_append_executable(pipeline, compiled_shaders->variants[stage], 3031 nir_initial_disasm[stage]); 3032 } 3033 } 3034 3035 struct ir3_shader_variant *vs = 3036 compiled_shaders->variants[MESA_SHADER_VERTEX]; 3037 3038 struct ir3_shader_variant *variant; 3039 if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) { 3040 tu_append_executable(pipeline, vs->binning, NULL); 3041 variant = vs->binning; 3042 } else { 3043 variant = vs; 3044 } 3045 3046 builder->binning_variant = variant; 3047 3048 builder->shaders = compiled_shaders; 3049 3050 pipeline->active_desc_sets = compiled_shaders->active_desc_sets; 3051 if (compiled_shaders->variants[MESA_SHADER_TESS_CTRL]) { 3052 pipeline->tess.patch_type = 3053 
compiled_shaders->variants[MESA_SHADER_TESS_CTRL]->key.tessellation; 3054 } 3055 3056 pipeline_feedback.duration = os_time_get_nano() - pipeline_start; 3057 if (creation_feedback) { 3058 *creation_feedback->pPipelineCreationFeedback = pipeline_feedback; 3059 3060 assert(builder->create_info->stageCount == 3061 creation_feedback->pipelineStageCreationFeedbackCount); 3062 for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { 3063 gl_shader_stage s = 3064 vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage); 3065 creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s]; 3066 } 3067 } 3068 3069 return VK_SUCCESS; 3070 3071fail: 3072 for (gl_shader_stage stage = MESA_SHADER_VERTEX; 3073 stage < ARRAY_SIZE(nir); stage++) { 3074 if (shaders[stage]) { 3075 tu_shader_destroy(builder->device, shaders[stage], builder->alloc); 3076 } 3077 } 3078 3079 if (compiled_shaders) 3080 vk_pipeline_cache_object_unref(&compiled_shaders->base); 3081 3082 return result; 3083} 3084 3085static void 3086tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder, 3087 struct tu_pipeline *pipeline) 3088{ 3089 const VkPipelineDynamicStateCreateInfo *dynamic_info = 3090 builder->create_info->pDynamicState; 3091 3092 pipeline->gras_su_cntl_mask = ~0u; 3093 pipeline->rb_depth_cntl_mask = ~0u; 3094 pipeline->rb_stencil_cntl_mask = ~0u; 3095 pipeline->pc_raster_cntl_mask = ~0u; 3096 pipeline->vpc_unknown_9107_mask = ~0u; 3097 pipeline->sp_blend_cntl_mask = ~0u; 3098 pipeline->rb_blend_cntl_mask = ~0u; 3099 pipeline->rb_mrt_control_mask = ~0u; 3100 3101 if (!dynamic_info) 3102 return; 3103 3104 for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) { 3105 VkDynamicState state = dynamic_info->pDynamicStates[i]; 3106 switch (state) { 3107 case VK_DYNAMIC_STATE_VIEWPORT ... 
VK_DYNAMIC_STATE_STENCIL_REFERENCE: 3108 if (state == VK_DYNAMIC_STATE_LINE_WIDTH) 3109 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK; 3110 pipeline->dynamic_state_mask |= BIT(state); 3111 break; 3112 case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT: 3113 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS); 3114 break; 3115 case VK_DYNAMIC_STATE_CULL_MODE: 3116 pipeline->gras_su_cntl_mask &= 3117 ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT); 3118 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); 3119 break; 3120 case VK_DYNAMIC_STATE_FRONT_FACE: 3121 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW; 3122 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); 3123 break; 3124 case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY: 3125 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY); 3126 break; 3127 case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE: 3128 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE); 3129 break; 3130 case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT: 3131 pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT); 3132 break; 3133 case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT: 3134 pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR); 3135 break; 3136 case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE: 3137 pipeline->rb_depth_cntl_mask &= 3138 ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE); 3139 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); 3140 break; 3141 case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE: 3142 pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; 3143 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); 3144 break; 3145 case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP: 3146 pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK; 3147 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); 3148 break; 3149 case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE: 3150 pipeline->rb_depth_cntl_mask &= 3151 ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE); 3152 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); 3153 break; 3154 case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE: 3155 pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | 3156 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | 3157 A6XX_RB_STENCIL_CONTROL_STENCIL_READ); 3158 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL); 3159 break; 3160 case VK_DYNAMIC_STATE_STENCIL_OP: 3161 pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK | 3162 A6XX_RB_STENCIL_CONTROL_FAIL__MASK | 3163 A6XX_RB_STENCIL_CONTROL_ZPASS__MASK | 3164 A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK | 3165 A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK | 3166 A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK | 3167 A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK | 3168 A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK); 3169 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL); 3170 break; 3171 case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE: 3172 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET; 3173 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); 3174 break; 3175 case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE: 3176 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE); 3177 break; 3178 case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE: 3179 pipeline->pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD; 3180 pipeline->vpc_unknown_9107_mask &= 
~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD; 3181 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD); 3182 break; 3183 case VK_DYNAMIC_STATE_LOGIC_OP_EXT: 3184 pipeline->sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK; 3185 pipeline->rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK; 3186 pipeline->rb_mrt_control_mask &= ~A6XX_RB_MRT_CONTROL_ROP_CODE__MASK; 3187 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND); 3188 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_LOGIC_OP); 3189 break; 3190 case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: 3191 pipeline->sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK; 3192 pipeline->rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK; 3193 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND); 3194 3195 /* Dynamic color write enable doesn't directly change any of the 3196 * registers, but it causes us to make some of the registers 0, so we 3197 * set this dynamic state instead of making the register dynamic. 3198 */ 3199 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE); 3200 break; 3201 default: 3202 assert(!"unsupported dynamic state"); 3203 break; 3204 } 3205 } 3206} 3207 3208static void 3209tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link, 3210 struct tu_push_constant_range *push_consts, 3211 struct ir3_shader_variant *v) 3212{ 3213 link->const_state = *ir3_const_state(v); 3214 link->constlen = v->constlen; 3215 link->push_consts = *push_consts; 3216} 3217 3218static void 3219tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, 3220 struct tu_pipeline *pipeline) 3221{ 3222 struct tu_cs prog_cs; 3223 3224 /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything 3225 * else that could depend on that state (like push constants) 3226 * 3227 * Note also that this always uses the full VS even in binning pass. The 3228 * binning pass variant has the same const layout as the full VS, and 3229 * the constlen for the VS will be the same or greater than the constlen 3230 * for the binning pass variant. It is required that the constlen state 3231 * matches between binning and draw passes, as some parts of the push 3232 * consts are emitted in state groups that are shared between the binning 3233 * and draw passes. 
    */
   tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
   tu6_emit_program_config(&prog_cs, builder);
   pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
   tu6_emit_program(&prog_cs, builder, false, pipeline);
   pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
   tu6_emit_program(&prog_cs, builder, true, pipeline);
   pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   VkShaderStageFlags stages = 0;
   for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
      stages |= builder->create_info->pStages[i].stage;
   }
   pipeline->active_stages = stages;

   for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
      if (!builder->shaders->variants[i])
         continue;

      tu_pipeline_set_linkage(&pipeline->program.link[i],
                              &builder->shaders->push_consts[i],
                              builder->shaders->variants[i]);
   }
}

static void
tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
                                       struct tu_pipeline *pipeline)
{
   const VkPipelineVertexInputStateCreateInfo *vi_info =
      builder->create_info->pVertexInputState;
   const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX];
   const struct ir3_shader_variant *bs = builder->binning_variant;

   /* Bindings may contain holes */
   for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
      pipeline->num_vbs =
         MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1);
   }

   tu6_emit_vertex_input(pipeline, &pipeline->vi.state, vs, vi_info);
   if (bs)
      tu6_emit_vertex_input(pipeline, &pipeline->vi.binning_state, bs, vi_info);
}

static void
tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
                                         struct tu_pipeline *pipeline)
{
   const VkPipelineInputAssemblyStateCreateInfo *ia_info =
      builder->create_info->pInputAssemblyState;

   pipeline->ia.primtype = tu6_primtype(ia_info->topology);
   pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable;
}

static bool
tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
                         uint32_t id, uint32_t size)
{
   assert(id < ARRAY_SIZE(pipeline->dynamic_state));

   if (pipeline->dynamic_state_mask & BIT(id))
      return false;

   pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
   return true;
}

static void
tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
                                       struct tu_pipeline *pipeline)
{
   if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
       !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
      return;

   const VkPipelineTessellationStateCreateInfo *tess_info =
      builder->create_info->pTessellationState;

   assert(pipeline->ia.primtype == DI_PT_PATCHES0);
   assert(tess_info->patchControlPoints <= 32);
   pipeline->ia.primtype += tess_info->patchControlPoints;
   const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
      vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
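   /* If the application doesn't chain a
    * VkPipelineTessellationDomainOriginStateCreateInfo, the spec-defined
    * default is VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT, which is why
    * !domain_info maps to the upper-left origin below. For reference, a
    * minimal application-side sketch (illustration only, not part of this
    * driver) of requesting the lower-left origin instead:
    *
    *    VkPipelineTessellationDomainOriginStateCreateInfo domain_origin = {
    *       .sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO,
    *       .domainOrigin = VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT,
    *    };
    *    VkPipelineTessellationStateCreateInfo tess_state = {
    *       .sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO,
    *       .pNext = &domain_origin,
    *       .patchControlPoints = 3,
    *    };
    */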
3323 pipeline->tess.upper_left_domain_origin = !domain_info || 3324 domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT; 3325 const struct ir3_shader_variant *hs = builder->shaders->variants[MESA_SHADER_TESS_CTRL]; 3326 pipeline->tess.param_stride = hs->output_size * 4; 3327} 3328 3329static void 3330tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder, 3331 struct tu_pipeline *pipeline) 3332{ 3333 /* The spec says: 3334 * 3335 * pViewportState is a pointer to an instance of the 3336 * VkPipelineViewportStateCreateInfo structure, and is ignored if the 3337 * pipeline has rasterization disabled." 3338 * 3339 * We leave the relevant registers stale in that case. 3340 */ 3341 if (builder->rasterizer_discard) 3342 return; 3343 3344 const VkPipelineViewportStateCreateInfo *vp_info = 3345 builder->create_info->pViewportState; 3346 const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_info = 3347 vk_find_struct_const(vp_info->pNext, PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT); 3348 pipeline->z_negative_one_to_one = depth_clip_info ? depth_clip_info->negativeOneToOne : false; 3349 3350 struct tu_cs cs; 3351 3352 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount)) 3353 tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount, pipeline->z_negative_one_to_one); 3354 3355 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount)) 3356 tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount); 3357} 3358 3359static void 3360tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder, 3361 struct tu_pipeline *pipeline) 3362{ 3363 const VkPipelineRasterizationStateCreateInfo *rast_info = 3364 builder->create_info->pRasterizationState; 3365 3366 enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode); 3367 3368 builder->depth_clip_disable = rast_info->depthClampEnable; 3369 3370 const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state = 3371 vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT); 3372 if (depth_clip_state) 3373 builder->depth_clip_disable = !depth_clip_state->depthClipEnable; 3374 3375 pipeline->line_mode = RECTANGULAR; 3376 3377 if (tu6_primtype_line(pipeline->ia.primtype) || 3378 (tu6_primtype_patches(pipeline->ia.primtype) && 3379 pipeline->tess.patch_type == IR3_TESS_ISOLINES)) { 3380 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state = 3381 vk_find_struct_const(rast_info->pNext, 3382 PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT); 3383 3384 if (rast_line_state && rast_line_state->lineRasterizationMode == 3385 VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) { 3386 pipeline->line_mode = BRESENHAM; 3387 } 3388 } 3389 3390 struct tu_cs cs; 3391 uint32_t cs_size = 9 + 3392 (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0) + 3393 (builder->emit_msaa_state ? 11 : 0); 3394 pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size); 3395 3396 tu_cs_emit_regs(&cs, 3397 A6XX_GRAS_CL_CNTL( 3398 .znear_clip_disable = builder->depth_clip_disable, 3399 .zfar_clip_disable = builder->depth_clip_disable, 3400 /* TODO should this be depth_clip_disable instead? */ 3401 .unk5 = rast_info->depthClampEnable, 3402 .zero_gb_scale_z = pipeline->z_negative_one_to_one ? 
0 : 1,
                     .vp_clip_code_ignore = 1));

   tu_cs_emit_regs(&cs,
                   A6XX_VPC_POLYGON_MODE(mode));

   tu_cs_emit_regs(&cs,
                   A6XX_PC_POLYGON_MODE(mode));

   /* move to hw ctx init? */
   tu_cs_emit_regs(&cs,
                   A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
                   A6XX_GRAS_SU_POINT_SIZE(1.0f));

   if (builder->device->physical_device->info->a6xx.has_shading_rate) {
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30());
   }

   /* If the sample count couldn't be determined from the subpass, emit it
    * here. This happens when the subpass doesn't use any color or depth
    * attachment.
    */
   if (builder->emit_msaa_state)
      tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode);

   const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
      vk_find_struct_const(rast_info->pNext,
                           PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
   unsigned stream = stream_info ? stream_info->rasterizationStream : 0;

   pipeline->pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream);
   pipeline->vpc_unknown_9107 = 0;
   if (rast_info->rasterizerDiscardEnable) {
      pipeline->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
      pipeline->vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
   }

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) {
      tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->pc_raster_cntl));
      tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->vpc_unknown_9107));
   }

   pipeline->gras_su_cntl =
      tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0);

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
      tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl));

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
      tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
                          rast_info->depthBiasClamp,
                          rast_info->depthBiasSlopeFactor);
   }

   const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state =
      vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
   pipeline->provoking_vertex_last = provoking_vtx_state &&
      provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
}

static void
tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
                                        struct tu_pipeline *pipeline)
{
   /* The spec says:
    *
    *    pDepthStencilState is a pointer to an instance of the
    *    VkPipelineDepthStencilStateCreateInfo structure, and is ignored if
    *    the pipeline has rasterization disabled or if the subpass of the
    *    render pass the pipeline is created against does not use a
    *    depth/stencil attachment.
    */
   const VkPipelineDepthStencilStateCreateInfo *ds_info =
      builder->create_info->pDepthStencilState;
   const enum pipe_format pipe_format =
      vk_format_to_pipe_format(builder->depth_attachment_format);
   uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
   struct tu_cs cs;

   if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
       builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
      if (ds_info->depthTestEnable) {
         rb_depth_cntl |=
            A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
            A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
            A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */

         if (builder->depth_clip_disable)
            rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLIP_DISABLE;

         if (ds_info->depthWriteEnable)
            rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
      }

      if (ds_info->depthBoundsTestEnable)
         rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;

      if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
         tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);

      pipeline->depth_cpp_per_sample = util_format_get_component_bits(
         pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
   } else {
      /* If RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
       * to 0 when this pipeline is used, as enabling the depth test when
       * there is no depth attachment is a problem (at least for the S8_UINT
       * case).
       */
      if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL))
         pipeline->rb_depth_cntl_disable = true;
   }

   if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
      const VkStencilOpState *front = &ds_info->front;
      const VkStencilOpState *back = &ds_info->back;

      rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
         A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));

      if (ds_info->stencilTestEnable) {
         rb_stencil_cntl |=
            A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
            A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
            A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
      }

      pipeline->stencil_cpp_per_sample = util_format_get_component_bits(
         pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
   }

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
      tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
      tu_cs_emit(&cs, rb_depth_cntl);
   }
   pipeline->rb_depth_cntl = rb_depth_cntl;

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
      tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
      tu_cs_emit(&cs, rb_stencil_cntl);
   }
   pipeline->rb_stencil_cntl = rb_stencil_cntl;

   /* The remaining draw states aren't used if there is no d/s attachment,
    * so leave them empty.
    */
   if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED)
      return;

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
3557 tu_cs_emit_regs(&cs, 3558 A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds), 3559 A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds)); 3560 } 3561 3562 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) { 3563 tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff, 3564 .bfmask = ds_info->back.compareMask & 0xff)); 3565 } 3566 3567 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) { 3568 update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask); 3569 update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask); 3570 tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->stencil_wrmask)); 3571 } 3572 3573 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) { 3574 tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff, 3575 .bfref = ds_info->back.reference & 0xff)); 3576 } 3577 3578 if (builder->shaders->variants[MESA_SHADER_FRAGMENT]) { 3579 const struct ir3_shader_variant *fs = builder->shaders->variants[MESA_SHADER_FRAGMENT]; 3580 if (fs->has_kill || builder->alpha_to_coverage) { 3581 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; 3582 } 3583 if (fs->no_earlyz || fs->writes_pos) { 3584 pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ; 3585 } 3586 } 3587} 3588 3589static void 3590tu_pipeline_builder_parse_multisample_and_color_blend( 3591 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) 3592{ 3593 /* The spec says: 3594 * 3595 * pMultisampleState is a pointer to an instance of the 3596 * VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline 3597 * has rasterization disabled. 3598 * 3599 * Also, 3600 * 3601 * pColorBlendState is a pointer to an instance of the 3602 * VkPipelineColorBlendStateCreateInfo structure, and is ignored if the 3603 * pipeline has rasterization disabled or if the subpass of the render 3604 * pass the pipeline is created against does not use any color 3605 * attachments. 3606 * 3607 * We leave the relevant registers stale when rasterization is disabled. 3608 */ 3609 if (builder->rasterizer_discard) 3610 return; 3611 3612 static const VkPipelineColorBlendStateCreateInfo dummy_blend_info; 3613 const VkPipelineMultisampleStateCreateInfo *msaa_info = 3614 builder->create_info->pMultisampleState; 3615 const VkPipelineColorBlendStateCreateInfo *blend_info = 3616 builder->use_color_attachments ? builder->create_info->pColorBlendState 3617 : &dummy_blend_info; 3618 3619 struct tu_cs cs; 3620 tu6_emit_rb_mrt_controls(pipeline, blend_info, 3621 builder->color_attachment_formats, 3622 &pipeline->rop_reads_dst, 3623 &pipeline->color_bandwidth_per_sample); 3624 3625 uint32_t blend_enable_mask = 3626 pipeline->rop_reads_dst ? pipeline->color_write_enable : pipeline->blend_enable; 3627 tu6_emit_blend_control(pipeline, blend_enable_mask, 3628 builder->use_dual_src_blend, msaa_info); 3629 3630 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_BLEND, 3631 blend_info->attachmentCount * 3 + 4)) { 3632 tu6_emit_blend(&cs, pipeline); 3633 assert(cs.cur == cs.end); /* validate draw state size */ 3634 } 3635 3636 /* Disable LRZ writes when blend or logic op that reads the destination is 3637 * enabled, since the resulting pixel value from the blend-draw depends on 3638 * an earlier draw, which LRZ in the draw pass could early-reject if the 3639 * previous blend-enabled draw wrote LRZ. 
3640 * 3641 * TODO: We need to disable LRZ writes only for the binning pass. 3642 * Therefore, we need to emit it in a separate draw state. We keep 3643 * it disabled for sysmem path as well for the moment. 3644 */ 3645 if (blend_enable_mask) 3646 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; 3647 3648 for (int i = 0; i < blend_info->attachmentCount; i++) { 3649 VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i]; 3650 /* From the PoV of LRZ, having masked color channels is 3651 * the same as having blend enabled, in that the draw will 3652 * care about the fragments from an earlier draw. 3653 */ 3654 VkFormat format = builder->color_attachment_formats[i]; 3655 unsigned mask = MASK(vk_format_get_nr_components(format)); 3656 if (format != VK_FORMAT_UNDEFINED && 3657 ((blendAttachment.colorWriteMask & mask) != mask || 3658 !(pipeline->color_write_enable & BIT(i)))) { 3659 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; 3660 } 3661 } 3662 3663 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) { 3664 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4); 3665 tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4); 3666 } 3667 3668 const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations = 3669 vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT); 3670 const VkSampleLocationsInfoEXT *samp_loc = NULL; 3671 3672 if (sample_locations && sample_locations->sampleLocationsEnable) 3673 samp_loc = &sample_locations->sampleLocationsInfo; 3674 3675 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 3676 samp_loc ? 9 : 6)) { 3677 tu6_emit_sample_locations(&cs, samp_loc); 3678 } 3679} 3680 3681static void 3682tu_pipeline_builder_parse_rasterization_order( 3683 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) 3684{ 3685 if (builder->rasterizer_discard) 3686 return; 3687 3688 pipeline->subpass_feedback_loop_ds = builder->subpass_feedback_loop_ds; 3689 3690 const VkPipelineColorBlendStateCreateInfo *blend_info = 3691 builder->create_info->pColorBlendState; 3692 3693 const VkPipelineDepthStencilStateCreateInfo *ds_info = 3694 builder->create_info->pDepthStencilState; 3695 3696 if (builder->use_color_attachments) { 3697 pipeline->raster_order_attachment_access = 3698 blend_info->flags & 3699 VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_ARM; 3700 } 3701 3702 if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) { 3703 pipeline->raster_order_attachment_access |= 3704 ds_info->flags & 3705 (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM | 3706 VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM); 3707 } 3708 3709 if (unlikely(builder->device->physical_device->instance->debug_flags & TU_DEBUG_RAST_ORDER)) 3710 pipeline->raster_order_attachment_access = true; 3711 3712 /* VK_EXT_blend_operation_advanced would also require ordered access 3713 * when implemented in the future. 
    */

   uint32_t sysmem_prim_mode = NO_FLUSH;
   uint32_t gmem_prim_mode = NO_FLUSH;

   if (pipeline->raster_order_attachment_access) {
      /* VK_ARM_rasterization_order_attachment_access:
       *
       * This extension allows access to framebuffer attachments when used as
       * both input and color attachments from one fragment to the next,
       * in rasterization order, without explicit synchronization.
       */
      sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
      gmem_prim_mode = FLUSH_PER_OVERLAP;
   } else {
      /* If there is a feedback loop, then the shader can read the previous value
       * of a pixel being written out. It can also write some components and then
       * read different components without a barrier in between. This is a
       * problem in sysmem mode with UBWC, because the main buffer and flags
       * buffer can get out-of-sync if only one is flushed. We fix this by
       * setting the SINGLE_PRIM_MODE field to the same value that the blob does
       * for advanced_blend in sysmem mode if a feedback loop is detected.
       */
      if (builder->subpass_feedback_loop_color ||
          builder->subpass_feedback_loop_ds) {
         sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
      }
   }

   struct tu_cs cs;

   pipeline->prim_order_state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
   tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
                        A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));

   pipeline->prim_order_state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
   tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
                        A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode));
}

static void
tu_pipeline_finish(struct tu_pipeline *pipeline,
                   struct tu_device *dev,
                   const VkAllocationCallbacks *alloc)
{
   tu_cs_finish(&pipeline->cs);
   pthread_mutex_lock(&dev->pipeline_mutex);
   tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
   pthread_mutex_unlock(&dev->pipeline_mutex);

   if (pipeline->pvtmem_bo)
      tu_bo_finish(dev, pipeline->pvtmem_bo);

   ralloc_free(pipeline->executables_mem_ctx);
}

static VkResult
tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
                          struct tu_pipeline **pipeline)
{
   VkResult result;

   *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
                                sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
   if (!*pipeline)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
   util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);

   /* compile and upload shaders */
   result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
   if (result != VK_SUCCESS) {
      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
      return result;
   }

   result = tu_pipeline_allocate_cs(builder->device, *pipeline,
                                    builder->layout, builder, NULL);
   if (result != VK_SUCCESS) {
      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
      return result;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(builder->shader_iova); i++)
      builder->shader_iova[i] =
         tu_upload_variant(*pipeline, builder->shaders->variants[i]);

   builder->binning_vs_iova =
      tu_upload_variant(*pipeline, builder->binning_variant);

   /* Setup private memory.
Note that because we're sharing the same private 3808 * memory for all stages, all stages must use the same config, or else 3809 * fibers from one stage might overwrite fibers in another. 3810 */ 3811 3812 uint32_t pvtmem_size = 0; 3813 bool per_wave = true; 3814 for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) { 3815 if (builder->shaders->variants[i]) { 3816 pvtmem_size = MAX2(pvtmem_size, builder->shaders->variants[i]->pvtmem_size); 3817 if (!builder->shaders->variants[i]->pvtmem_per_wave) 3818 per_wave = false; 3819 } 3820 } 3821 3822 if (builder->binning_variant) { 3823 pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size); 3824 if (!builder->binning_variant->pvtmem_per_wave) 3825 per_wave = false; 3826 } 3827 3828 result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem, 3829 pvtmem_size, per_wave); 3830 if (result != VK_SUCCESS) { 3831 vk_object_free(&builder->device->vk, builder->alloc, *pipeline); 3832 return result; 3833 } 3834 3835 tu_pipeline_builder_parse_dynamic(builder, *pipeline); 3836 tu_pipeline_builder_parse_shader_stages(builder, *pipeline); 3837 tu_pipeline_builder_parse_vertex_input(builder, *pipeline); 3838 tu_pipeline_builder_parse_input_assembly(builder, *pipeline); 3839 tu_pipeline_builder_parse_tessellation(builder, *pipeline); 3840 tu_pipeline_builder_parse_viewport(builder, *pipeline); 3841 tu_pipeline_builder_parse_rasterization(builder, *pipeline); 3842 tu_pipeline_builder_parse_depth_stencil(builder, *pipeline); 3843 tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline); 3844 tu_pipeline_builder_parse_rasterization_order(builder, *pipeline); 3845 tu6_emit_load_state(*pipeline, builder->layout, false); 3846 3847 return VK_SUCCESS; 3848} 3849 3850static void 3851tu_pipeline_builder_finish(struct tu_pipeline_builder *builder) 3852{ 3853 if (builder->shaders) 3854 vk_pipeline_cache_object_unref(&builder->shaders->base); 3855 ralloc_free(builder->mem_ctx); 3856} 3857 3858static void 3859tu_pipeline_builder_init_graphics( 3860 struct tu_pipeline_builder *builder, 3861 struct tu_device *dev, 3862 struct vk_pipeline_cache *cache, 3863 const VkGraphicsPipelineCreateInfo *create_info, 3864 const VkAllocationCallbacks *alloc) 3865{ 3866 TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout); 3867 3868 *builder = (struct tu_pipeline_builder) { 3869 .device = dev, 3870 .mem_ctx = ralloc_context(NULL), 3871 .cache = cache, 3872 .create_info = create_info, 3873 .alloc = alloc, 3874 .layout = layout, 3875 }; 3876 3877 bool rasterizer_discard_dynamic = false; 3878 if (create_info->pDynamicState) { 3879 for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) { 3880 if (create_info->pDynamicState->pDynamicStates[i] == 3881 VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) { 3882 rasterizer_discard_dynamic = true; 3883 break; 3884 } 3885 } 3886 } 3887 3888 builder->rasterizer_discard = 3889 builder->create_info->pRasterizationState->rasterizerDiscardEnable && 3890 !rasterizer_discard_dynamic; 3891 3892 const VkPipelineRenderingCreateInfo *rendering_info = 3893 vk_find_struct_const(create_info->pNext, PIPELINE_RENDERING_CREATE_INFO); 3894 3895 if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC) && !rendering_info) 3896 rendering_info = vk_get_pipeline_rendering_create_info(create_info); 3897 3898 if (rendering_info) { 3899 builder->subpass_raster_order_attachment_access = false; 3900 builder->subpass_feedback_loop_ds = false; 3901 builder->subpass_feedback_loop_color = false; 
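      /* For reference, the kind of pNext chain this branch consumes when an
       * application uses dynamic rendering (application-side sketch,
       * illustration only, not part of this driver):
       *
       *    VkFormat color_format = VK_FORMAT_R8G8B8A8_UNORM;
       *    VkPipelineRenderingCreateInfo rendering_info = {
       *       .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO,
       *       .viewMask = 0,
       *       .colorAttachmentCount = 1,
       *       .pColorAttachmentFormats = &color_format,
       *       .depthAttachmentFormat = VK_FORMAT_D24_UNORM_S8_UINT,
       *       .stencilAttachmentFormat = VK_FORMAT_D24_UNORM_S8_UINT,
       *    };
       *    VkGraphicsPipelineCreateInfo pipeline_info = {
       *       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
       *       .pNext = &rendering_info,
       *       // ... other state, with .renderPass == VK_NULL_HANDLE
       *    };
       */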

      builder->multiview_mask = rendering_info->viewMask;

      /* We don't know with dynamic rendering whether the pipeline will be
       * used in a render pass with none of the attachments enabled, so we
       * have to dynamically emit MSAA state.
       *
       * TODO: Move MSAA state to a separate draw state and emit it
       * dynamically only when the sample count is different from the
       * subpass's sample count.
       */
      builder->emit_msaa_state = !builder->rasterizer_discard;

      const VkRenderingSelfDependencyInfoMESA *self_dependency =
         vk_find_struct_const(rendering_info->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA);

      if (self_dependency) {
         builder->subpass_feedback_loop_ds =
            self_dependency->depthSelfDependency ||
            self_dependency->stencilSelfDependency;
         builder->subpass_feedback_loop_color =
            self_dependency->colorSelfDependencies;
      }

      if (!builder->rasterizer_discard) {
         builder->depth_attachment_format =
            rendering_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ?
            rendering_info->stencilAttachmentFormat :
            rendering_info->depthAttachmentFormat;

         builder->color_attachment_count =
            rendering_info->colorAttachmentCount;

         for (unsigned i = 0; i < rendering_info->colorAttachmentCount; i++) {
            builder->color_attachment_formats[i] =
               rendering_info->pColorAttachmentFormats[i];
            if (builder->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) {
               builder->use_color_attachments = true;
               builder->render_components |= 0xf << (i * 4);
            }
         }
      }
   } else {
      const struct tu_render_pass *pass =
         tu_render_pass_from_handle(create_info->renderPass);
      const struct tu_subpass *subpass =
         &pass->subpasses[create_info->subpass];

      builder->subpass_raster_order_attachment_access =
         subpass->raster_order_attachment_access;
      builder->subpass_feedback_loop_color = subpass->feedback_loop_color;
      builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds;

      builder->multiview_mask = subpass->multiview_mask;

      /* variableMultisampleRate support */
      builder->emit_msaa_state = (subpass->samples == 0) && !builder->rasterizer_discard;

      if (!builder->rasterizer_discard) {
         const uint32_t a = subpass->depth_stencil_attachment.attachment;
         builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
3963 pass->attachments[a].format : VK_FORMAT_UNDEFINED; 3964 3965 assert(subpass->color_count == 0 || 3966 !create_info->pColorBlendState || 3967 subpass->color_count == create_info->pColorBlendState->attachmentCount); 3968 builder->color_attachment_count = subpass->color_count; 3969 for (uint32_t i = 0; i < subpass->color_count; i++) { 3970 const uint32_t a = subpass->color_attachments[i].attachment; 3971 if (a == VK_ATTACHMENT_UNUSED) 3972 continue; 3973 3974 builder->color_attachment_formats[i] = pass->attachments[a].format; 3975 builder->use_color_attachments = true; 3976 builder->render_components |= 0xf << (i * 4); 3977 } 3978 } 3979 } 3980 3981 3982 if (builder->rasterizer_discard) { 3983 builder->samples = VK_SAMPLE_COUNT_1_BIT; 3984 } else { 3985 builder->samples = create_info->pMultisampleState->rasterizationSamples; 3986 builder->alpha_to_coverage = create_info->pMultisampleState->alphaToCoverageEnable; 3987 3988 if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) { 3989 builder->color_attachment_count++; 3990 builder->use_dual_src_blend = true; 3991 /* dual source blending has an extra fs output in the 2nd slot */ 3992 if (builder->color_attachment_formats[0] != VK_FORMAT_UNDEFINED) 3993 builder->render_components |= 0xf << 4; 3994 } 3995 } 3996} 3997 3998static VkResult 3999tu_graphics_pipeline_create(VkDevice device, 4000 VkPipelineCache pipelineCache, 4001 const VkGraphicsPipelineCreateInfo *pCreateInfo, 4002 const VkAllocationCallbacks *pAllocator, 4003 VkPipeline *pPipeline) 4004{ 4005 TU_FROM_HANDLE(tu_device, dev, device); 4006 TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache); 4007 4008 cache = cache ? cache : dev->mem_cache; 4009 4010 struct tu_pipeline_builder builder; 4011 tu_pipeline_builder_init_graphics(&builder, dev, cache, 4012 pCreateInfo, pAllocator); 4013 4014 struct tu_pipeline *pipeline = NULL; 4015 VkResult result = tu_pipeline_builder_build(&builder, &pipeline); 4016 tu_pipeline_builder_finish(&builder); 4017 4018 if (result == VK_SUCCESS) 4019 *pPipeline = tu_pipeline_to_handle(pipeline); 4020 else 4021 *pPipeline = VK_NULL_HANDLE; 4022 4023 return result; 4024} 4025 4026VKAPI_ATTR VkResult VKAPI_CALL 4027tu_CreateGraphicsPipelines(VkDevice device, 4028 VkPipelineCache pipelineCache, 4029 uint32_t count, 4030 const VkGraphicsPipelineCreateInfo *pCreateInfos, 4031 const VkAllocationCallbacks *pAllocator, 4032 VkPipeline *pPipelines) 4033{ 4034 VkResult final_result = VK_SUCCESS; 4035 uint32_t i = 0; 4036 4037 for (; i < count; i++) { 4038 VkResult result = tu_graphics_pipeline_create(device, pipelineCache, 4039 &pCreateInfos[i], pAllocator, 4040 &pPipelines[i]); 4041 4042 if (result != VK_SUCCESS) { 4043 final_result = result; 4044 pPipelines[i] = VK_NULL_HANDLE; 4045 4046 if (pCreateInfos[i].flags & 4047 VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) 4048 break; 4049 } 4050 } 4051 4052 for (; i < count; i++) 4053 pPipelines[i] = VK_NULL_HANDLE; 4054 4055 return final_result; 4056} 4057 4058static VkResult 4059tu_compute_pipeline_create(VkDevice device, 4060 VkPipelineCache pipelineCache, 4061 const VkComputePipelineCreateInfo *pCreateInfo, 4062 const VkAllocationCallbacks *pAllocator, 4063 VkPipeline *pPipeline) 4064{ 4065 TU_FROM_HANDLE(tu_device, dev, device); 4066 TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache); 4067 TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout); 4068 const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage; 4069 VkResult result; 4070 4071 cache = cache ? 
cache : dev->mem_cache; 4072 4073 struct tu_pipeline *pipeline; 4074 4075 *pPipeline = VK_NULL_HANDLE; 4076 4077 VkPipelineCreationFeedback pipeline_feedback = { 4078 .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, 4079 }; 4080 4081 const VkPipelineCreationFeedbackCreateInfo *creation_feedback = 4082 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); 4083 4084 int64_t pipeline_start = os_time_get_nano(); 4085 4086 pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline), 4087 VK_OBJECT_TYPE_PIPELINE); 4088 if (!pipeline) 4089 return VK_ERROR_OUT_OF_HOST_MEMORY; 4090 4091 pipeline->executables_mem_ctx = ralloc_context(NULL); 4092 util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx); 4093 4094 struct tu_shader_key key = { }; 4095 tu_shader_key_init(&key, stage_info, dev); 4096 4097 void *pipeline_mem_ctx = ralloc_context(NULL); 4098 4099 unsigned char pipeline_sha1[20]; 4100 tu_hash_compute(pipeline_sha1, stage_info, layout, &key, dev->compiler); 4101 4102 struct tu_compiled_shaders *compiled = NULL; 4103 4104 const bool executable_info = pCreateInfo->flags & 4105 VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; 4106 4107 bool application_cache_hit = false; 4108 4109 if (!executable_info) { 4110 compiled = 4111 tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1), 4112 &application_cache_hit); 4113 } 4114 4115 if (application_cache_hit && cache != dev->mem_cache) { 4116 pipeline_feedback.flags |= 4117 VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; 4118 } 4119 4120 if (tu6_shared_constants_enable(layout, dev->compiler)) { 4121 pipeline->shared_consts = (struct tu_push_constant_range) { 4122 .lo = 0, 4123 .dwords = layout->push_constant_size / 4, 4124 }; 4125 } 4126 4127 char *nir_initial_disasm = NULL; 4128 4129 if (!compiled) { 4130 if (pCreateInfo->flags & 4131 VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) { 4132 result = VK_PIPELINE_COMPILE_REQUIRED; 4133 goto fail; 4134 } 4135 4136 struct ir3_shader_key ir3_key = {}; 4137 4138 nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, stage_info, 4139 MESA_SHADER_COMPUTE); 4140 4141 nir_initial_disasm = executable_info ? 
4142 nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL; 4143 4144 struct tu_shader *shader = 4145 tu_shader_create(dev, nir, &key, layout, pAllocator); 4146 if (!shader) { 4147 result = VK_ERROR_OUT_OF_HOST_MEMORY; 4148 goto fail; 4149 } 4150 4151 compiled = tu_shaders_init(dev, &pipeline_sha1, sizeof(pipeline_sha1)); 4152 if (!compiled) { 4153 tu_shader_destroy(dev, shader, pAllocator); 4154 result = VK_ERROR_OUT_OF_HOST_MEMORY; 4155 goto fail; 4156 } 4157 4158 compiled->active_desc_sets = shader->active_desc_sets; 4159 compiled->push_consts[MESA_SHADER_COMPUTE] = shader->push_consts; 4160 4161 struct ir3_shader_variant *v = 4162 ir3_shader_create_variant(shader->ir3_shader, &ir3_key, executable_info); 4163 4164 tu_shader_destroy(dev, shader, pAllocator); 4165 4166 if (!v) { 4167 result = VK_ERROR_OUT_OF_HOST_MEMORY; 4168 goto fail; 4169 } 4170 4171 compiled->variants[MESA_SHADER_COMPUTE] = v; 4172 4173 compiled = tu_pipeline_cache_insert(cache, compiled); 4174 } 4175 4176 pipeline_feedback.duration = os_time_get_nano() - pipeline_start; 4177 4178 if (creation_feedback) { 4179 *creation_feedback->pPipelineCreationFeedback = pipeline_feedback; 4180 assert(creation_feedback->pipelineStageCreationFeedbackCount == 1); 4181 creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback; 4182 } 4183 4184 pipeline->active_desc_sets = compiled->active_desc_sets; 4185 4186 struct ir3_shader_variant *v = compiled->variants[MESA_SHADER_COMPUTE]; 4187 4188 tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE], 4189 &compiled->push_consts[MESA_SHADER_COMPUTE], v); 4190 4191 result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, v); 4192 if (result != VK_SUCCESS) 4193 goto fail; 4194 4195 uint64_t shader_iova = tu_upload_variant(pipeline, v); 4196 4197 struct tu_pvtmem_config pvtmem; 4198 tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave); 4199 4200 for (int i = 0; i < 3; i++) 4201 pipeline->compute.local_size[i] = v->local_size[i]; 4202 4203 pipeline->compute.subgroup_size = v->info.double_threadsize ? 
128 : 64; 4204 4205 struct tu_cs prog_cs; 4206 uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v); 4207 tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs); 4208 tu6_emit_cs_config(&prog_cs, v, &pvtmem, shader_iova); 4209 pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); 4210 4211 tu6_emit_load_state(pipeline, layout, true); 4212 4213 tu_append_executable(pipeline, v, nir_initial_disasm); 4214 4215 vk_pipeline_cache_object_unref(&compiled->base); 4216 ralloc_free(pipeline_mem_ctx); 4217 4218 *pPipeline = tu_pipeline_to_handle(pipeline); 4219 4220 return VK_SUCCESS; 4221 4222fail: 4223 if (compiled) 4224 vk_pipeline_cache_object_unref(&compiled->base); 4225 4226 ralloc_free(pipeline_mem_ctx); 4227 4228 vk_object_free(&dev->vk, pAllocator, pipeline); 4229 4230 return result; 4231} 4232 4233VKAPI_ATTR VkResult VKAPI_CALL 4234tu_CreateComputePipelines(VkDevice device, 4235 VkPipelineCache pipelineCache, 4236 uint32_t count, 4237 const VkComputePipelineCreateInfo *pCreateInfos, 4238 const VkAllocationCallbacks *pAllocator, 4239 VkPipeline *pPipelines) 4240{ 4241 VkResult final_result = VK_SUCCESS; 4242 uint32_t i = 0; 4243 4244 for (; i < count; i++) { 4245 VkResult result = tu_compute_pipeline_create(device, pipelineCache, 4246 &pCreateInfos[i], 4247 pAllocator, &pPipelines[i]); 4248 if (result != VK_SUCCESS) { 4249 final_result = result; 4250 pPipelines[i] = VK_NULL_HANDLE; 4251 4252 if (pCreateInfos[i].flags & 4253 VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) 4254 break; 4255 } 4256 } 4257 4258 for (; i < count; i++) 4259 pPipelines[i] = VK_NULL_HANDLE; 4260 4261 return final_result; 4262} 4263 4264VKAPI_ATTR void VKAPI_CALL 4265tu_DestroyPipeline(VkDevice _device, 4266 VkPipeline _pipeline, 4267 const VkAllocationCallbacks *pAllocator) 4268{ 4269 TU_FROM_HANDLE(tu_device, dev, _device); 4270 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline); 4271 4272 if (!_pipeline) 4273 return; 4274 4275 tu_pipeline_finish(pipeline, dev, pAllocator); 4276 vk_object_free(&dev->vk, pAllocator, pipeline); 4277} 4278 4279#define WRITE_STR(field, ...) 
({ \ 4280 memset(field, 0, sizeof(field)); \ 4281 UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \ 4282 assert(_i > 0 && _i < sizeof(field)); \ 4283}) 4284 4285static const struct tu_pipeline_executable * 4286tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index) 4287{ 4288 assert(index < util_dynarray_num_elements(&pipeline->executables, 4289 struct tu_pipeline_executable)); 4290 return util_dynarray_element( 4291 &pipeline->executables, struct tu_pipeline_executable, index); 4292} 4293 4294VKAPI_ATTR VkResult VKAPI_CALL 4295tu_GetPipelineExecutablePropertiesKHR( 4296 VkDevice _device, 4297 const VkPipelineInfoKHR* pPipelineInfo, 4298 uint32_t* pExecutableCount, 4299 VkPipelineExecutablePropertiesKHR* pProperties) 4300{ 4301 TU_FROM_HANDLE(tu_device, dev, _device); 4302 TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline); 4303 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, 4304 pProperties, pExecutableCount); 4305 4306 util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) { 4307 vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) { 4308 gl_shader_stage stage = exe->stage; 4309 props->stages = mesa_to_vk_shader_stage(stage); 4310 4311 if (!exe->is_binning) 4312 WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage)); 4313 else 4314 WRITE_STR(props->name, "Binning VS"); 4315 4316 WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage)); 4317 4318 props->subgroupSize = 4319 dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1); 4320 } 4321 } 4322 4323 return vk_outarray_status(&out); 4324} 4325 4326VKAPI_ATTR VkResult VKAPI_CALL 4327tu_GetPipelineExecutableStatisticsKHR( 4328 VkDevice _device, 4329 const VkPipelineExecutableInfoKHR* pExecutableInfo, 4330 uint32_t* pStatisticCount, 4331 VkPipelineExecutableStatisticKHR* pStatistics) 4332{ 4333 TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline); 4334 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, 4335 pStatistics, pStatisticCount); 4336 4337 const struct tu_pipeline_executable *exe = 4338 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); 4339 4340 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4341 WRITE_STR(stat->name, "Max Waves Per Core"); 4342 WRITE_STR(stat->description, 4343 "Maximum number of simultaneous waves per core."); 4344 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4345 stat->value.u64 = exe->stats.max_waves; 4346 } 4347 4348 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4349 WRITE_STR(stat->name, "Instruction Count"); 4350 WRITE_STR(stat->description, 4351 "Total number of IR3 instructions in the final generated " 4352 "shader executable."); 4353 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4354 stat->value.u64 = exe->stats.instrs_count; 4355 } 4356 4357 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4358 WRITE_STR(stat->name, "Code size"); 4359 WRITE_STR(stat->description, 4360 "Total number of dwords in the final generated " 4361 "shader executable."); 4362 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4363 stat->value.u64 = exe->stats.sizedwords; 4364 } 4365 4366 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4367 WRITE_STR(stat->name, "NOPs Count"); 4368 WRITE_STR(stat->description, 4369 "Number of NOP instructions in the final generated " 4370 "shader 
executable."); 4371 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4372 stat->value.u64 = exe->stats.nops_count; 4373 } 4374 4375 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4376 WRITE_STR(stat->name, "MOV Count"); 4377 WRITE_STR(stat->description, 4378 "Number of MOV instructions in the final generated " 4379 "shader executable."); 4380 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4381 stat->value.u64 = exe->stats.mov_count; 4382 } 4383 4384 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4385 WRITE_STR(stat->name, "COV Count"); 4386 WRITE_STR(stat->description, 4387 "Number of COV instructions in the final generated " 4388 "shader executable."); 4389 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4390 stat->value.u64 = exe->stats.cov_count; 4391 } 4392 4393 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4394 WRITE_STR(stat->name, "Registers used"); 4395 WRITE_STR(stat->description, 4396 "Number of registers used in the final generated " 4397 "shader executable."); 4398 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4399 stat->value.u64 = exe->stats.max_reg + 1; 4400 } 4401 4402 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4403 WRITE_STR(stat->name, "Half-registers used"); 4404 WRITE_STR(stat->description, 4405 "Number of half-registers used in the final generated " 4406 "shader executable."); 4407 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4408 stat->value.u64 = exe->stats.max_half_reg + 1; 4409 } 4410 4411 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4412 WRITE_STR(stat->name, "Instructions with SS sync bit"); 4413 WRITE_STR(stat->description, 4414 "SS bit is set for instructions which depend on a result " 4415 "of \"long\" instructions to prevent RAW hazard."); 4416 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4417 stat->value.u64 = exe->stats.ss; 4418 } 4419 4420 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4421 WRITE_STR(stat->name, "Instructions with SY sync bit"); 4422 WRITE_STR(stat->description, 4423 "SY bit is set for instructions which depend on a result " 4424 "of loads from global memory to prevent RAW hazard."); 4425 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4426 stat->value.u64 = exe->stats.sy; 4427 } 4428 4429 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4430 WRITE_STR(stat->name, "Estimated cycles stalled on SS"); 4431 WRITE_STR(stat->description, 4432 "A better metric to estimate the impact of SS syncs."); 4433 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4434 stat->value.u64 = exe->stats.sstall; 4435 } 4436 4437 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4438 WRITE_STR(stat->name, "Estimated cycles stalled on SY"); 4439 WRITE_STR(stat->description, 4440 "A better metric to estimate the impact of SY syncs."); 4441 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4442 stat->value.u64 = exe->stats.systall; 4443 } 4444 4445 for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) { 4446 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4447 WRITE_STR(stat->name, "cat%d instructions", i); 4448 WRITE_STR(stat->description, 4449 "Number of cat%d instructions.", i); 4450 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4451 
stat->value.u64 = exe->stats.instrs_per_cat[i]; 4452 } 4453 } 4454 4455 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4456 WRITE_STR(stat->name, "STP Count"); 4457 WRITE_STR(stat->description, 4458 "Number of STore Private instructions in the final generated " 4459 "shader executable."); 4460 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4461 stat->value.u64 = exe->stats.stp_count; 4462 } 4463 4464 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { 4465 WRITE_STR(stat->name, "LDP Count"); 4466 WRITE_STR(stat->description, 4467 "Number of LoaD Private instructions in the final generated " 4468 "shader executable."); 4469 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; 4470 stat->value.u64 = exe->stats.ldp_count; 4471 } 4472 4473 return vk_outarray_status(&out); 4474} 4475 4476static bool 4477write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, 4478 const char *data) 4479{ 4480 ir->isText = VK_TRUE; 4481 4482 size_t data_len = strlen(data) + 1; 4483 4484 if (ir->pData == NULL) { 4485 ir->dataSize = data_len; 4486 return true; 4487 } 4488 4489 strncpy(ir->pData, data, ir->dataSize); 4490 if (ir->dataSize < data_len) 4491 return false; 4492 4493 ir->dataSize = data_len; 4494 return true; 4495} 4496 4497VKAPI_ATTR VkResult VKAPI_CALL 4498tu_GetPipelineExecutableInternalRepresentationsKHR( 4499 VkDevice _device, 4500 const VkPipelineExecutableInfoKHR* pExecutableInfo, 4501 uint32_t* pInternalRepresentationCount, 4502 VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations) 4503{ 4504 TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline); 4505 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, 4506 pInternalRepresentations, pInternalRepresentationCount); 4507 bool incomplete_text = false; 4508 4509 const struct tu_pipeline_executable *exe = 4510 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); 4511 4512 if (exe->nir_from_spirv) { 4513 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { 4514 WRITE_STR(ir->name, "NIR from SPIRV"); 4515 WRITE_STR(ir->description, 4516 "Initial NIR before any optimizations"); 4517 4518 if (!write_ir_text(ir, exe->nir_from_spirv)) 4519 incomplete_text = true; 4520 } 4521 } 4522 4523 if (exe->nir_final) { 4524 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { 4525 WRITE_STR(ir->name, "Final NIR"); 4526 WRITE_STR(ir->description, 4527 "Final NIR before going into the back-end compiler"); 4528 4529 if (!write_ir_text(ir, exe->nir_final)) 4530 incomplete_text = true; 4531 } 4532 } 4533 4534 if (exe->disasm) { 4535 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { 4536 WRITE_STR(ir->name, "IR3 Assembly"); 4537 WRITE_STR(ir->description, 4538 "Final IR3 assembly for the generated shader binary"); 4539 4540 if (!write_ir_text(ir, exe->disasm)) 4541 incomplete_text = true; 4542 } 4543 } 4544 4545 return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out); 4546} 4547
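
/* For reference, a minimal application-side sketch (illustration only, not
 * part of this driver; assumes <vulkan/vulkan.h>, <stdio.h>, <stdlib.h>, a
 * device with pipelineExecutableInfo enabled, and a pipeline created with
 * VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) of the query
 * pattern that write_ir_text() above supports, including the VK_INCOMPLETE
 * truncation case:
 *
 *    VkPipelineExecutableInfoKHR exe_info = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
 *       .pipeline = pipeline,
 *       .executableIndex = 0,
 *    };
 *
 *    // First call: pInternalRepresentations == NULL, so only the count is
 *    // returned.
 *    uint32_t ir_count = 0;
 *    vkGetPipelineExecutableInternalRepresentationsKHR(device, &exe_info,
 *                                                      &ir_count, NULL);
 *
 *    VkPipelineExecutableInternalRepresentationKHR *irs =
 *       calloc(ir_count, sizeof(*irs));
 *    for (uint32_t i = 0; i < ir_count; i++)
 *       irs[i].sType =
 *          VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR;
 *
 *    // Second call with pData == NULL fills in dataSize for each IR...
 *    vkGetPipelineExecutableInternalRepresentationsKHR(device, &exe_info,
 *                                                      &ir_count, irs);
 *
 *    // ...then a third call with allocated pData copies the text. If a
 *    // buffer is too small, the driver truncates it and returns
 *    // VK_INCOMPLETE (see write_ir_text()).
 *    for (uint32_t i = 0; i < ir_count; i++)
 *       irs[i].pData = malloc(irs[i].dataSize);
 *    VkResult res =
 *       vkGetPipelineExecutableInternalRepresentationsKHR(device, &exe_info,
 *                                                         &ir_count, irs);
 *    if (res == VK_SUCCESS) {
 *       for (uint32_t i = 0; i < ir_count; i++)
 *          printf("%s:\n%s\n", irs[i].name, (const char *)irs[i].pData);
 *    }
 */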