1/* 2 * Copyright © 2016 Red Hat. 3 * Copyright © 2016 Bas Nieuwenhuizen 4 * 5 * based in part on anv driver which is: 6 * Copyright © 2015 Intel Corporation 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the "Software"), 10 * to deal in the Software without restriction, including without limitation 11 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 * and/or sell copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the next 16 * paragraph) shall be included in all copies or substantial portions of the 17 * Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 25 * IN THE SOFTWARE. 26 */ 27 28#ifndef RADV_SHADER_H 29#define RADV_SHADER_H 30 31#include "ac_binary.h" 32#include "ac_shader_util.h" 33 34#include "amd_family.h" 35#include "radv_constants.h" 36 37#include "nir/nir.h" 38#include "vulkan/runtime/vk_object.h" 39#include "vulkan/runtime/vk_shader_module.h" 40#include "vulkan/vulkan.h" 41 42#include "aco_shader_info.h" 43 44#define RADV_VERT_ATTRIB_MAX MAX2(VERT_ATTRIB_MAX, VERT_ATTRIB_GENERIC0 + MAX_VERTEX_ATTRIBS) 45 46struct radv_physical_device; 47struct radv_device; 48struct radv_pipeline; 49struct radv_pipeline_cache; 50struct radv_pipeline_key; 51struct radv_shader_args; 52struct radv_vs_input_state; 53struct radv_shader_args; 54 55enum radv_vs_input_alpha_adjust { 56 ALPHA_ADJUST_NONE = 0, 57 ALPHA_ADJUST_SNORM = 1, 58 ALPHA_ADJUST_SSCALED = 2, 59 ALPHA_ADJUST_SINT = 3, 60}; 61 62struct radv_pipeline_key { 63 uint32_t has_multiview_view_index : 1; 64 uint32_t optimisations_disabled : 1; 65 uint32_t invariant_geom : 1; 66 uint32_t use_ngg : 1; 67 uint32_t adjust_frag_coord_z : 1; 68 uint32_t disable_aniso_single_level : 1; 69 uint32_t disable_sinking_load_input_fs : 1; 70 uint32_t image_2d_view_of_3d : 1; 71 uint32_t primitives_generated_query : 1; 72 73 struct { 74 uint32_t instance_rate_inputs; 75 uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; 76 uint8_t vertex_attribute_formats[MAX_VERTEX_ATTRIBS]; 77 uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS]; 78 uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS]; 79 uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS]; 80 uint8_t vertex_binding_align[MAX_VBS]; 81 enum radv_vs_input_alpha_adjust vertex_alpha_adjust[MAX_VERTEX_ATTRIBS]; 82 uint32_t vertex_post_shuffle; 83 uint32_t provoking_vtx_last : 1; 84 uint32_t dynamic_input_state : 1; 85 uint8_t topology; 86 } vs; 87 88 struct { 89 unsigned tess_input_vertices; 90 } tcs; 91 92 struct { 93 uint32_t col_format; 94 uint32_t is_int8; 95 uint32_t is_int10; 96 uint32_t cb_target_mask; 97 uint8_t log2_ps_iter_samples; 98 uint8_t num_samples; 99 bool mrt0_is_dual_src; 100 101 bool lower_discard_to_demote; 102 uint8_t enable_mrt_output_nan_fixup; 103 bool force_vrs_enabled; 104 105 /* Used to export alpha through MRTZ for alpha-to-coverage (GFX11+). */ 106 bool alpha_to_coverage_via_mrtz; 107 108 bool has_epilog; 109 } ps; 110 111 struct { 112 /* Non-zero if a required subgroup size is specified via 113 * VK_EXT_subgroup_size_control. 114 */ 115 uint8_t compute_subgroup_size; 116 bool require_full_subgroups; 117 } cs; 118}; 119 120struct radv_nir_compiler_options { 121 struct radv_pipeline_key key; 122 bool robust_buffer_access; 123 bool dump_shader; 124 bool dump_preoptir; 125 bool record_ir; 126 bool record_stats; 127 bool check_ir; 128 bool has_ls_vgpr_init_bug; 129 uint8_t enable_mrt_output_nan_fixup; 130 bool wgp_mode; 131 enum radeon_family family; 132 enum amd_gfx_level gfx_level; 133 uint32_t address32_hi; 134 bool has_3d_cube_border_color_mipmap; 135 136 struct { 137 void (*func)(void *private_data, enum aco_compiler_debug_level level, const char *message); 138 void *private_data; 139 } debug; 140}; 141 142enum radv_ud_index { 143 AC_UD_SCRATCH_RING_OFFSETS = 0, 144 AC_UD_PUSH_CONSTANTS = 1, 145 AC_UD_INLINE_PUSH_CONSTANTS = 2, 146 AC_UD_INDIRECT_DESCRIPTOR_SETS = 3, 147 AC_UD_VIEW_INDEX = 4, 148 AC_UD_STREAMOUT_BUFFERS = 5, 149 AC_UD_NGG_QUERY_STATE = 6, 150 AC_UD_NGG_CULLING_SETTINGS = 7, 151 AC_UD_NGG_VIEWPORT = 8, 152 AC_UD_FORCE_VRS_RATES = 9, 153 AC_UD_TASK_RING_ENTRY = 10, 154 AC_UD_SHADER_START = 11, 155 AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, 156 AC_UD_VS_BASE_VERTEX_START_INSTANCE, 157 AC_UD_VS_PROLOG_INPUTS, 158 AC_UD_VS_MAX_UD, 159 AC_UD_PS_EPILOG_PC, 160 AC_UD_PS_MAX_UD, 161 AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START, 162 AC_UD_CS_SBT_DESCRIPTORS, 163 AC_UD_CS_RAY_LAUNCH_SIZE_ADDR, 164 AC_UD_CS_TASK_RING_OFFSETS, 165 AC_UD_CS_TASK_DRAW_ID, 166 AC_UD_CS_TASK_IB, 167 AC_UD_CS_MAX_UD, 168 AC_UD_GS_MAX_UD, 169 AC_UD_TCS_MAX_UD, 170 AC_UD_TES_MAX_UD, 171 AC_UD_MAX_UD = AC_UD_TCS_MAX_UD, 172}; 173 174struct radv_stream_output { 175 uint8_t location; 176 uint8_t buffer; 177 uint16_t offset; 178 uint8_t component_mask; 179 uint8_t stream; 180}; 181 182struct radv_streamout_info { 183 uint16_t num_outputs; 184 struct radv_stream_output outputs[MAX_SO_OUTPUTS]; 185 uint16_t strides[MAX_SO_BUFFERS]; 186 uint32_t enabled_stream_buffers_mask; 187}; 188 189struct radv_userdata_info { 190 int8_t sgpr_idx; 191 uint8_t num_sgprs; 192}; 193 194struct radv_userdata_locations { 195 struct radv_userdata_info descriptor_sets[MAX_SETS]; 196 struct radv_userdata_info shader_data[AC_UD_MAX_UD]; 197 uint32_t descriptor_sets_enabled; 198}; 199 200struct radv_vs_output_info { 201 uint8_t vs_output_param_offset[VARYING_SLOT_MAX]; 202 uint8_t clip_dist_mask; 203 uint8_t cull_dist_mask; 204 uint8_t param_exports; 205 uint8_t prim_param_exports; 206 bool writes_pointsize; 207 bool writes_layer; 208 bool writes_layer_per_primitive; 209 bool writes_viewport_index; 210 bool writes_viewport_index_per_primitive; 211 bool writes_primitive_shading_rate; 212 bool writes_primitive_shading_rate_per_primitive; 213 bool export_prim_id; 214 bool export_clip_dists; 215 unsigned pos_exports; 216}; 217 218struct radv_es_output_info { 219 uint32_t esgs_itemsize; 220}; 221 222struct gfx9_gs_info { 223 uint32_t vgt_gs_onchip_cntl; 224 uint32_t vgt_gs_max_prims_per_subgroup; 225 uint32_t vgt_esgs_ring_itemsize; 226 uint32_t lds_size; 227}; 228 229struct gfx10_ngg_info { 230 uint16_t ngg_emit_size; /* in dwords */ 231 uint32_t hw_max_esverts; 232 uint32_t max_gsprims; 233 uint32_t max_out_verts; 234 uint32_t prim_amp_factor; 235 uint32_t vgt_esgs_ring_itemsize; 236 uint32_t esgs_ring_size; 237 bool max_vert_out_per_gs_instance; 238 bool enable_vertex_grouping; 239}; 240 241struct radv_shader_info { 242 uint64_t inline_push_constant_mask; 243 bool can_inline_all_push_constants; 244 bool loads_push_constants; 245 bool loads_dynamic_offsets; 246 uint32_t desc_set_used_mask; 247 bool uses_view_index; 248 bool uses_invocation_id; 249 bool uses_prim_id; 250 uint8_t wave_size; 251 uint8_t ballot_bit_size; 252 struct radv_userdata_locations user_sgprs_locs; 253 bool is_ngg; 254 bool is_ngg_passthrough; 255 bool has_ngg_culling; 256 bool has_ngg_early_prim_export; 257 uint32_t num_lds_blocks_when_not_culling; 258 uint32_t num_tess_patches; 259 unsigned workgroup_size; 260 bool force_vrs_per_vertex; 261 struct { 262 uint8_t input_usage_mask[RADV_VERT_ATTRIB_MAX]; 263 uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; 264 bool needs_draw_id; 265 bool needs_instance_id; 266 struct radv_vs_output_info outinfo; 267 struct radv_es_output_info es_info; 268 bool as_es; 269 bool as_ls; 270 bool tcs_in_out_eq; 271 uint64_t tcs_temp_only_input_mask; 272 uint8_t num_linked_outputs; 273 bool needs_base_instance; 274 bool use_per_attribute_vb_descs; 275 uint32_t vb_desc_usage_mask; 276 bool has_prolog; 277 bool dynamic_inputs; 278 } vs; 279 struct { 280 uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; 281 uint8_t num_stream_output_components[4]; 282 uint8_t output_streams[VARYING_SLOT_VAR31 + 1]; 283 uint8_t max_stream; 284 unsigned gsvs_vertex_size; 285 unsigned max_gsvs_emit_size; 286 unsigned vertices_in; 287 unsigned vertices_out; 288 unsigned output_prim; 289 unsigned invocations; 290 unsigned es_type; /* GFX9: VS or TES */ 291 uint8_t num_linked_inputs; 292 } gs; 293 struct { 294 uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; 295 struct radv_vs_output_info outinfo; 296 struct radv_es_output_info es_info; 297 bool as_es; 298 enum tess_primitive_mode _primitive_mode; 299 enum gl_tess_spacing spacing; 300 bool ccw; 301 bool point_mode; 302 uint8_t num_linked_inputs; 303 uint8_t num_linked_patch_inputs; 304 uint8_t num_linked_outputs; 305 } tes; 306 struct { 307 bool uses_sample_shading; 308 bool needs_sample_positions; 309 bool writes_memory; 310 bool writes_z; 311 bool writes_stencil; 312 bool writes_sample_mask; 313 bool has_pcoord; 314 bool prim_id_input; 315 bool layer_input; 316 bool viewport_index_input; 317 uint8_t num_input_clips_culls; 318 uint32_t input_mask; 319 uint32_t input_per_primitive_mask; 320 uint32_t flat_shaded_mask; 321 uint32_t explicit_shaded_mask; 322 uint32_t float16_shaded_mask; 323 uint32_t num_interp; 324 uint32_t num_prim_interp; 325 bool can_discard; 326 bool early_fragment_test; 327 bool post_depth_coverage; 328 bool reads_sample_mask_in; 329 bool reads_front_face; 330 bool reads_sample_id; 331 bool reads_frag_shading_rate; 332 bool reads_barycentric_model; 333 bool reads_persp_sample; 334 bool reads_persp_center; 335 bool reads_persp_centroid; 336 bool reads_linear_sample; 337 bool reads_linear_center; 338 bool reads_linear_centroid; 339 uint8_t reads_frag_coord_mask; 340 uint8_t reads_sample_pos_mask; 341 uint8_t depth_layout; 342 bool allow_flat_shading; 343 bool has_epilog; 344 unsigned spi_ps_input; 345 } ps; 346 struct { 347 bool uses_grid_size; 348 bool uses_block_id[3]; 349 bool uses_thread_id[3]; 350 bool uses_local_invocation_idx; 351 unsigned block_size[3]; 352 353 uint8_t subgroup_size; 354 355 bool uses_sbt; 356 bool uses_ray_launch_size; 357 bool uses_task_rings; 358 } cs; 359 struct { 360 uint64_t tes_inputs_read; 361 uint64_t tes_patch_inputs_read; 362 unsigned tcs_vertices_out; 363 uint32_t num_lds_blocks; 364 uint8_t num_linked_inputs; 365 uint8_t num_linked_outputs; 366 uint8_t num_linked_patch_outputs; 367 bool tes_reads_tess_factors : 1; 368 } tcs; 369 struct { 370 struct radv_vs_output_info outinfo; 371 enum shader_prim output_prim; 372 bool needs_ms_scratch_ring; 373 } ms; 374 375 struct radv_streamout_info so; 376 377 struct gfx9_gs_info gs_ring_info; 378 struct gfx10_ngg_info ngg_info; 379}; 380 381struct radv_vs_input_state { 382 uint32_t attribute_mask; 383 384 uint32_t instance_rate_inputs; 385 uint32_t nontrivial_divisors; 386 uint32_t zero_divisors; 387 uint32_t post_shuffle; 388 /* Having two separate fields instead of a single uint64_t makes it easier to remove attributes 389 * using bitwise arithmetic. 390 */ 391 uint32_t alpha_adjust_lo; 392 uint32_t alpha_adjust_hi; 393 394 uint8_t bindings[MAX_VERTEX_ATTRIBS]; 395 uint32_t divisors[MAX_VERTEX_ATTRIBS]; 396 uint32_t offsets[MAX_VERTEX_ATTRIBS]; 397 uint8_t formats[MAX_VERTEX_ATTRIBS]; 398 uint8_t format_align_req_minus_1[MAX_VERTEX_ATTRIBS]; 399 uint8_t format_sizes[MAX_VERTEX_ATTRIBS]; 400 401 bool bindings_match_attrib; 402}; 403 404struct radv_vs_prolog_key { 405 const struct radv_vs_input_state *state; 406 unsigned num_attributes; 407 uint32_t misaligned_mask; 408 bool as_ls; 409 bool is_ngg; 410 bool wave32; 411 gl_shader_stage next_stage; 412}; 413 414struct radv_ps_epilog_key { 415 uint32_t spi_shader_col_format; 416 417 /* Bitmasks, each bit represents one of the 8 MRTs. */ 418 uint8_t color_is_int8; 419 uint8_t color_is_int10; 420 uint8_t enable_mrt_output_nan_fixup; 421 422 bool wave32; 423}; 424 425enum radv_shader_binary_type { RADV_BINARY_TYPE_LEGACY, RADV_BINARY_TYPE_RTLD }; 426 427struct radv_shader_binary { 428 enum radv_shader_binary_type type; 429 gl_shader_stage stage; 430 bool is_gs_copy_shader; 431 432 struct ac_shader_config config; 433 struct radv_shader_info info; 434 435 /* Self-referential size so we avoid consistency issues. */ 436 uint32_t total_size; 437}; 438 439struct radv_shader_binary_legacy { 440 struct radv_shader_binary base; 441 unsigned code_size; 442 unsigned exec_size; 443 unsigned ir_size; 444 unsigned disasm_size; 445 unsigned stats_size; 446 447 /* data has size of stats_size + code_size + ir_size + disasm_size + 2, 448 * where the +2 is for 0 of the ir strings. */ 449 uint8_t data[0]; 450}; 451 452struct radv_shader_binary_rtld { 453 struct radv_shader_binary base; 454 unsigned elf_size; 455 unsigned llvm_ir_size; 456 uint8_t data[0]; 457}; 458 459struct radv_shader_part_binary { 460 uint8_t num_sgprs; 461 uint8_t num_vgprs; 462 uint8_t num_preserved_sgprs; 463 unsigned code_size; 464 unsigned disasm_size; 465 uint8_t data[0]; 466}; 467 468struct radv_shader_arena { 469 struct list_head list; 470 struct list_head entries; 471 struct radeon_winsys_bo *bo; 472 char *ptr; 473}; 474 475union radv_shader_arena_block { 476 struct list_head pool; 477 struct { 478 /* List of blocks in the arena, sorted by address. */ 479 struct list_head list; 480 /* For holes, a list_head for the free-list. For allocations, freelist.prev=NULL and 481 * freelist.next is a pointer associated with the allocation. 482 */ 483 struct list_head freelist; 484 struct radv_shader_arena *arena; 485 uint32_t offset; 486 uint32_t size; 487 }; 488}; 489 490struct radv_shader { 491 uint32_t ref_count; 492 493 uint64_t va; 494 495 struct ac_shader_config config; 496 uint8_t *code_ptr; 497 uint32_t code_size; 498 uint32_t exec_size; 499 struct radv_shader_info info; 500 501 /* debug only */ 502 char *spirv; 503 uint32_t spirv_size; 504 char *nir_string; 505 char *disasm_string; 506 char *ir_string; 507 uint32_t *statistics; 508}; 509 510struct radv_trap_handler_shader { 511 struct radeon_winsys_bo *bo; 512 union radv_shader_arena_block *alloc; 513}; 514 515struct radv_shader_part { 516 struct radeon_winsys_bo *bo; 517 union radv_shader_arena_block *alloc; 518 uint32_t rsrc1; 519 uint8_t num_preserved_sgprs; 520 bool nontrivial_divisors; 521 522 /* debug only */ 523 char *disasm_string; 524}; 525 526struct radv_pipeline_layout; 527 528void radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively, bool allow_copies); 529void radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets); 530bool radv_nir_lower_ycbcr_textures(nir_shader *shader, const struct radv_pipeline_layout *layout); 531 532bool radv_nir_lower_ray_queries(nir_shader *shader, struct radv_device *device); 533 534void radv_nir_apply_pipeline_layout(nir_shader *shader, struct radv_device *device, 535 const struct radv_pipeline_layout *layout, 536 const struct radv_shader_info *info, 537 const struct radv_shader_args *args); 538 539struct radv_pipeline_stage; 540 541nir_shader *radv_shader_spirv_to_nir(struct radv_device *device, 542 const struct radv_pipeline_stage *stage, 543 const struct radv_pipeline_key *key); 544 545void radv_nir_lower_abi(nir_shader *shader, enum amd_gfx_level gfx_level, 546 const struct radv_shader_info *info, const struct radv_shader_args *args, 547 const struct radv_pipeline_key *pl_key, bool use_llvm); 548 549void radv_init_shader_arenas(struct radv_device *device); 550void radv_destroy_shader_arenas(struct radv_device *device); 551 552struct radv_pipeline_shader_stack_size; 553 554VkResult radv_create_shaders(struct radv_pipeline *pipeline, 555 struct radv_pipeline_layout *pipeline_layout, 556 struct radv_device *device, struct radv_pipeline_cache *cache, 557 const struct radv_pipeline_key *key, 558 const VkPipelineShaderStageCreateInfo *pStages, 559 uint32_t stageCount, 560 const VkPipelineCreateFlags flags, const uint8_t *custom_hash, 561 const VkPipelineCreationFeedbackCreateInfo *creation_feedback, 562 struct radv_pipeline_shader_stack_size **stack_sizes, 563 uint32_t *num_stack_sizes, 564 gl_shader_stage *last_vgt_api_stage); 565 566struct radv_shader_args; 567 568struct radv_shader *radv_shader_create(struct radv_device *device, 569 const struct radv_shader_binary *binary, 570 bool keep_shader_info, bool from_cache, 571 const struct radv_shader_args *args); 572struct radv_shader *radv_shader_nir_to_asm( 573 struct radv_device *device, struct radv_pipeline_stage *stage, struct nir_shader *const *shaders, 574 int shader_count, const struct radv_pipeline_key *key, bool keep_shader_info, bool keep_statistic_info, 575 struct radv_shader_binary **binary_out); 576 577bool radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_binary *binary, 578 struct radv_shader *shader, void *dest_ptr); 579 580union radv_shader_arena_block *radv_alloc_shader_memory(struct radv_device *device, uint32_t size, 581 void *ptr); 582void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc); 583 584struct radv_shader * 585radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir, 586 struct radv_shader_info *info, const struct radv_shader_args *args, 587 struct radv_shader_binary **binary_out, 588 bool keep_shader_info, bool keep_statistic_info, 589 bool disable_optimizations); 590 591struct radv_trap_handler_shader *radv_create_trap_handler_shader(struct radv_device *device); 592uint64_t radv_trap_handler_shader_get_va(const struct radv_trap_handler_shader *trap); 593void radv_trap_handler_shader_destroy(struct radv_device *device, 594 struct radv_trap_handler_shader *trap); 595 596struct radv_shader_part *radv_create_vs_prolog(struct radv_device *device, 597 const struct radv_vs_prolog_key *key); 598 599struct radv_shader_part *radv_create_ps_epilog(struct radv_device *device, 600 const struct radv_ps_epilog_key *key); 601 602void radv_shader_destroy(struct radv_device *device, struct radv_shader *shader); 603 604void radv_shader_part_destroy(struct radv_device *device, struct radv_shader_part *shader_part); 605 606uint64_t radv_shader_get_va(const struct radv_shader *shader); 607struct radv_shader *radv_find_shader(struct radv_device *device, uint64_t pc); 608 609unsigned radv_get_max_waves(const struct radv_device *device, struct radv_shader *shader, 610 gl_shader_stage stage); 611 612const char *radv_get_shader_name(const struct radv_shader_info *info, gl_shader_stage stage); 613 614unsigned radv_compute_spi_ps_input(const struct radv_pipeline_key *pipeline_key, 615 const struct radv_shader_info *info); 616 617bool radv_can_dump_shader(struct radv_device *device, nir_shader *nir, bool meta_shader); 618 619bool radv_can_dump_shader_stats(struct radv_device *device, nir_shader *nir); 620 621VkResult radv_dump_shader_stats(struct radv_device *device, struct radv_pipeline *pipeline, 622 gl_shader_stage stage, FILE *output); 623 624static inline unsigned 625calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_vertices, 626 unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, 627 unsigned tcs_num_patches, unsigned tcs_num_outputs, 628 unsigned tcs_num_patch_outputs) 629{ 630 unsigned input_vertex_size = tcs_num_inputs * 16; 631 unsigned output_vertex_size = tcs_num_outputs * 16; 632 633 unsigned input_patch_size = tcs_num_input_vertices * input_vertex_size; 634 635 unsigned pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size; 636 unsigned output_patch_size = pervertex_output_patch_size + tcs_num_patch_outputs * 16; 637 638 unsigned output_patch0_offset = input_patch_size * tcs_num_patches; 639 640 unsigned lds_size = output_patch0_offset + output_patch_size * tcs_num_patches; 641 642 if (gfx_level >= GFX7) { 643 assert(lds_size <= 65536); 644 lds_size = align(lds_size, 512) / 512; 645 } else { 646 assert(lds_size <= 32768); 647 lds_size = align(lds_size, 256) / 256; 648 } 649 650 return lds_size; 651} 652 653static inline unsigned 654get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, 655 unsigned tcs_num_inputs, unsigned tcs_num_outputs, 656 unsigned tcs_num_patch_outputs, unsigned tess_offchip_block_dw_size, 657 enum amd_gfx_level gfx_level, enum radeon_family family) 658{ 659 uint32_t input_vertex_size = tcs_num_inputs * 16; 660 uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size; 661 uint32_t output_vertex_size = tcs_num_outputs * 16; 662 uint32_t pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size; 663 uint32_t output_patch_size = pervertex_output_patch_size + tcs_num_patch_outputs * 16; 664 665 /* Ensure that we only need one wave per SIMD so we don't need to check 666 * resource usage. Also ensures that the number of tcs in and out 667 * vertices per threadgroup are at most 256. 668 */ 669 unsigned num_patches = 64 / MAX2(tcs_num_input_vertices, tcs_num_output_vertices) * 4; 670 /* Make sure that the data fits in LDS. This assumes the shaders only 671 * use LDS for the inputs and outputs. 672 */ 673 unsigned hardware_lds_size = 32768; 674 675 /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single 676 * threadgroup, even though there is more than 32 KiB LDS. 677 * 678 * Test: dEQP-VK.tessellation.shader_input_output.barrier 679 */ 680 if (gfx_level >= GFX7 && family != CHIP_STONEY) 681 hardware_lds_size = 65536; 682 683 if (input_patch_size + output_patch_size) 684 num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); 685 /* Make sure the output data fits in the offchip buffer */ 686 if (output_patch_size) 687 num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / output_patch_size); 688 /* Not necessary for correctness, but improves performance. The 689 * specific value is taken from the proprietary driver. 690 */ 691 num_patches = MIN2(num_patches, 40); 692 693 /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */ 694 if (gfx_level == GFX6) { 695 unsigned one_wave = 64 / MAX2(tcs_num_input_vertices, tcs_num_output_vertices); 696 num_patches = MIN2(num_patches, one_wave); 697 } 698 return num_patches; 699} 700 701void radv_lower_io(struct radv_device *device, nir_shader *nir, bool is_mesh_shading); 702 703bool radv_lower_io_to_mem(struct radv_device *device, struct radv_pipeline_stage *stage, 704 const struct radv_pipeline_key *pl_key); 705 706void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_stage, 707 const struct radv_pipeline_key *pl_key); 708 709bool radv_consider_culling(const struct radv_physical_device *pdevice, struct nir_shader *nir, 710 uint64_t ps_inputs_read, unsigned num_vertices_per_primitive, 711 const struct radv_shader_info *info); 712 713void radv_get_nir_options(struct radv_physical_device *device); 714 715bool radv_force_primitive_shading_rate(nir_shader *nir, struct radv_device *device); 716 717bool radv_lower_fs_intrinsics(nir_shader *nir, const struct radv_pipeline_stage *fs_stage, 718 const struct radv_pipeline_key *key); 719 720#endif 721