1/* 2 * Copyright © 2010 - 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#ifndef BRW_COMPILER_H 25#define BRW_COMPILER_H 26 27#include <stdio.h> 28#include "c11/threads.h" 29#include "dev/intel_device_info.h" 30#include "util/macros.h" 31#include "util/ralloc.h" 32#include "util/u_math.h" 33#include "brw_isa_info.h" 34 35#ifdef __cplusplus 36extern "C" { 37#endif 38 39struct ra_regs; 40struct nir_shader; 41struct brw_program; 42struct shader_info; 43 44struct nir_shader_compiler_options; 45typedef struct nir_shader nir_shader; 46 47struct brw_compiler { 48 const struct intel_device_info *devinfo; 49 50 /* This lock must be taken if the compiler is to be modified in any way, 51 * including adding something to the ralloc child list. 52 */ 53 mtx_t mutex; 54 55 struct brw_isa_info isa; 56 57 struct { 58 struct ra_regs *regs; 59 60 /** 61 * Array of the ra classes for the unaligned contiguous register 62 * block sizes used. 63 */ 64 struct ra_class **classes; 65 } vec4_reg_set; 66 67 struct { 68 struct ra_regs *regs; 69 70 /** 71 * Array of the ra classes for the unaligned contiguous register 72 * block sizes used, indexed by register size. 73 */ 74 struct ra_class *classes[16]; 75 76 /** 77 * ra class for the aligned barycentrics we use for PLN, which doesn't 78 * appear in *classes. 79 */ 80 struct ra_class *aligned_bary_class; 81 } fs_reg_sets[3]; 82 83 void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4); 84 void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4); 85 86 bool scalar_stage[MESA_ALL_SHADER_STAGES]; 87 bool use_tcs_8_patch; 88 struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES]; 89 90 /** 91 * Apply workarounds for SIN and COS output range problems. 92 * This can negatively impact performance. 93 */ 94 bool precise_trig; 95 96 /** 97 * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State 98 * Base Address? (If not, it's a normal GPU address.) 99 */ 100 bool constant_buffer_0_is_relative; 101 102 /** 103 * Whether or not the driver supports NIR shader constants. This controls 104 * whether nir_opt_large_constants will be run. 105 */ 106 bool supports_shader_constants; 107 108 /** 109 * Whether or not the driver wants variable group size to be lowered by the 110 * back-end compiler. 111 */ 112 bool lower_variable_group_size; 113 114 /** 115 * Whether indirect UBO loads should use the sampler or go through the 116 * data/constant cache. For the sampler, UBO surface states have to be set 117 * up with VK_FORMAT_R32G32B32A32_FLOAT whereas if it's going through the 118 * constant or data cache, UBOs must use VK_FORMAT_RAW. 119 */ 120 bool indirect_ubos_use_sampler; 121 122 struct nir_shader *clc_shader; 123}; 124 125#define brw_shader_debug_log(compiler, data, fmt, ... ) do { \ 126 static unsigned id = 0; \ 127 compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__); \ 128} while (0) 129 130#define brw_shader_perf_log(compiler, data, fmt, ... ) do { \ 131 static unsigned id = 0; \ 132 compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__); \ 133} while (0) 134 135/** 136 * We use a constant subgroup size of 32. It really only needs to be a 137 * maximum and, since we do SIMD32 for compute shaders in some cases, it 138 * needs to be at least 32. SIMD8 and SIMD16 shaders will still claim a 139 * subgroup size of 32 but will act as if 16 or 24 of those channels are 140 * disabled. 141 */ 142#define BRW_SUBGROUP_SIZE 32 143 144static inline bool 145brw_shader_stage_is_bindless(gl_shader_stage stage) 146{ 147 return stage >= MESA_SHADER_RAYGEN && 148 stage <= MESA_SHADER_CALLABLE; 149} 150 151static inline bool 152brw_shader_stage_requires_bindless_resources(gl_shader_stage stage) 153{ 154 return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage); 155} 156 157/** 158 * Program key structures. 159 * 160 * When drawing, we look for the currently bound shaders in the program 161 * cache. This is essentially a hash table lookup, and these are the keys. 162 * 163 * Sometimes OpenGL features specified as state need to be simulated via 164 * shader code, due to a mismatch between the API and the hardware. This 165 * is often referred to as "non-orthagonal state" or "NOS". We store NOS 166 * in the program key so it's considered when searching for a program. If 167 * we haven't seen a particular combination before, we have to recompile a 168 * new specialized version. 169 * 170 * Shader compilation should not look up state in gl_context directly, but 171 * instead use the copy in the program key. This guarantees recompiles will 172 * happen correctly. 173 * 174 * @{ 175 */ 176 177enum PACKED gfx6_gather_sampler_wa { 178 WA_SIGN = 1, /* whether we need to sign extend */ 179 WA_8BIT = 2, /* if we have an 8bit format needing wa */ 180 WA_16BIT = 4, /* if we have a 16bit format needing wa */ 181}; 182 183#define BRW_MAX_SAMPLERS 32 184 185/* Provide explicit padding for each member, to ensure that the compiler 186 * initializes every bit in the shader cache keys. The keys will be compared 187 * with memcmp. 188 */ 189PRAGMA_DIAGNOSTIC_PUSH 190PRAGMA_DIAGNOSTIC_ERROR(-Wpadded) 191 192/** 193 * Sampler information needed by VS, WM, and GS program cache keys. 194 */ 195struct brw_sampler_prog_key_data { 196 /** 197 * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. 198 */ 199 uint16_t swizzles[BRW_MAX_SAMPLERS]; 200 201 uint32_t gl_clamp_mask[3]; 202 203 /** 204 * For RG32F, gather4's channel select is broken. 205 */ 206 uint32_t gather_channel_quirk_mask; 207 208 /** 209 * Whether this sampler uses the compressed multisample surface layout. 210 */ 211 uint32_t compressed_multisample_layout_mask; 212 213 /** 214 * Whether this sampler is using 16x multisampling. If so fetching from 215 * this sampler will be handled with a different instruction, ld2dms_w 216 * instead of ld2dms. 217 */ 218 uint32_t msaa_16; 219 220 /** 221 * For Sandybridge, which shader w/a we need for gather quirks. 222 */ 223 enum gfx6_gather_sampler_wa gfx6_gather_wa[BRW_MAX_SAMPLERS]; 224 225 /** 226 * Texture units that have a YUV image bound. 227 */ 228 uint32_t y_u_v_image_mask; 229 uint32_t y_uv_image_mask; 230 uint32_t yx_xuxv_image_mask; 231 uint32_t xy_uxvx_image_mask; 232 uint32_t ayuv_image_mask; 233 uint32_t xyuv_image_mask; 234 uint32_t bt709_mask; 235 uint32_t bt2020_mask; 236 237 /* Scale factor for each texture. */ 238 float scale_factors[BRW_MAX_SAMPLERS]; 239}; 240 241struct brw_base_prog_key { 242 unsigned program_string_id; 243 244 bool robust_buffer_access; 245 246 /** 247 * Apply workarounds for SIN and COS input range problems. 248 * This limits input range for SIN and COS to [-2p : 2p] to 249 * avoid precision issues. 250 */ 251 bool limit_trig_input_range; 252 unsigned padding:16; 253 254 struct brw_sampler_prog_key_data tex; 255}; 256 257/** 258 * The VF can't natively handle certain types of attributes, such as GL_FIXED 259 * or most 10_10_10_2 types. These flags enable various VS workarounds to 260 * "fix" attributes at the beginning of shaders. 261 */ 262#define BRW_ATTRIB_WA_COMPONENT_MASK 7 /* mask for GL_FIXED scale channel count */ 263#define BRW_ATTRIB_WA_NORMALIZE 8 /* normalize in shader */ 264#define BRW_ATTRIB_WA_BGRA 16 /* swap r/b channels in shader */ 265#define BRW_ATTRIB_WA_SIGN 32 /* interpret as signed in shader */ 266#define BRW_ATTRIB_WA_SCALE 64 /* interpret as scaled in shader */ 267 268/** 269 * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range 270 * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user 271 * input vertex attributes. In Vulkan, we expose up to 28 user vertex input 272 * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0. 273 */ 274#define MAX_GL_VERT_ATTRIB VERT_ATTRIB_MAX 275#define MAX_VK_VERT_ATTRIB (VERT_ATTRIB_GENERIC0 + 28) 276 277/** 278 * Max number of binding table entries used for stream output. 279 * 280 * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the 281 * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64. 282 * 283 * On Gfx6, the size of transform feedback data is limited not by the number 284 * of components but by the number of binding table entries we set aside. We 285 * use one binding table entry for a float, one entry for a vector, and one 286 * entry per matrix column. Since the only way we can communicate our 287 * transform feedback capabilities to the client is via 288 * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the 289 * worst case, in which all the varyings are floats, so we use up one binding 290 * table entry per component. Therefore we need to set aside at least 64 291 * binding table entries for use by transform feedback. 292 * 293 * Note: since we don't currently pack varyings, it is currently impossible 294 * for the client to actually use up all of these binding table entries--if 295 * all of their varyings were floats, they would run out of varying slots and 296 * fail to link. But that's a bug, so it seems prudent to go ahead and 297 * allocate the number of binding table entries we will need once the bug is 298 * fixed. 299 */ 300#define BRW_MAX_SOL_BINDINGS 64 301 302/** The program key for Vertex Shaders. */ 303struct brw_vs_prog_key { 304 struct brw_base_prog_key base; 305 306 /** 307 * Per-attribute workaround flags 308 * 309 * For each attribute, a combination of BRW_ATTRIB_WA_*. 310 * 311 * For OpenGL, where we expose a maximum of 16 user input attributes 312 * we only need up to VERT_ATTRIB_MAX slots, however, in Vulkan 313 * slots preceding VERT_ATTRIB_GENERIC0 are unused and we can 314 * expose up to 28 user input vertex attributes that are mapped to slots 315 * starting at VERT_ATTRIB_GENERIC0, so this array needs to be large 316 * enough to hold this many slots. 317 */ 318 uint8_t gl_attrib_wa_flags[MAX2(MAX_GL_VERT_ATTRIB, MAX_VK_VERT_ATTRIB)]; 319 320 /** 321 * For pre-Gfx6 hardware, a bitfield indicating which texture coordinates 322 * are going to be replaced with point coordinates (as a consequence of a 323 * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because 324 * our SF thread requires exact matching between VS outputs and FS inputs, 325 * these texture coordinates will need to be unconditionally included in 326 * the VUE, even if they aren't written by the vertex shader. 327 */ 328 uint8_t point_coord_replace; 329 unsigned clamp_pointsize:1; 330 331 bool copy_edgeflag:1; 332 333 bool clamp_vertex_color:1; 334 335 /** 336 * How many user clipping planes are being uploaded to the vertex shader as 337 * push constants. 338 * 339 * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to 340 * clip distances. 341 */ 342 unsigned nr_userclip_plane_consts:4; 343 344 uint32_t padding: 25; 345}; 346 347/** The program key for Tessellation Control Shaders. */ 348struct brw_tcs_prog_key 349{ 350 struct brw_base_prog_key base; 351 352 /** A bitfield of per-vertex outputs written. */ 353 uint64_t outputs_written; 354 355 enum tess_primitive_mode _tes_primitive_mode; 356 357 unsigned input_vertices; 358 359 /** A bitfield of per-patch outputs written. */ 360 uint32_t patch_outputs_written; 361 362 bool quads_workaround; 363 uint32_t padding:24; 364}; 365 366/** The program key for Tessellation Evaluation Shaders. */ 367struct brw_tes_prog_key 368{ 369 struct brw_base_prog_key base; 370 371 /** A bitfield of per-vertex inputs read. */ 372 uint64_t inputs_read; 373 374 /** A bitfield of per-patch inputs read. */ 375 uint32_t patch_inputs_read; 376 377 /** 378 * How many user clipping planes are being uploaded to the tessellation 379 * evaluation shader as push constants. 380 * 381 * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to 382 * clip distances. 383 */ 384 unsigned nr_userclip_plane_consts:4; 385 unsigned clamp_pointsize:1; 386 uint32_t padding:27; 387}; 388 389/** The program key for Geometry Shaders. */ 390struct brw_gs_prog_key 391{ 392 struct brw_base_prog_key base; 393 394 /** 395 * How many user clipping planes are being uploaded to the geometry shader 396 * as push constants. 397 * 398 * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to 399 * clip distances. 400 */ 401 unsigned nr_userclip_plane_consts:4; 402 unsigned clamp_pointsize:1; 403 unsigned padding:27; 404}; 405 406struct brw_task_prog_key 407{ 408 struct brw_base_prog_key base; 409}; 410 411struct brw_mesh_prog_key 412{ 413 struct brw_base_prog_key base; 414}; 415 416enum brw_sf_primitive { 417 BRW_SF_PRIM_POINTS = 0, 418 BRW_SF_PRIM_LINES = 1, 419 BRW_SF_PRIM_TRIANGLES = 2, 420 BRW_SF_PRIM_UNFILLED_TRIS = 3, 421}; 422 423struct brw_sf_prog_key { 424 uint64_t attrs; 425 bool contains_flat_varying; 426 unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ 427 uint8_t point_sprite_coord_replace; 428 enum brw_sf_primitive primitive:2; 429 bool do_twoside_color:1; 430 bool frontface_ccw:1; 431 bool do_point_sprite:1; 432 bool do_point_coord:1; 433 bool sprite_origin_lower_left:1; 434 bool userclip_active:1; 435 unsigned padding: 32; 436}; 437 438enum brw_clip_mode { 439 BRW_CLIP_MODE_NORMAL = 0, 440 BRW_CLIP_MODE_CLIP_ALL = 1, 441 BRW_CLIP_MODE_CLIP_NON_REJECTED = 2, 442 BRW_CLIP_MODE_REJECT_ALL = 3, 443 BRW_CLIP_MODE_ACCEPT_ALL = 4, 444 BRW_CLIP_MODE_KERNEL_CLIP = 5, 445}; 446 447enum brw_clip_fill_mode { 448 BRW_CLIP_FILL_MODE_LINE = 0, 449 BRW_CLIP_FILL_MODE_POINT = 1, 450 BRW_CLIP_FILL_MODE_FILL = 2, 451 BRW_CLIP_FILL_MODE_CULL = 3, 452}; 453 454/* Note that if unfilled primitives are being emitted, we have to fix 455 * up polygon offset and flatshading at this point: 456 */ 457struct brw_clip_prog_key { 458 uint64_t attrs; 459 float offset_factor; 460 float offset_units; 461 float offset_clamp; 462 bool contains_flat_varying; 463 bool contains_noperspective_varying; 464 unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ 465 unsigned primitive:4; 466 unsigned nr_userclip:4; 467 bool pv_first:1; 468 bool do_unfilled:1; 469 enum brw_clip_fill_mode fill_cw:2; /* includes cull information */ 470 enum brw_clip_fill_mode fill_ccw:2; /* includes cull information */ 471 bool offset_cw:1; 472 bool offset_ccw:1; 473 bool copy_bfc_cw:1; 474 bool copy_bfc_ccw:1; 475 enum brw_clip_mode clip_mode:3; 476 uint64_t padding:51; 477}; 478 479/* A big lookup table is used to figure out which and how many 480 * additional regs will inserted before the main payload in the WM 481 * program execution. These mainly relate to depth and stencil 482 * processing and the early-depth-test optimization. 483 */ 484enum brw_wm_iz_bits { 485 BRW_WM_IZ_PS_KILL_ALPHATEST_BIT = 0x1, 486 BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT = 0x2, 487 BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT = 0x4, 488 BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT = 0x8, 489 BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT = 0x10, 490 BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT = 0x20, 491 BRW_WM_IZ_BIT_MAX = 0x40 492}; 493 494enum brw_wm_aa_enable { 495 BRW_WM_AA_NEVER, 496 BRW_WM_AA_SOMETIMES, 497 BRW_WM_AA_ALWAYS 498}; 499 500/** The program key for Fragment/Pixel Shaders. */ 501struct brw_wm_prog_key { 502 struct brw_base_prog_key base; 503 504 uint64_t input_slots_valid; 505 float alpha_test_ref; 506 uint8_t color_outputs_valid; 507 508 /* Some collection of BRW_WM_IZ_* */ 509 uint8_t iz_lookup; 510 bool stats_wm:1; 511 bool flat_shade:1; 512 unsigned nr_color_regions:5; 513 bool emit_alpha_test:1; 514 enum compare_func alpha_test_func:3; /* < For Gfx4/5 MRT alpha test */ 515 bool alpha_test_replicate_alpha:1; 516 bool alpha_to_coverage:1; 517 bool clamp_fragment_color:1; 518 bool persample_interp:1; 519 bool multisample_fbo:1; 520 enum brw_wm_aa_enable line_aa:2; 521 bool force_dual_color_blend:1; 522 bool coherent_fb_fetch:1; 523 bool ignore_sample_mask_out:1; 524 bool coarse_pixel:1; 525 526 uint64_t padding:58; 527}; 528 529struct brw_cs_prog_key { 530 struct brw_base_prog_key base; 531}; 532 533struct brw_bs_prog_key { 534 struct brw_base_prog_key base; 535}; 536 537struct brw_ff_gs_prog_key { 538 uint64_t attrs; 539 540 /** 541 * Map from the index of a transform feedback binding table entry to the 542 * gl_varying_slot that should be streamed out through that binding table 543 * entry. 544 */ 545 unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS]; 546 547 /** 548 * Map from the index of a transform feedback binding table entry to the 549 * swizzles that should be used when streaming out data through that 550 * binding table entry. 551 */ 552 unsigned char transform_feedback_swizzles[BRW_MAX_SOL_BINDINGS]; 553 554 /** 555 * Hardware primitive type being drawn, e.g. _3DPRIM_TRILIST. 556 */ 557 unsigned primitive:8; 558 559 unsigned pv_first:1; 560 unsigned need_gs_prog:1; 561 562 /** 563 * Number of varyings that are output to transform feedback. 564 */ 565 unsigned num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ 566 uint64_t padding:47; 567}; 568 569/* brw_any_prog_key is any of the keys that map to an API stage */ 570union brw_any_prog_key { 571 struct brw_base_prog_key base; 572 struct brw_vs_prog_key vs; 573 struct brw_tcs_prog_key tcs; 574 struct brw_tes_prog_key tes; 575 struct brw_gs_prog_key gs; 576 struct brw_wm_prog_key wm; 577 struct brw_cs_prog_key cs; 578 struct brw_bs_prog_key bs; 579 struct brw_task_prog_key task; 580 struct brw_mesh_prog_key mesh; 581}; 582 583PRAGMA_DIAGNOSTIC_POP 584 585/* 586 * Image metadata structure as laid out in the shader parameter 587 * buffer. Entries have to be 16B-aligned for the vec4 back-end to be 588 * able to use them. That's okay because the padding and any unused 589 * entries [most of them except when we're doing untyped surface 590 * access] will be removed by the uniform packing pass. 591 */ 592#define BRW_IMAGE_PARAM_OFFSET_OFFSET 0 593#define BRW_IMAGE_PARAM_SIZE_OFFSET 4 594#define BRW_IMAGE_PARAM_STRIDE_OFFSET 8 595#define BRW_IMAGE_PARAM_TILING_OFFSET 12 596#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 16 597#define BRW_IMAGE_PARAM_SIZE 20 598 599struct brw_image_param { 600 /** Offset applied to the X and Y surface coordinates. */ 601 uint32_t offset[2]; 602 603 /** Surface X, Y and Z dimensions. */ 604 uint32_t size[3]; 605 606 /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in 607 * pixels, vertical slice stride in pixels. 608 */ 609 uint32_t stride[4]; 610 611 /** Log2 of the tiling modulus in the X, Y and Z dimension. */ 612 uint32_t tiling[3]; 613 614 /** 615 * Right shift to apply for bit 6 address swizzling. Two different 616 * swizzles can be specified and will be applied one after the other. The 617 * resulting address will be: 618 * 619 * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^ 620 * (addr >> swizzling[1]))) 621 * 622 * Use \c 0xff if any of the swizzles is not required. 623 */ 624 uint32_t swizzling[2]; 625}; 626 627/** Max number of render targets in a shader */ 628#define BRW_MAX_DRAW_BUFFERS 8 629 630/** 631 * Binding table index for the first gfx6 SOL binding. 632 */ 633#define BRW_GFX6_SOL_BINDING_START 0 634 635struct brw_ubo_range 636{ 637 uint16_t block; 638 uint8_t start; 639 uint8_t length; 640}; 641 642/* We reserve the first 2^16 values for builtins */ 643#define BRW_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0) 644 645enum brw_param_builtin { 646 BRW_PARAM_BUILTIN_ZERO, 647 648 BRW_PARAM_BUILTIN_CLIP_PLANE_0_X, 649 BRW_PARAM_BUILTIN_CLIP_PLANE_0_Y, 650 BRW_PARAM_BUILTIN_CLIP_PLANE_0_Z, 651 BRW_PARAM_BUILTIN_CLIP_PLANE_0_W, 652 BRW_PARAM_BUILTIN_CLIP_PLANE_1_X, 653 BRW_PARAM_BUILTIN_CLIP_PLANE_1_Y, 654 BRW_PARAM_BUILTIN_CLIP_PLANE_1_Z, 655 BRW_PARAM_BUILTIN_CLIP_PLANE_1_W, 656 BRW_PARAM_BUILTIN_CLIP_PLANE_2_X, 657 BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y, 658 BRW_PARAM_BUILTIN_CLIP_PLANE_2_Z, 659 BRW_PARAM_BUILTIN_CLIP_PLANE_2_W, 660 BRW_PARAM_BUILTIN_CLIP_PLANE_3_X, 661 BRW_PARAM_BUILTIN_CLIP_PLANE_3_Y, 662 BRW_PARAM_BUILTIN_CLIP_PLANE_3_Z, 663 BRW_PARAM_BUILTIN_CLIP_PLANE_3_W, 664 BRW_PARAM_BUILTIN_CLIP_PLANE_4_X, 665 BRW_PARAM_BUILTIN_CLIP_PLANE_4_Y, 666 BRW_PARAM_BUILTIN_CLIP_PLANE_4_Z, 667 BRW_PARAM_BUILTIN_CLIP_PLANE_4_W, 668 BRW_PARAM_BUILTIN_CLIP_PLANE_5_X, 669 BRW_PARAM_BUILTIN_CLIP_PLANE_5_Y, 670 BRW_PARAM_BUILTIN_CLIP_PLANE_5_Z, 671 BRW_PARAM_BUILTIN_CLIP_PLANE_5_W, 672 BRW_PARAM_BUILTIN_CLIP_PLANE_6_X, 673 BRW_PARAM_BUILTIN_CLIP_PLANE_6_Y, 674 BRW_PARAM_BUILTIN_CLIP_PLANE_6_Z, 675 BRW_PARAM_BUILTIN_CLIP_PLANE_6_W, 676 BRW_PARAM_BUILTIN_CLIP_PLANE_7_X, 677 BRW_PARAM_BUILTIN_CLIP_PLANE_7_Y, 678 BRW_PARAM_BUILTIN_CLIP_PLANE_7_Z, 679 BRW_PARAM_BUILTIN_CLIP_PLANE_7_W, 680 681 BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X, 682 BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y, 683 BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z, 684 BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W, 685 BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X, 686 BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y, 687 688 BRW_PARAM_BUILTIN_PATCH_VERTICES_IN, 689 690 BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X, 691 BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y, 692 BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z, 693 BRW_PARAM_BUILTIN_SUBGROUP_ID, 694 BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X, 695 BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y, 696 BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z, 697 BRW_PARAM_BUILTIN_WORK_DIM, 698}; 699 700#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \ 701 (BRW_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp)) 702 703#define BRW_PARAM_BUILTIN_IS_CLIP_PLANE(param) \ 704 ((param) >= BRW_PARAM_BUILTIN_CLIP_PLANE_0_X && \ 705 (param) <= BRW_PARAM_BUILTIN_CLIP_PLANE_7_W) 706 707#define BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \ 708 (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2) 709 710#define BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \ 711 (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3) 712 713enum brw_shader_reloc_id { 714 BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW, 715 BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH, 716 BRW_SHADER_RELOC_SHADER_START_OFFSET, 717 BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW, 718 BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH, 719}; 720 721enum brw_shader_reloc_type { 722 /** An arbitrary 32-bit value */ 723 BRW_SHADER_RELOC_TYPE_U32, 724 /** A MOV instruction with an immediate source */ 725 BRW_SHADER_RELOC_TYPE_MOV_IMM, 726}; 727 728/** Represents a code relocation 729 * 730 * Relocatable constants are immediates in the code which we want to be able 731 * to replace post-compile with the actual value. 732 */ 733struct brw_shader_reloc { 734 /** The 32-bit ID of the relocatable constant */ 735 uint32_t id; 736 737 /** Type of this relocation */ 738 enum brw_shader_reloc_type type; 739 740 /** The offset in the shader to the relocated value 741 * 742 * For MOV_IMM relocs, this is an offset to the MOV instruction. This 743 * allows us to do some sanity checking while we update the value. 744 */ 745 uint32_t offset; 746 747 /** Value to be added to the relocated value before it is written */ 748 uint32_t delta; 749}; 750 751/** A value to write to a relocation */ 752struct brw_shader_reloc_value { 753 /** The 32-bit ID of the relocatable constant */ 754 uint32_t id; 755 756 /** The value with which to replace the relocated immediate */ 757 uint32_t value; 758}; 759 760struct brw_stage_prog_data { 761 struct brw_ubo_range ubo_ranges[4]; 762 763 unsigned nr_params; /**< number of float params/constants */ 764 765 gl_shader_stage stage; 766 767 /* zero_push_reg is a bitfield which indicates what push registers (if any) 768 * should be zeroed by SW at the start of the shader. The corresponding 769 * push_reg_mask_param specifies the param index (in 32-bit units) where 770 * the actual runtime 64-bit mask will be pushed. The shader will zero 771 * push reg i if 772 * 773 * reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i) 774 * 775 * If this field is set, brw_compiler::compact_params must be false. 776 */ 777 uint64_t zero_push_reg; 778 unsigned push_reg_mask_param; 779 780 unsigned curb_read_length; 781 unsigned total_scratch; 782 unsigned total_shared; 783 784 unsigned program_size; 785 786 unsigned const_data_size; 787 unsigned const_data_offset; 788 789 unsigned num_relocs; 790 const struct brw_shader_reloc *relocs; 791 792 /** Does this program pull from any UBO or other constant buffers? */ 793 bool has_ubo_pull; 794 795 /** How many ray queries objects in this shader. */ 796 unsigned ray_queries; 797 798 /** 799 * Register where the thread expects to find input data from the URB 800 * (typically uniforms, followed by vertex or fragment attributes). 801 */ 802 unsigned dispatch_grf_start_reg; 803 804 bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */ 805 806 /* 32-bit identifiers for all push/pull parameters. These can be anything 807 * the driver wishes them to be; the core of the back-end compiler simply 808 * re-arranges them. The one restriction is that the bottom 2^16 values 809 * are reserved for builtins defined in the brw_param_builtin enum defined 810 * above. 811 */ 812 uint32_t *param; 813 814 /* Whether shader uses atomic operations. */ 815 bool uses_atomic_load_store; 816}; 817 818static inline uint32_t * 819brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data, 820 unsigned nr_new_params) 821{ 822 unsigned old_nr_params = prog_data->nr_params; 823 prog_data->nr_params += nr_new_params; 824 prog_data->param = reralloc(ralloc_parent(prog_data->param), 825 prog_data->param, uint32_t, 826 prog_data->nr_params); 827 return prog_data->param + old_nr_params; 828} 829 830enum brw_barycentric_mode { 831 BRW_BARYCENTRIC_PERSPECTIVE_PIXEL = 0, 832 BRW_BARYCENTRIC_PERSPECTIVE_CENTROID = 1, 833 BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE = 2, 834 BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL = 3, 835 BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4, 836 BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE = 5, 837 BRW_BARYCENTRIC_MODE_COUNT = 6 838}; 839#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \ 840 ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \ 841 (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \ 842 (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)) 843 844enum brw_pixel_shader_computed_depth_mode { 845 BRW_PSCDEPTH_OFF = 0, /* PS does not compute depth */ 846 BRW_PSCDEPTH_ON = 1, /* PS computes depth; no guarantee about value */ 847 BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */ 848 BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */ 849}; 850 851/* Data about a particular attempt to compile a program. Note that 852 * there can be many of these, each in a different GL state 853 * corresponding to a different brw_wm_prog_key struct, with different 854 * compiled programs. 855 */ 856struct brw_wm_prog_data { 857 struct brw_stage_prog_data base; 858 859 unsigned num_per_primitive_inputs; 860 unsigned num_varying_inputs; 861 862 uint8_t reg_blocks_8; 863 uint8_t reg_blocks_16; 864 uint8_t reg_blocks_32; 865 866 uint8_t dispatch_grf_start_reg_16; 867 uint8_t dispatch_grf_start_reg_32; 868 uint32_t prog_offset_16; 869 uint32_t prog_offset_32; 870 871 struct { 872 /** @{ 873 * surface indices the WM-specific surfaces 874 */ 875 uint32_t render_target_read_start; 876 /** @} */ 877 } binding_table; 878 879 uint8_t color_outputs_written; 880 uint8_t computed_depth_mode; 881 bool computed_stencil; 882 883 bool early_fragment_tests; 884 bool post_depth_coverage; 885 bool inner_coverage; 886 bool dispatch_8; 887 bool dispatch_16; 888 bool dispatch_32; 889 bool dual_src_blend; 890 bool persample_dispatch; 891 bool uses_pos_offset; 892 bool uses_omask; 893 bool uses_kill; 894 bool uses_src_depth; 895 bool uses_src_w; 896 bool uses_depth_w_coefficients; 897 bool uses_sample_mask; 898 bool uses_vmask; 899 bool has_render_target_reads; 900 bool has_side_effects; 901 bool pulls_bary; 902 903 bool contains_flat_varying; 904 bool contains_noperspective_varying; 905 906 /** 907 * Shader is ran at the coarse pixel shading dispatch rate (3DSTATE_CPS). 908 */ 909 bool per_coarse_pixel_dispatch; 910 911 /** 912 * Mask of which interpolation modes are required by the fragment shader. 913 * Those interpolations are delivered as part of the thread payload. Used 914 * in hardware setup on gfx6+. 915 */ 916 uint32_t barycentric_interp_modes; 917 918 /** 919 * Whether nonperspective interpolation modes are used by the 920 * barycentric_interp_modes or fragment shader through interpolator messages. 921 */ 922 bool uses_nonperspective_interp_modes; 923 924 /** 925 * Mask of which FS inputs are marked flat by the shader source. This is 926 * needed for setting up 3DSTATE_SF/SBE. 927 */ 928 uint32_t flat_inputs; 929 930 /** 931 * The FS inputs 932 */ 933 uint64_t inputs; 934 935 /* Mapping of VUE slots to interpolation modes. 936 * Used by the Gfx4-5 clip/sf/wm stages. 937 */ 938 unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ 939 940 /** 941 * Map from gl_varying_slot to the position within the FS setup data 942 * payload where the varying's attribute vertex deltas should be delivered. 943 * For varying slots that are not used by the FS, the value is -1. 944 */ 945 int urb_setup[VARYING_SLOT_MAX]; 946 947 /** 948 * Cache structure into the urb_setup array above that contains the 949 * attribute numbers of active varyings out of urb_setup. 950 * The actual count is stored in urb_setup_attribs_count. 951 */ 952 uint8_t urb_setup_attribs[VARYING_SLOT_MAX]; 953 uint8_t urb_setup_attribs_count; 954}; 955 956/** Returns the SIMD width corresponding to a given KSP index 957 * 958 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for 959 * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to 960 * kernel start pointer (KSP) indices that is based on what dispatch widths 961 * are enabled. This function provides, effectively, the reverse mapping. 962 * 963 * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD 964 * width of 8, 16, or 32 is returned. If the KSP is invalid, 0 is returned. 965 */ 966static inline unsigned 967brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled, 968 bool simd16_enabled, bool simd32_enabled) 969{ 970 /* This function strictly ignores contiguous dispatch */ 971 switch (ksp_idx) { 972 case 0: 973 return simd8_enabled ? 8 : 974 (simd16_enabled && !simd32_enabled) ? 16 : 975 (simd32_enabled && !simd16_enabled) ? 32 : 0; 976 case 1: 977 return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0; 978 case 2: 979 return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 16 : 0; 980 default: 981 unreachable("Invalid KSP index"); 982 } 983} 984 985#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \ 986 brw_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \ 987 (wm_state)._16PixelDispatchEnable, \ 988 (wm_state)._32PixelDispatchEnable) 989 990#define brw_wm_state_has_ksp(wm_state, ksp_idx) \ 991 (brw_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0) 992 993static inline uint32_t 994_brw_wm_prog_data_prog_offset(const struct brw_wm_prog_data *prog_data, 995 unsigned simd_width) 996{ 997 switch (simd_width) { 998 case 8: return 0; 999 case 16: return prog_data->prog_offset_16; 1000 case 32: return prog_data->prog_offset_32; 1001 default: return 0; 1002 } 1003} 1004 1005#define brw_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \ 1006 _brw_wm_prog_data_prog_offset(prog_data, \ 1007 brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) 1008 1009static inline uint8_t 1010_brw_wm_prog_data_dispatch_grf_start_reg(const struct brw_wm_prog_data *prog_data, 1011 unsigned simd_width) 1012{ 1013 switch (simd_width) { 1014 case 8: return prog_data->base.dispatch_grf_start_reg; 1015 case 16: return prog_data->dispatch_grf_start_reg_16; 1016 case 32: return prog_data->dispatch_grf_start_reg_32; 1017 default: return 0; 1018 } 1019} 1020 1021#define brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \ 1022 _brw_wm_prog_data_dispatch_grf_start_reg(prog_data, \ 1023 brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) 1024 1025static inline uint8_t 1026_brw_wm_prog_data_reg_blocks(const struct brw_wm_prog_data *prog_data, 1027 unsigned simd_width) 1028{ 1029 switch (simd_width) { 1030 case 8: return prog_data->reg_blocks_8; 1031 case 16: return prog_data->reg_blocks_16; 1032 case 32: return prog_data->reg_blocks_32; 1033 default: return 0; 1034 } 1035} 1036 1037#define brw_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \ 1038 _brw_wm_prog_data_reg_blocks(prog_data, \ 1039 brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)) 1040 1041struct brw_push_const_block { 1042 unsigned dwords; /* Dword count, not reg aligned */ 1043 unsigned regs; 1044 unsigned size; /* Bytes, register aligned */ 1045}; 1046 1047struct brw_cs_prog_data { 1048 struct brw_stage_prog_data base; 1049 1050 unsigned local_size[3]; 1051 1052 /* Program offsets for the 8/16/32 SIMD variants. Multiple variants are 1053 * kept when using variable group size, and the right one can only be 1054 * decided at dispatch time. 1055 */ 1056 unsigned prog_offset[3]; 1057 1058 /* Bitmask indicating which program offsets are valid. */ 1059 unsigned prog_mask; 1060 1061 /* Bitmask indicating which programs have spilled. */ 1062 unsigned prog_spilled; 1063 1064 bool uses_barrier; 1065 bool uses_num_work_groups; 1066 bool uses_inline_data; 1067 bool uses_btd_stack_ids; 1068 1069 struct { 1070 struct brw_push_const_block cross_thread; 1071 struct brw_push_const_block per_thread; 1072 } push; 1073 1074 struct { 1075 /** @{ 1076 * surface indices the CS-specific surfaces 1077 */ 1078 uint32_t work_groups_start; 1079 /** @} */ 1080 } binding_table; 1081}; 1082 1083static inline uint32_t 1084brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data, 1085 unsigned dispatch_width) 1086{ 1087 assert(dispatch_width == 8 || 1088 dispatch_width == 16 || 1089 dispatch_width == 32); 1090 const unsigned index = dispatch_width / 16; 1091 assert(prog_data->prog_mask & (1 << index)); 1092 return prog_data->prog_offset[index]; 1093} 1094 1095struct brw_bs_prog_data { 1096 struct brw_stage_prog_data base; 1097 1098 /** SIMD size of the root shader */ 1099 uint8_t simd_size; 1100 1101 /** Maximum stack size of all shaders */ 1102 uint32_t max_stack_size; 1103 1104 /** Offset into the shader where the resume SBT is located */ 1105 uint32_t resume_sbt_offset; 1106}; 1107 1108struct brw_ff_gs_prog_data { 1109 unsigned urb_read_length; 1110 unsigned total_grf; 1111 1112 /** 1113 * Gfx6 transform feedback: Amount by which the streaming vertex buffer 1114 * indices should be incremented each time the GS is invoked. 1115 */ 1116 unsigned svbi_postincrement_value; 1117}; 1118 1119/** 1120 * Enum representing the i965-specific vertex results that don't correspond 1121 * exactly to any element of gl_varying_slot. The values of this enum are 1122 * assigned such that they don't conflict with gl_varying_slot. 1123 */ 1124typedef enum 1125{ 1126 BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, 1127 BRW_VARYING_SLOT_PAD, 1128 /** 1129 * Technically this is not a varying but just a placeholder that 1130 * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord 1131 * builtin variable to be compiled correctly. see compile_sf_prog() for 1132 * more info. 1133 */ 1134 BRW_VARYING_SLOT_PNTC, 1135 BRW_VARYING_SLOT_COUNT 1136} brw_varying_slot; 1137 1138/** 1139 * We always program SF to start reading at an offset of 1 (2 varying slots) 1140 * from the start of the vertex URB entry. This causes it to skip: 1141 * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gfx4-5 1142 * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gfx6+ 1143 */ 1144#define BRW_SF_URB_ENTRY_READ_OFFSET 1 1145 1146/** 1147 * Bitmask indicating which fragment shader inputs represent varyings (and 1148 * hence have to be delivered to the fragment shader by the SF/SBE stage). 1149 */ 1150#define BRW_FS_VARYING_INPUT_MASK \ 1151 (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \ 1152 ~VARYING_BIT_POS & ~VARYING_BIT_FACE) 1153 1154/** 1155 * Data structure recording the relationship between the gl_varying_slot enum 1156 * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a 1157 * single octaword within the VUE (128 bits). 1158 * 1159 * Note that each BRW register contains 256 bits (2 octawords), so when 1160 * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two 1161 * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as 1162 * in a vertex shader), each register corresponds to a single VUE slot, since 1163 * it contains data for two separate vertices. 1164 */ 1165struct brw_vue_map { 1166 /** 1167 * Bitfield representing all varying slots that are (a) stored in this VUE 1168 * map, and (b) actually written by the shader. Does not include any of 1169 * the additional varying slots defined in brw_varying_slot. 1170 */ 1171 uint64_t slots_valid; 1172 1173 /** 1174 * Is this VUE map for a separate shader pipeline? 1175 * 1176 * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched 1177 * without the linker having a chance to dead code eliminate unused varyings. 1178 * 1179 * This means that we have to use a fixed slot layout, based on the output's 1180 * location field, rather than assigning slots in a compact contiguous block. 1181 */ 1182 bool separate; 1183 1184 /** 1185 * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are 1186 * not stored in a slot (because they are not written, or because 1187 * additional processing is applied before storing them in the VUE), the 1188 * value is -1. 1189 */ 1190 signed char varying_to_slot[VARYING_SLOT_TESS_MAX]; 1191 1192 /** 1193 * Map from VUE slot to gl_varying_slot value. For slots that do not 1194 * directly correspond to a gl_varying_slot, the value comes from 1195 * brw_varying_slot. 1196 * 1197 * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD. 1198 */ 1199 signed char slot_to_varying[VARYING_SLOT_TESS_MAX]; 1200 1201 /** 1202 * Total number of VUE slots in use 1203 */ 1204 int num_slots; 1205 1206 /** 1207 * Number of per-patch VUE slots. Only valid for tessellation control 1208 * shader outputs and tessellation evaluation shader inputs. 1209 */ 1210 int num_per_patch_slots; 1211 1212 /** 1213 * Number of per-vertex VUE slots. Only valid for tessellation control 1214 * shader outputs and tessellation evaluation shader inputs. 1215 */ 1216 int num_per_vertex_slots; 1217}; 1218 1219void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map, 1220 gl_shader_stage stage); 1221 1222/** 1223 * Convert a VUE slot number into a byte offset within the VUE. 1224 */ 1225static inline unsigned brw_vue_slot_to_offset(unsigned slot) 1226{ 1227 return 16*slot; 1228} 1229 1230/** 1231 * Convert a vertex output (brw_varying_slot) into a byte offset within the 1232 * VUE. 1233 */ 1234static inline unsigned 1235brw_varying_to_offset(const struct brw_vue_map *vue_map, unsigned varying) 1236{ 1237 return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); 1238} 1239 1240void brw_compute_vue_map(const struct intel_device_info *devinfo, 1241 struct brw_vue_map *vue_map, 1242 uint64_t slots_valid, 1243 bool separate_shader, 1244 uint32_t pos_slots); 1245 1246void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map, 1247 uint64_t slots_valid, 1248 uint32_t is_patch); 1249 1250/* brw_interpolation_map.c */ 1251void brw_setup_vue_interpolation(const struct brw_vue_map *vue_map, 1252 struct nir_shader *nir, 1253 struct brw_wm_prog_data *prog_data); 1254 1255enum shader_dispatch_mode { 1256 DISPATCH_MODE_4X1_SINGLE = 0, 1257 DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, 1258 DISPATCH_MODE_4X2_DUAL_OBJECT = 2, 1259 DISPATCH_MODE_SIMD8 = 3, 1260 1261 DISPATCH_MODE_TCS_SINGLE_PATCH = 0, 1262 DISPATCH_MODE_TCS_8_PATCH = 2, 1263}; 1264 1265/** 1266 * @defgroup Tessellator parameter enumerations. 1267 * 1268 * These correspond to the hardware values in 3DSTATE_TE, and are provided 1269 * as part of the tessellation evaluation shader. 1270 * 1271 * @{ 1272 */ 1273enum brw_tess_partitioning { 1274 BRW_TESS_PARTITIONING_INTEGER = 0, 1275 BRW_TESS_PARTITIONING_ODD_FRACTIONAL = 1, 1276 BRW_TESS_PARTITIONING_EVEN_FRACTIONAL = 2, 1277}; 1278 1279enum brw_tess_output_topology { 1280 BRW_TESS_OUTPUT_TOPOLOGY_POINT = 0, 1281 BRW_TESS_OUTPUT_TOPOLOGY_LINE = 1, 1282 BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2, 1283 BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3, 1284}; 1285 1286enum brw_tess_domain { 1287 BRW_TESS_DOMAIN_QUAD = 0, 1288 BRW_TESS_DOMAIN_TRI = 1, 1289 BRW_TESS_DOMAIN_ISOLINE = 2, 1290}; 1291/** @} */ 1292 1293struct brw_vue_prog_data { 1294 struct brw_stage_prog_data base; 1295 struct brw_vue_map vue_map; 1296 1297 /** Should the hardware deliver input VUE handles for URB pull loads? */ 1298 bool include_vue_handles; 1299 1300 unsigned urb_read_length; 1301 unsigned total_grf; 1302 1303 uint32_t clip_distance_mask; 1304 uint32_t cull_distance_mask; 1305 1306 /* Used for calculating urb partitions. In the VS, this is the size of the 1307 * URB entry used for both input and output to the thread. In the GS, this 1308 * is the size of the URB entry used for output. 1309 */ 1310 unsigned urb_entry_size; 1311 1312 enum shader_dispatch_mode dispatch_mode; 1313}; 1314 1315struct brw_vs_prog_data { 1316 struct brw_vue_prog_data base; 1317 1318 uint64_t inputs_read; 1319 uint64_t double_inputs_read; 1320 1321 unsigned nr_attribute_slots; 1322 1323 bool uses_vertexid; 1324 bool uses_instanceid; 1325 bool uses_is_indexed_draw; 1326 bool uses_firstvertex; 1327 bool uses_baseinstance; 1328 bool uses_drawid; 1329}; 1330 1331struct brw_tcs_prog_data 1332{ 1333 struct brw_vue_prog_data base; 1334 1335 /** Should the non-SINGLE_PATCH payload provide primitive ID? */ 1336 bool include_primitive_id; 1337 1338 /** Number vertices in output patch */ 1339 int instances; 1340 1341 /** Track patch count threshold */ 1342 int patch_count_threshold; 1343}; 1344 1345 1346struct brw_tes_prog_data 1347{ 1348 struct brw_vue_prog_data base; 1349 1350 enum brw_tess_partitioning partitioning; 1351 enum brw_tess_output_topology output_topology; 1352 enum brw_tess_domain domain; 1353 bool include_primitive_id; 1354}; 1355 1356struct brw_gs_prog_data 1357{ 1358 struct brw_vue_prog_data base; 1359 1360 unsigned vertices_in; 1361 1362 /** 1363 * Size of an output vertex, measured in HWORDS (32 bytes). 1364 */ 1365 unsigned output_vertex_size_hwords; 1366 1367 unsigned output_topology; 1368 1369 /** 1370 * Size of the control data (cut bits or StreamID bits), in hwords (32 1371 * bytes). 0 if there is no control data. 1372 */ 1373 unsigned control_data_header_size_hwords; 1374 1375 /** 1376 * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID 1377 * if the control data is StreamID bits, or 1378 * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). 1379 * Ignored if control_data_header_size is 0. 1380 */ 1381 unsigned control_data_format; 1382 1383 bool include_primitive_id; 1384 1385 /** 1386 * The number of vertices emitted, if constant - otherwise -1. 1387 */ 1388 int static_vertex_count; 1389 1390 int invocations; 1391 1392 /** 1393 * Gfx6: Provoking vertex convention for odd-numbered triangles 1394 * in tristrips. 1395 */ 1396 unsigned pv_first:1; 1397 1398 /** 1399 * Gfx6: Number of varyings that are output to transform feedback. 1400 */ 1401 unsigned num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ 1402 1403 /** 1404 * Gfx6: Map from the index of a transform feedback binding table entry to the 1405 * gl_varying_slot that should be streamed out through that binding table 1406 * entry. 1407 */ 1408 unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */]; 1409 1410 /** 1411 * Gfx6: Map from the index of a transform feedback binding table entry to the 1412 * swizzles that should be used when streaming out data through that 1413 * binding table entry. 1414 */ 1415 unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */]; 1416}; 1417 1418struct brw_sf_prog_data { 1419 uint32_t urb_read_length; 1420 uint32_t total_grf; 1421 1422 /* Each vertex may have up to 12 attributes, 4 components each, 1423 * except WPOS which requires only 2. (11*4 + 2) == 44 ==> 11 1424 * rows. 1425 * 1426 * Actually we use 4 for each, so call it 12 rows. 1427 */ 1428 unsigned urb_entry_size; 1429}; 1430 1431struct brw_clip_prog_data { 1432 uint32_t curb_read_length; /* user planes? */ 1433 uint32_t clip_mode; 1434 uint32_t urb_read_length; 1435 uint32_t total_grf; 1436}; 1437 1438struct brw_tue_map { 1439 uint32_t size_dw; 1440 1441 uint32_t per_task_data_start_dw; 1442}; 1443 1444struct brw_mue_map { 1445 int32_t start_dw[VARYING_SLOT_MAX]; 1446 1447 uint32_t size_dw; 1448 1449 uint32_t max_primitives; 1450 uint32_t per_primitive_start_dw; 1451 uint32_t per_primitive_header_size_dw; 1452 uint32_t per_primitive_data_size_dw; 1453 uint32_t per_primitive_pitch_dw; 1454 1455 uint32_t max_vertices; 1456 uint32_t per_vertex_start_dw; 1457 uint32_t per_vertex_header_size_dw; 1458 uint32_t per_vertex_data_size_dw; 1459 uint32_t per_vertex_pitch_dw; 1460}; 1461 1462struct brw_task_prog_data { 1463 struct brw_cs_prog_data base; 1464 struct brw_tue_map map; 1465 bool uses_drawid; 1466}; 1467 1468enum brw_mesh_index_format { 1469 BRW_INDEX_FORMAT_U32, 1470}; 1471 1472struct brw_mesh_prog_data { 1473 struct brw_cs_prog_data base; 1474 struct brw_mue_map map; 1475 1476 uint32_t clip_distance_mask; 1477 uint32_t cull_distance_mask; 1478 uint16_t primitive_type; 1479 1480 enum brw_mesh_index_format index_format; 1481 1482 bool uses_drawid; 1483}; 1484 1485/* brw_any_prog_data is prog_data for any stage that maps to an API stage */ 1486union brw_any_prog_data { 1487 struct brw_stage_prog_data base; 1488 struct brw_vue_prog_data vue; 1489 struct brw_vs_prog_data vs; 1490 struct brw_tcs_prog_data tcs; 1491 struct brw_tes_prog_data tes; 1492 struct brw_gs_prog_data gs; 1493 struct brw_wm_prog_data wm; 1494 struct brw_cs_prog_data cs; 1495 struct brw_bs_prog_data bs; 1496 struct brw_task_prog_data task; 1497 struct brw_mesh_prog_data mesh; 1498}; 1499 1500#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK) \ 1501static inline struct brw_##STAGE##_prog_data * \ 1502brw_##STAGE##_prog_data(struct brw_stage_prog_data *prog_data) \ 1503{ \ 1504 if (prog_data) \ 1505 assert(CHECK); \ 1506 return (struct brw_##STAGE##_prog_data *) prog_data; \ 1507} \ 1508static inline const struct brw_##STAGE##_prog_data * \ 1509brw_##STAGE##_prog_data_const(const struct brw_stage_prog_data *prog_data) \ 1510{ \ 1511 if (prog_data) \ 1512 assert(CHECK); \ 1513 return (const struct brw_##STAGE##_prog_data *) prog_data; \ 1514} 1515 1516DEFINE_PROG_DATA_DOWNCAST(vs, prog_data->stage == MESA_SHADER_VERTEX) 1517DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL) 1518DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL) 1519DEFINE_PROG_DATA_DOWNCAST(gs, prog_data->stage == MESA_SHADER_GEOMETRY) 1520DEFINE_PROG_DATA_DOWNCAST(wm, prog_data->stage == MESA_SHADER_FRAGMENT) 1521DEFINE_PROG_DATA_DOWNCAST(cs, gl_shader_stage_uses_workgroup(prog_data->stage)) 1522DEFINE_PROG_DATA_DOWNCAST(bs, brw_shader_stage_is_bindless(prog_data->stage)) 1523 1524DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX || 1525 prog_data->stage == MESA_SHADER_TESS_CTRL || 1526 prog_data->stage == MESA_SHADER_TESS_EVAL || 1527 prog_data->stage == MESA_SHADER_GEOMETRY) 1528 1529DEFINE_PROG_DATA_DOWNCAST(task, prog_data->stage == MESA_SHADER_TASK) 1530DEFINE_PROG_DATA_DOWNCAST(mesh, prog_data->stage == MESA_SHADER_MESH) 1531 1532/* These are not really brw_stage_prog_data. */ 1533DEFINE_PROG_DATA_DOWNCAST(ff_gs, true) 1534DEFINE_PROG_DATA_DOWNCAST(clip, true) 1535DEFINE_PROG_DATA_DOWNCAST(sf, true) 1536#undef DEFINE_PROG_DATA_DOWNCAST 1537 1538struct brw_compile_stats { 1539 uint32_t dispatch_width; /**< 0 for vec4 */ 1540 uint32_t instructions; 1541 uint32_t sends; 1542 uint32_t loops; 1543 uint32_t cycles; 1544 uint32_t spills; 1545 uint32_t fills; 1546}; 1547 1548/** @} */ 1549 1550struct brw_compiler * 1551brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo); 1552 1553/** 1554 * Returns a compiler configuration for use with disk shader cache 1555 * 1556 * This value only needs to change for settings that can cause different 1557 * program generation between two runs on the same hardware. 1558 * 1559 * For example, it doesn't need to be different for gen 8 and gen 9 hardware, 1560 * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used. 1561 */ 1562uint64_t 1563brw_get_compiler_config_value(const struct brw_compiler *compiler); 1564 1565unsigned 1566brw_prog_data_size(gl_shader_stage stage); 1567 1568unsigned 1569brw_prog_key_size(gl_shader_stage stage); 1570 1571void 1572brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage, unsigned id); 1573 1574/** 1575 * Parameters for compiling a vertex shader. 1576 * 1577 * Some of these will be modified during the shader compilation. 1578 */ 1579struct brw_compile_vs_params { 1580 nir_shader *nir; 1581 1582 const struct brw_vs_prog_key *key; 1583 struct brw_vs_prog_data *prog_data; 1584 1585 bool edgeflag_is_last; /* true for gallium */ 1586 1587 struct brw_compile_stats *stats; 1588 1589 void *log_data; 1590 1591 char *error_str; 1592 1593 /* If unset, DEBUG_VS is used. */ 1594 uint64_t debug_flag; 1595}; 1596 1597/** 1598 * Compile a vertex shader. 1599 * 1600 * Returns the final assembly and updates the parameters structure. 1601 */ 1602const unsigned * 1603brw_compile_vs(const struct brw_compiler *compiler, 1604 void *mem_ctx, 1605 struct brw_compile_vs_params *params); 1606 1607/** 1608 * Parameters for compiling a tessellation control shader. 1609 * 1610 * Some of these will be modified during the shader compilation. 1611 */ 1612struct brw_compile_tcs_params { 1613 nir_shader *nir; 1614 1615 const struct brw_tcs_prog_key *key; 1616 struct brw_tcs_prog_data *prog_data; 1617 1618 struct brw_compile_stats *stats; 1619 1620 void *log_data; 1621 1622 char *error_str; 1623}; 1624 1625/** 1626 * Compile a tessellation control shader. 1627 * 1628 * Returns the final assembly and updates the parameters structure. 1629 */ 1630const unsigned * 1631brw_compile_tcs(const struct brw_compiler *compiler, 1632 void *mem_ctx, 1633 struct brw_compile_tcs_params *params); 1634 1635/** 1636 * Parameters for compiling a tessellation evaluation shader. 1637 * 1638 * Some of these will be modified during the shader compilation. 1639 */ 1640struct brw_compile_tes_params { 1641 nir_shader *nir; 1642 1643 const struct brw_tes_prog_key *key; 1644 struct brw_tes_prog_data *prog_data; 1645 const struct brw_vue_map *input_vue_map; 1646 1647 struct brw_compile_stats *stats; 1648 1649 void *log_data; 1650 1651 char *error_str; 1652}; 1653 1654/** 1655 * Compile a tessellation evaluation shader. 1656 * 1657 * Returns the final assembly and updates the parameters structure. 1658 */ 1659const unsigned * 1660brw_compile_tes(const struct brw_compiler *compiler, 1661 void *mem_ctx, 1662 struct brw_compile_tes_params *params); 1663 1664/** 1665 * Parameters for compiling a geometry shader. 1666 * 1667 * Some of these will be modified during the shader compilation. 1668 */ 1669struct brw_compile_gs_params { 1670 nir_shader *nir; 1671 1672 const struct brw_gs_prog_key *key; 1673 struct brw_gs_prog_data *prog_data; 1674 1675 struct brw_compile_stats *stats; 1676 1677 void *log_data; 1678 1679 char *error_str; 1680}; 1681 1682/** 1683 * Compile a geometry shader. 1684 * 1685 * Returns the final assembly and updates the parameters structure. 1686 */ 1687const unsigned * 1688brw_compile_gs(const struct brw_compiler *compiler, 1689 void *mem_ctx, 1690 struct brw_compile_gs_params *params); 1691 1692/** 1693 * Compile a strips and fans shader. 1694 * 1695 * This is a fixed-function shader determined entirely by the shader key and 1696 * a VUE map. 1697 * 1698 * Returns the final assembly and the program's size. 1699 */ 1700const unsigned * 1701brw_compile_sf(const struct brw_compiler *compiler, 1702 void *mem_ctx, 1703 const struct brw_sf_prog_key *key, 1704 struct brw_sf_prog_data *prog_data, 1705 struct brw_vue_map *vue_map, 1706 unsigned *final_assembly_size); 1707 1708/** 1709 * Compile a clipper shader. 1710 * 1711 * This is a fixed-function shader determined entirely by the shader key and 1712 * a VUE map. 1713 * 1714 * Returns the final assembly and the program's size. 1715 */ 1716const unsigned * 1717brw_compile_clip(const struct brw_compiler *compiler, 1718 void *mem_ctx, 1719 const struct brw_clip_prog_key *key, 1720 struct brw_clip_prog_data *prog_data, 1721 struct brw_vue_map *vue_map, 1722 unsigned *final_assembly_size); 1723 1724struct brw_compile_task_params { 1725 struct nir_shader *nir; 1726 1727 const struct brw_task_prog_key *key; 1728 struct brw_task_prog_data *prog_data; 1729 1730 struct brw_compile_stats *stats; 1731 1732 char *error_str; 1733 void *log_data; 1734}; 1735 1736const unsigned * 1737brw_compile_task(const struct brw_compiler *compiler, 1738 void *mem_ctx, 1739 struct brw_compile_task_params *params); 1740 1741struct brw_compile_mesh_params { 1742 struct nir_shader *nir; 1743 1744 const struct brw_mesh_prog_key *key; 1745 struct brw_mesh_prog_data *prog_data; 1746 const struct brw_tue_map *tue_map; 1747 1748 struct brw_compile_stats *stats; 1749 1750 char *error_str; 1751 void *log_data; 1752}; 1753 1754const unsigned * 1755brw_compile_mesh(const struct brw_compiler *compiler, 1756 void *mem_ctx, 1757 struct brw_compile_mesh_params *params); 1758 1759/** 1760 * Parameters for compiling a fragment shader. 1761 * 1762 * Some of these will be modified during the shader compilation. 1763 */ 1764struct brw_compile_fs_params { 1765 nir_shader *nir; 1766 1767 const struct brw_wm_prog_key *key; 1768 struct brw_wm_prog_data *prog_data; 1769 1770 const struct brw_vue_map *vue_map; 1771 const struct brw_mue_map *mue_map; 1772 1773 bool allow_spilling; 1774 bool use_rep_send; 1775 1776 struct brw_compile_stats *stats; 1777 1778 void *log_data; 1779 1780 char *error_str; 1781 1782 /* If unset, DEBUG_WM is used. */ 1783 uint64_t debug_flag; 1784}; 1785 1786/** 1787 * Compile a fragment shader. 1788 * 1789 * Returns the final assembly and updates the parameters structure. 1790 */ 1791const unsigned * 1792brw_compile_fs(const struct brw_compiler *compiler, 1793 void *mem_ctx, 1794 struct brw_compile_fs_params *params); 1795 1796/** 1797 * Parameters for compiling a compute shader. 1798 * 1799 * Some of these will be modified during the shader compilation. 1800 */ 1801struct brw_compile_cs_params { 1802 nir_shader *nir; 1803 1804 const struct brw_cs_prog_key *key; 1805 struct brw_cs_prog_data *prog_data; 1806 1807 struct brw_compile_stats *stats; 1808 1809 void *log_data; 1810 1811 char *error_str; 1812 1813 /* If unset, DEBUG_CS is used. */ 1814 uint64_t debug_flag; 1815}; 1816 1817/** 1818 * Compile a compute shader. 1819 * 1820 * Returns the final assembly and updates the parameters structure. 1821 */ 1822const unsigned * 1823brw_compile_cs(const struct brw_compiler *compiler, 1824 void *mem_ctx, 1825 struct brw_compile_cs_params *params); 1826 1827/** 1828 * Parameters for compiling a Bindless shader. 1829 * 1830 * Some of these will be modified during the shader compilation. 1831 */ 1832struct brw_compile_bs_params { 1833 nir_shader *nir; 1834 1835 const struct brw_bs_prog_key *key; 1836 struct brw_bs_prog_data *prog_data; 1837 1838 unsigned num_resume_shaders; 1839 struct nir_shader **resume_shaders; 1840 1841 struct brw_compile_stats *stats; 1842 1843 void *log_data; 1844 1845 char *error_str; 1846}; 1847 1848/** 1849 * Compile a Bindless shader. 1850 * 1851 * Returns the final assembly and updates the parameters structure. 1852 */ 1853const unsigned * 1854brw_compile_bs(const struct brw_compiler *compiler, 1855 void *mem_ctx, 1856 struct brw_compile_bs_params *params); 1857 1858/** 1859 * Compile a fixed function geometry shader. 1860 * 1861 * Returns the final assembly and the program's size. 1862 */ 1863const unsigned * 1864brw_compile_ff_gs_prog(struct brw_compiler *compiler, 1865 void *mem_ctx, 1866 const struct brw_ff_gs_prog_key *key, 1867 struct brw_ff_gs_prog_data *prog_data, 1868 struct brw_vue_map *vue_map, 1869 unsigned *final_assembly_size); 1870 1871void brw_debug_key_recompile(const struct brw_compiler *c, void *log, 1872 gl_shader_stage stage, 1873 const struct brw_base_prog_key *old_key, 1874 const struct brw_base_prog_key *key); 1875 1876/* Shared Local Memory Size is specified as powers of two, 1877 * and also have a Gen-dependent minimum value if not zero. 1878 */ 1879static inline uint32_t 1880intel_calculate_slm_size(unsigned gen, uint32_t bytes) 1881{ 1882 assert(bytes <= 64 * 1024); 1883 if (bytes > 0) 1884 return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096); 1885 else 1886 return 0; 1887} 1888 1889static inline uint32_t 1890encode_slm_size(unsigned gen, uint32_t bytes) 1891{ 1892 uint32_t slm_size = 0; 1893 1894 /* Shared Local Memory is specified as powers of two, and encoded in 1895 * INTERFACE_DESCRIPTOR_DATA with the following representations: 1896 * 1897 * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | 1898 * ------------------------------------------------------------------- 1899 * Gfx7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | 1900 * ------------------------------------------------------------------- 1901 * Gfx9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 1902 */ 1903 1904 if (bytes > 0) { 1905 slm_size = intel_calculate_slm_size(gen, bytes); 1906 assert(util_is_power_of_two_nonzero(slm_size)); 1907 1908 if (gen >= 9) { 1909 /* Turn an exponent of 10 (1024 kB) into 1. */ 1910 assert(slm_size >= 1024); 1911 slm_size = ffs(slm_size) - 10; 1912 } else { 1913 assert(slm_size >= 4096); 1914 /* Convert to the pre-Gfx9 representation. */ 1915 slm_size = slm_size / 4096; 1916 } 1917 } 1918 1919 return slm_size; 1920} 1921 1922unsigned 1923brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, 1924 unsigned threads); 1925 1926void 1927brw_write_shader_relocs(const struct brw_isa_info *isa, 1928 void *program, 1929 const struct brw_stage_prog_data *prog_data, 1930 struct brw_shader_reloc_value *values, 1931 unsigned num_values); 1932 1933struct brw_cs_dispatch_info { 1934 uint32_t group_size; 1935 uint32_t simd_size; 1936 uint32_t threads; 1937 1938 /* RightExecutionMask field used in GPGPU_WALKER. */ 1939 uint32_t right_mask; 1940}; 1941 1942/** 1943 * Get the dispatch information for a shader to be used with GPGPU_WALKER and 1944 * similar instructions. 1945 * 1946 * If override_local_size is not NULL, it must to point to a 3-element that 1947 * will override the value from prog_data->local_size. This is used by 1948 * ARB_compute_variable_group_size, where the size is set only at dispatch 1949 * time (so prog_data is outdated). 1950 */ 1951struct brw_cs_dispatch_info 1952brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, 1953 const struct brw_cs_prog_data *prog_data, 1954 const unsigned *override_local_size); 1955 1956/** 1957 * Return true if the given shader stage is dispatched contiguously by the 1958 * relevant fixed function starting from channel 0 of the SIMD thread, which 1959 * implies that the dispatch mask of a thread can be assumed to have the form 1960 * '2^n - 1' for some n. 1961 */ 1962static inline bool 1963brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo, 1964 gl_shader_stage stage, 1965 const struct brw_stage_prog_data *prog_data) 1966{ 1967 /* The code below makes assumptions about the hardware's thread dispatch 1968 * behavior that could be proven wrong in future generations -- Make sure 1969 * to do a full test run with brw_fs_test_dispatch_packing() hooked up to 1970 * the NIR front-end before changing this assertion. 1971 */ 1972 assert(devinfo->ver <= 12); 1973 1974 switch (stage) { 1975 case MESA_SHADER_FRAGMENT: { 1976 /* The PSD discards subspans coming in with no lit samples, which in the 1977 * per-pixel shading case implies that each subspan will either be fully 1978 * lit (due to the VMask being used to allow derivative computations), 1979 * or not dispatched at all. In per-sample dispatch mode individual 1980 * samples from the same subspan have a fixed relative location within 1981 * the SIMD thread, so dispatch of unlit samples cannot be avoided in 1982 * general and we should return false. 1983 */ 1984 const struct brw_wm_prog_data *wm_prog_data = 1985 (const struct brw_wm_prog_data *)prog_data; 1986 return devinfo->verx10 < 125 && 1987 !wm_prog_data->persample_dispatch && 1988 wm_prog_data->uses_vmask; 1989 } 1990 case MESA_SHADER_COMPUTE: 1991 /* Compute shaders will be spawned with either a fully enabled dispatch 1992 * mask or with whatever bottom/right execution mask was given to the 1993 * GPGPU walker command to be used along the workgroup edges -- In both 1994 * cases the dispatch mask is required to be tightly packed for our 1995 * invocation index calculations to work. 1996 */ 1997 return true; 1998 default: 1999 /* Most remaining fixed functions are limited to use a packed dispatch 2000 * mask due to the hardware representation of the dispatch mask as a 2001 * single counter representing the number of enabled channels. 2002 */ 2003 return true; 2004 } 2005} 2006 2007/** 2008 * Computes the first varying slot in the URB produced by the previous stage 2009 * that is used in the next stage. We do this by testing the varying slots in 2010 * the previous stage's vue map against the inputs read in the next stage. 2011 * 2012 * Note that: 2013 * 2014 * - Each URB offset contains two varying slots and we can only skip a 2015 * full offset if both slots are unused, so the value we return here is always 2016 * rounded down to the closest multiple of two. 2017 * 2018 * - gl_Layer and gl_ViewportIndex don't have their own varying slots, they are 2019 * part of the vue header, so if these are read we can't skip anything. 2020 */ 2021static inline int 2022brw_compute_first_urb_slot_required(uint64_t inputs_read, 2023 const struct brw_vue_map *prev_stage_vue_map) 2024{ 2025 if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_SHADING_RATE)) == 0) { 2026 for (int i = 0; i < prev_stage_vue_map->num_slots; i++) { 2027 int varying = prev_stage_vue_map->slot_to_varying[i]; 2028 if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) 2029 return ROUND_DOWN_TO(i, 2); 2030 } 2031 } 2032 2033 return 0; 2034} 2035 2036/* From InlineData in 3DSTATE_TASK_SHADER_DATA and 3DSTATE_MESH_SHADER_DATA. */ 2037#define BRW_TASK_MESH_INLINE_DATA_SIZE_DW 8 2038 2039/* InlineData[0-1] is used for Vulkan descriptor. */ 2040#define BRW_TASK_MESH_PUSH_CONSTANTS_START_DW 2 2041 2042#define BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW \ 2043 (BRW_TASK_MESH_INLINE_DATA_SIZE_DW - BRW_TASK_MESH_PUSH_CONSTANTS_START_DW) 2044 2045/** 2046 * This enum is used as the base indice of the nir_load_topology_id_intel 2047 * intrinsic. This is used to return different values based on some aspect of 2048 * the topology of the device. 2049 */ 2050enum brw_topology_id 2051{ 2052 /* A value based of the DSS identifier the shader is currently running on. 2053 * Be mindful that the DSS ID can be higher than the total number of DSS on 2054 * the device. This is because of the fusing that can occur on different 2055 * parts. 2056 */ 2057 BRW_TOPOLOGY_ID_DSS, 2058 2059 /* A value composed of EU ID, thread ID & SIMD lane ID. */ 2060 BRW_TOPOLOGY_ID_EU_THREAD_SIMD, 2061}; 2062 2063#ifdef __cplusplus 2064} /* extern "C" */ 2065#endif 2066 2067#endif /* BRW_COMPILER_H */ 2068