/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#ifndef IR3_SHADER_H_
#define IR3_SHADER_H_

#include <stdio.h>

#include "c11/threads.h"
#include "compiler/nir/nir.h"
#include "compiler/shader_enums.h"
#include "util/bitscan.h"
#include "util/disk_cache.h"

#include "ir3_compiler.h"

/* driver param indices: */
enum ir3_driver_param {
   /* compute shader driver params: */
   IR3_DP_NUM_WORK_GROUPS_X = 0,
   IR3_DP_NUM_WORK_GROUPS_Y = 1,
   IR3_DP_NUM_WORK_GROUPS_Z = 2,
   IR3_DP_WORK_DIM = 3,
   IR3_DP_BASE_GROUP_X = 4,
   IR3_DP_BASE_GROUP_Y = 5,
   IR3_DP_BASE_GROUP_Z = 6,
   IR3_DP_CS_SUBGROUP_SIZE = 7,
   IR3_DP_LOCAL_GROUP_SIZE_X = 8,
   IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
   IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
   IR3_DP_SUBGROUP_ID_SHIFT = 11,
   IR3_DP_WORKGROUP_ID_X = 12,
   IR3_DP_WORKGROUP_ID_Y = 13,
   IR3_DP_WORKGROUP_ID_Z = 14,
   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer.  Keep that in mind when/if
    * adding any additional CS driver params.
    */
   IR3_DP_CS_COUNT = 16, /* must be aligned to vec4 */

   /* vertex shader driver params: */
   IR3_DP_DRAWID = 0,
   IR3_DP_VTXID_BASE = 1,
   IR3_DP_INSTID_BASE = 2,
   IR3_DP_VTXCNT_MAX = 3,
   /* user-clip-plane components, up to 8x vec4's: */
   IR3_DP_UCP0_X = 4,
   /* .... */
   IR3_DP_UCP7_W = 35,
   IR3_DP_VS_COUNT = 36, /* must be aligned to vec4 */

   /* fragment shader driver params: */
   IR3_DP_FS_SUBGROUP_SIZE = 0,
};
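
/* A minimal sketch (hypothetical driver-side code, nothing defined in this
 * header) of why the vec4 alignment noted above matters: for
 * glDispatchComputeIndirect() the first driver-param vec4 can be sourced
 * directly from the indirect buffer, since IR3_DP_NUM_WORK_GROUPS_{X,Y,Z}
 * occupy components 0..2 of a vec4-aligned slot.  emit_const_from_bo(),
 * base_vec4 and indirect_offset are assumed names:
 *
 *    uint32_t dp_vec4 = IR3_DP_NUM_WORK_GROUPS_X / 4;   // == 0
 *    emit_const_from_bo(ring, base_vec4 + dp_vec4, info->indirect,
 *                       indirect_offset, 1);            // one vec4
 */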

#define IR3_MAX_SHADER_BUFFERS  32
#define IR3_MAX_SHADER_IMAGES   32
#define IR3_MAX_SO_BUFFERS      4
#define IR3_MAX_SO_STREAMS      4
#define IR3_MAX_SO_OUTPUTS      64
#define IR3_MAX_UBO_PUSH_RANGES 32

/* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */
enum ir3_bary {
   IJ_PERSP_PIXEL,
   IJ_PERSP_SAMPLE,
   IJ_PERSP_CENTROID,
   IJ_PERSP_CENTER_RHW,
   IJ_LINEAR_PIXEL,
   IJ_LINEAR_CENTROID,
   IJ_LINEAR_SAMPLE,
   IJ_COUNT,
};

/* Description of what wavesizes are allowed. */
enum ir3_wavesize_option {
   IR3_SINGLE_ONLY,
   IR3_SINGLE_OR_DOUBLE,
   IR3_DOUBLE_ONLY,
};

/**
 * Description of a lowered UBO.
 */
struct ir3_ubo_info {
   uint32_t block;         /* Which constant block */
   uint16_t bindless_base; /* For bindless, which base register is used */
   bool bindless;
};

/**
 * Description of a range of a lowered UBO access.
 *
 * Drivers should not assume that a single UBO is lowered to a single
 * range; a UBO's accesses may be lowered to multiple disjoint ranges.
 */
struct ir3_ubo_range {
   struct ir3_ubo_info ubo;
   uint32_t offset;     /* start offset to push in the const register file */
   uint32_t start, end; /* range of block that's actually used */
};

struct ir3_ubo_analysis_state {
   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
   uint32_t num_enabled;
   uint32_t size;
   uint32_t
      cmdstream_size; /* for per-gen backend to stash required cmdstream size */
};

/**
 * Describes the layout of shader consts in the const register file.
 *
 * Layout of constant registers, each section aligned to vec4.  Note
 * that pointer size (ubo, etc) changes depending on generation.
 *
 *    + user consts: only used for turnip push consts
 *    + lowered UBO ranges
 *    + preamble consts
 *    + UBO addresses: turnip is bindless and these are wasted
 *    + image dimensions: a5xx only; needed to calculate pixel offset, but only
 *      for images that have image_{load,store,size,atomic*} intrinsics
 *    + kernel params: cl only
 *    + driver params: these are stage-dependent; see ir3_driver_param
 *    + TFBO addresses: only for vs on a3xx/a4xx
 *    + primitive params: these are stage-dependent
 *        vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
 *        hs, ds: uvec4(primitive_stride, vertex_stride,
 *                      patch_stride, patch_vertices_in)
 *                uvec4(tess_param_base, tess_factor_base)
 *    + primitive map
 *    + lowered immediates
 *
 * Immediates go last mostly because they are inserted in the CP pass
 * after the nir -> ir3 frontend.
 *
 * Note UBO size in bytes should be aligned to vec4
 */
struct ir3_const_state {
   unsigned num_ubos;
   unsigned num_driver_params; /* scalar */

   /* UBO that should be mapped to the NIR shader's constant_data (or -1). */
   int32_t constant_data_ubo;

   struct {
      /* user const start at zero */
      unsigned ubo;
      unsigned image_dims;
      unsigned kernel_params;
      unsigned driver_param;
      unsigned tfbo;
      unsigned primitive_param;
      unsigned primitive_map;
      unsigned immediate;
   } offsets;

   struct {
      uint32_t mask;  /* bitmask of images that have image_store */
      uint32_t count; /* number of consts allocated */
      /* three const allocated per image which has image_store:
       *   + cpp         (bytes per pixel)
       *   + pitch       (y pitch)
       *   + array_pitch (z pitch)
       */
      uint32_t off[IR3_MAX_SHADER_IMAGES];
   } image_dims;

   unsigned immediates_count;
   unsigned immediates_size;
   uint32_t *immediates;

   unsigned preamble_size;

   /* State of ubo access lowered to push consts: */
   struct ir3_ubo_analysis_state ubo_state;
   bool shared_consts_enable;
};
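
/* A hypothetical example layout (offsets in vec4 units, matching the
 * 'offsets' fields above), assuming 64-bit UBO pointers and a VS that uses
 * the full IR3_DP_VS_COUNT driver params; sections a particular shader does
 * not need are simply skipped:
 *
 *    0..3     lowered UBO ranges    (4 vec4 pushed)
 *    4..5     UBO addresses         offsets.ubo = 4, num_ubos = 4
 *    6..14    driver params         offsets.driver_param = 6 (36 dwords)
 *    15..     lowered immediates    offsets.immediate = 15
 */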

/**
 * A single output for vertex transform feedback.
 */
struct ir3_stream_output {
   unsigned register_index : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2; /**< 0 to 3 */
   unsigned num_components : 3;  /**< 1 to 4 */
   unsigned output_buffer : 3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset : 16;     /**< offset into the buffer in dwords */
   unsigned stream : 2;          /**< 0 to 3 */
};

/**
 * Stream output for vertex transform feedback.
 */
struct ir3_stream_output_info {
   unsigned num_outputs;
   /** stride for an entire vertex for each buffer in dwords */
   uint16_t stride[IR3_MAX_SO_BUFFERS];

   /* These correspond to the VPC_SO_STREAM_CNTL fields */
   uint8_t streams_written;
   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];

   /**
    * Array of stream outputs, in the order they are to be written in.
    * Selected components are tightly packed into the output buffer.
    */
   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
};

/**
 * Starting from a4xx, HW supports pre-dispatching texture sampling
 * instructions prior to scheduling a shader stage, when the
 * coordinate maps exactly to an output of the previous stage.
 */

/**
 * There is a limit in the number of pre-dispatches allowed for any
 * given stage.
 */
#define IR3_MAX_SAMPLER_PREFETCH 4

/**
 * This is the output stream value for 'cmd', as used by blob.  It may
 * encode the return type (in 3 bits) but it hasn't been verified yet.
 */
#define IR3_SAMPLER_PREFETCH_CMD          0x4
#define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6

/**
 * Stream output for texture sampling pre-dispatches.
 */
struct ir3_sampler_prefetch {
   uint8_t src;
   uint8_t samp_id;
   uint8_t tex_id;
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;
   uint8_t wrmask;
   uint8_t half_precision;
   uint8_t cmd;
};

/* Configuration key used to identify a shader variant.. different
 * shader variants can be used to implement features not supported
 * in hw (two sided color), binning-pass vertex shader, etc.
 *
 * When adding to this struct, please update ir3_shader_variant()'s debug
 * output.
 */
struct ir3_shader_key {
   union {
      struct {
         /*
          * Combined Vertex/Fragment shader parameters:
          */
         unsigned ucp_enables : 8;

         /* do we need to check {v,f}saturate_{s,t,r}? */
         unsigned has_per_samp : 1;

         /*
          * Fragment shader variant parameters:
          */
         unsigned sample_shading : 1;
         unsigned msaa : 1;
         /* used when shader needs to handle flat varyings (a4xx)
          * for front/back color inputs to frag shader:
          */
         unsigned rasterflat : 1;

         /* Indicates that this is a tessellation pipeline which requires a
          * whole different kind of vertex shader.  In case of
          * tessellation, this field also tells us which kind of output
          * topology the TES uses, which the TCS needs to know.
          */
#define IR3_TESS_NONE      0
#define IR3_TESS_QUADS     1
#define IR3_TESS_TRIANGLES 2
#define IR3_TESS_ISOLINES  3
         unsigned tessellation : 2;

         unsigned has_gs : 1;

         /* Whether stages after TCS read gl_PrimitiveID, used to determine
          * whether the TCS has to store it in the tess factor BO.
          */
         unsigned tcs_store_primid : 1;

         /* Whether this variant sticks to the "safe" maximum constlen,
          * which guarantees that the combined stages will never go over
          * the limit:
          */
         unsigned safe_constlen : 1;

         /* Whether gl_Layer must be forced to 0 because it isn't written. */
         unsigned layer_zero : 1;

         /* Whether gl_ViewportIndex must be forced to 0 because it isn't
          * written. */
         unsigned view_zero : 1;
      };
      uint32_t global;
   };

   /* bitmask of ms shifts (a3xx) */
   uint32_t vsamples, fsamples;

   /* bitmask of samplers which need astc srgb workaround (a4xx): */
   uint16_t vastc_srgb, fastc_srgb;

   /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
   uint16_t vsampler_swizzles[16];
   uint16_t fsampler_swizzles[16];
};

static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)
{
   switch (tess_mode) {
   case TESS_PRIMITIVE_ISOLINES:
      return IR3_TESS_ISOLINES;
   case TESS_PRIMITIVE_TRIANGLES:
      return IR3_TESS_TRIANGLES;
   case TESS_PRIMITIVE_QUADS:
      return IR3_TESS_QUADS;
   default:
      unreachable("bad tessmode");
   }
}

static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)
{
   /* note: this matches the stride used by ir3's build_tessfactor_base */
   switch (patch_type) {
   case IR3_TESS_ISOLINES:
      return 12;
   case IR3_TESS_TRIANGLES:
      return 20;
   case IR3_TESS_QUADS:
      return 28;
   default:
      unreachable("bad tessmode");
   }
}

static inline bool
ir3_shader_key_equal(const struct ir3_shader_key *a,
                     const struct ir3_shader_key *b)
{
   /* slow-path if we need to check {v,f}saturate_{s,t,r} */
   if (a->has_per_samp || b->has_per_samp)
      return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
   return a->global == b->global;
}

/* will the two keys produce different lowering for a fragment shader? */
static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key *key,
                          struct ir3_shader_key *last_key)
{
   if (last_key->has_per_samp || key->has_per_samp) {
      if ((last_key->fsamples != key->fsamples) ||
          (last_key->fastc_srgb != key->fastc_srgb) ||
          memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
                 sizeof(key->fsampler_swizzles)))
         return true;
   }

   if (last_key->rasterflat != key->rasterflat)
      return true;

   if (last_key->layer_zero != key->layer_zero)
      return true;

   if (last_key->ucp_enables != key->ucp_enables)
      return true;

   if (last_key->safe_constlen != key->safe_constlen)
      return true;

   return false;
}
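
/* A minimal usage sketch (hypothetical state-tracker code; 'ctx' and
 * 'last_fs_key' are assumed names): skip FS re-keying when the key change
 * cannot affect fragment shader lowering:
 *
 *    if (!ir3_shader_key_changes_fs(&new_key, &ctx->last_fs_key))
 *       return;   // keep the currently bound FS variant
 */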

/* will the two keys produce different lowering for a vertex shader? */
static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key *key,
                          struct ir3_shader_key *last_key)
{
   if (last_key->has_per_samp || key->has_per_samp) {
      if ((last_key->vsamples != key->vsamples) ||
          (last_key->vastc_srgb != key->vastc_srgb) ||
          memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
                 sizeof(key->vsampler_swizzles)))
         return true;
   }

   if (last_key->ucp_enables != key->ucp_enables)
      return true;

   if (last_key->safe_constlen != key->safe_constlen)
      return true;

   return false;
}

/**
 * On a4xx+a5xx, Images share state with textures and SSBOs:
 *
 *   + Uses texture (cat5) state/instruction (isam) to read
 *   + Uses SSBO state and instructions (cat6) to write and for atomics
 *
 * Starting with a6xx, Images and SSBOs are basically the same thing,
 * with texture state and isam also used for SSBO reads.
 *
 * On top of that, gallium makes the SSBO (shader_buffers) state semi
 * sparse, with the first half of the state space used for atomic
 * counters lowered to atomic buffers.  We could ignore this, but I
 * don't think we could *really* handle the case of a single shader
 * that used the max # of textures + images + SSBOs.  And once we are
 * offsetting images by num_ssbos (or vice versa) to map them into
 * the same hardware state, the hardware state has become coupled to
 * the shader state, so at this point we might as well just use a
 * mapping table to remap things from image/SSBO idx to hw idx.
 *
 * To make things less (more?) confusing, for the hw "SSBO" state
 * (since it is really both SSBO and Image) I'll use the name "IBO"
 */
struct ir3_ibo_mapping {
#define IBO_INVALID 0xff
   /* Maps logical SSBO state to hw tex state: */
   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];

   /* Maps logical Image state to hw tex state: */
   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];

   /* Maps hw state back to logical SSBO or Image state:
    *
    * note IBO_SSBO ORd into values to indicate that the
    * hw slot is used for SSBO state vs Image state.
    */
#define IBO_SSBO 0x80
   uint8_t tex_to_image[32];

   /* including real textures */
   uint8_t num_tex;
   /* the number of real textures, ie. image/ssbo start here */
   uint8_t tex_base;
};
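
/* An illustrative (hypothetical) mapping for a shader with two textures,
 * one SSBO and one image, assuming the SSBO/image tex state is appended
 * after the real textures, SSBOs first:
 *
 *    tex_base        = 2      num_tex = 4
 *    ssbo_to_tex[0]  = 2      (reverse entry: IBO_SSBO | 0)
 *    image_to_tex[0] = 3      (reverse entry: 0)
 */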

struct ir3_disasm_info {
   bool write_disasm;
   char *nir;
   char *disasm;
};

/* Represents half register in regid */
#define HALF_REG_ID 0x100

/**
 * Shader variant which contains the actual hw shader instructions,
 * and necessary info for shader state setup.
 */
struct ir3_shader_variant {
   struct fd_bo *bo;

   /* variant id (for debug) */
   uint32_t id;

   /* id of the shader the variant came from (for debug) */
   uint32_t shader_id;

   struct ir3_shader_key key;

   /* vertex shaders can have an extra version for hwbinning pass,
    * which is pointed to by so->binning:
    */
   bool binning_pass;
   // union {
   struct ir3_shader_variant *binning;
   struct ir3_shader_variant *nonbinning;
   // };

   struct ir3 *ir; /* freed after assembling machine instructions */

   /* shader variants form a linked list: */
   struct ir3_shader_variant *next;

   /* replicated here to avoid passing extra ptrs everywhere: */
   gl_shader_stage type;
   struct ir3_compiler *compiler;

   char *name;

   /* variant's copy of nir->constant_data (since we don't track the NIR in
    * the variant, and shader->nir is before the opt pass).  Moves to v->bin
    * after assembly.
    */
   void *constant_data;

   /*
    * Below here is serialized when written to disk cache:
    */

   /* The actual binary shader instructions, size given by info.sizedwords: */
   uint32_t *bin;

   struct ir3_const_state *const_state;

   /*
    * The following macros are used by the shader disk cache save/
    * restore paths to serialize/deserialize the variant.  Any
    * pointers that require special handling in store_variant()
    * and retrieve_variant() should go above here.
    */
#define VARIANT_CACHE_START  offsetof(struct ir3_shader_variant, info)
#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
#define VARIANT_CACHE_SIZE                                                     \
   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)

   struct ir3_info info;

   uint32_t constant_data_size;

   /* Levels of nesting of flow control:
    */
   unsigned branchstack;

   unsigned loops;

   /* the instructions length is in units of instruction groups
    * (4 instructions for a3xx, 16 instructions for a4xx.. each
    * instruction is 2 dwords):
    */
   unsigned instrlen;

   /* the constants length is in units of vec4's, and is the sum of
    * the uniforms and the built-in compiler constants
    */
   unsigned constlen;

   /* The private memory size in bytes */
   unsigned pvtmem_size;
   /* Whether we should use the new per-wave layout rather than per-fiber. */
   bool pvtmem_per_wave;

   /* Size in bytes of required shared memory */
   unsigned shared_size;

   /* About Linkage:
    *   + Let the frag shader determine the position/compmask for the
    *     varyings, since it is the place where we know if the varying
    *     is actually used, and if so, which components are used.  So
    *     what the hw calls "outloc" is taken from the "inloc" of the
    *     frag shader.
    *   + From the vert shader, we only need the output regid
    */

   bool frag_face, color0_mrt;
   uint8_t fragcoord_compmask;

   /* NOTE: for input/outputs, slot is:
    *   gl_vert_attrib  - for VS inputs
    *   gl_varying_slot - for VS output / FS input
    *   gl_frag_result  - for FS output
    */

   /* varyings/outputs: */
   unsigned outputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t view;
      bool half : 1;
   } outputs[32 + 2]; /* +POSITION +PSIZE */
   bool writes_pos, writes_smask, writes_psize, writes_stencilref;

   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;

   /* Map from location to offset in per-primitive storage.  In dwords for
    * HS, where varyings are read in the next stage via ldg with a dword
    * offset, and in bytes for all other stages.
    * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
    */
   unsigned output_loc[12 + 32];

   /* attributes (VS) / varyings (FS):
    * Note that sysval's should come *after* normal inputs.
    */
   unsigned inputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      /* location of input (ie. offset passed to bary.f, etc).  This
       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
       * have the OUTLOCn value offset by 8, presumably to account
       * for gl_Position/gl_PointSize)
       */
      uint8_t inloc;
      /* vertex shader specific: */
      bool sysval : 1; /* slot is a gl_system_value */
      /* fragment shader specific: */
      bool bary : 1;       /* fetched varying (vs one loaded into reg) */
      bool rasterflat : 1; /* special handling for emit->rasterflat */
      bool half : 1;
      bool flat : 1;
   } inputs[32 + 2]; /* +POSITION +FACE */

   /* sum of input components (scalar).  For frag shaders, it only counts
    * the varying inputs:
    */
   unsigned total_in;

   /* sum of sysval input components (scalar). */
   unsigned sysval_in;

   /* For frag shaders, the total number of inputs (not scalar,
    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
    */
   unsigned varying_in;

   /* Remapping table to map Image and SSBO to hw state: */
   struct ir3_ibo_mapping image_mapping;

   /* number of samplers/textures (which are currently 1:1): */
   int num_samp;

   /* is there an implicit sampler to read framebuffer (FS only).. if
    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
    * the last "real" texture)
    */
   bool fb_read;

   /* do we have one or more SSBO instructions: */
   bool has_ssbo;

   /* Which bindless resources are used, for filling out sp_xs_config */
   bool bindless_tex;
   bool bindless_samp;
   bool bindless_ibo;
   bool bindless_ubo;

   /* do we need derivatives: */
   bool need_pixlod;

   bool need_fine_derivatives;

   /* do we need VS driver params? */
   bool need_driver_params;

   /* do we have image write, etc (which prevents early-z): */
   bool no_earlyz;

   /* do we have kill, which also prevents early-z, but not necessarily
    * early-lrz (as long as lrz-write is disabled, which must be handled
    * outside of ir3).  Unlike other no_earlyz cases, kill doesn't have
    * side effects that prevent early-lrz discard.
    */
   bool has_kill;

   bool per_samp;

   /* Are we using split or merged register file? */
   bool mergedregs;

   uint8_t clip_mask, cull_mask;

   /* for astc srgb workaround, the number/base of additional
    * alpha tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } astc_srgb;

   /* for tg4 workaround, the number/base of additional
    * unswizzled tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } tg4;

   /* texture sampler pre-dispatches */
   uint32_t num_sampler_prefetch;
   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];

   uint16_t local_size[3];
   bool local_size_variable;

   /* Important for compute shader to determine max reg footprint */
   bool has_barrier;

   /* The offset where images start in the IBO array. */
   unsigned num_ssbos;

   /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
   unsigned num_ibos;

   unsigned num_reserved_user_consts;

   union {
      struct {
         enum tess_primitive_mode primitive_mode;

         /** The number of vertices in the TCS output patch. */
         uint8_t tcs_vertices_out;
         unsigned spacing : 2; /*gl_tess_spacing*/

         /** Is the vertex order counterclockwise? */
         bool ccw : 1;
         bool point_mode : 1;
      } tess;
      struct {
         /** The output primitive type */
         uint16_t output_primitive;

         /** The maximum number of vertices the geometry shader might write. */
         uint16_t vertices_out;

         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
         uint8_t invocations;

         /** The number of vertices received per input primitive (max. 6) */
         uint8_t vertices_in : 3;
      } gs;
      struct {
         bool early_fragment_tests : 1;
         bool color_is_dual_source : 1;
      } fs;
      struct {
         unsigned req_input_mem;
         unsigned req_local_mem;
      } cs;
   };

   enum ir3_wavesize_option api_wavesize, real_wavesize;

   /* For when we don't have a shader, variant's copy of streamout state */
   struct ir3_stream_output_info stream_output;

   struct ir3_disasm_info disasm_info;
};

static inline const char *
ir3_shader_stage(struct ir3_shader_variant *v)
{
   switch (v->type) {
   case MESA_SHADER_VERTEX:
      return v->binning_pass ? "BVERT" : "VERT";
   case MESA_SHADER_TESS_CTRL:
      return "TCS";
   case MESA_SHADER_TESS_EVAL:
      return "TES";
   case MESA_SHADER_GEOMETRY:
      return "GEOM";
   case MESA_SHADER_FRAGMENT:
      return "FRAG";
   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_KERNEL:
      return "CL";
   default:
      unreachable("invalid type");
      return NULL;
   }
}

/* Currently we do not do binning for tess.  And for GS there is no
 * cross-stage VS+GS optimization, so the full VS+GS is used in
 * the binning pass.
 */
static inline bool
ir3_has_binning_vs(const struct ir3_shader_key *key)
{
   if (key->tessellation || key->has_gs)
      return false;
   return true;
}

/**
 * Represents a shader at the API level, before state-specific variants are
 * generated.
 */
struct ir3_shader {
   gl_shader_stage type;

   /* shader id (for debug): */
   uint32_t id;
   uint32_t variant_count;

   /* Set by freedreno after shader_state_create, so we can emit debug info
    * when recompiling a shader at draw time.
    */
   bool initial_variants_done;

   struct ir3_compiler *compiler;

   unsigned num_reserved_user_consts;

   /* What API-visible wavesizes are allowed.  Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
    */
   enum ir3_wavesize_option api_wavesize;

   /* What wavesizes we're allowed to actually use.  If the API wavesize is
    * single-only, then this must be single-only too.
    */
   enum ir3_wavesize_option real_wavesize;

   bool nir_finalized;
   struct nir_shader *nir;
   struct ir3_stream_output_info stream_output;

   /* per shader stage specific info: */
   union {
      /* for compute shaders: */
      struct {
         unsigned req_input_mem; /* in dwords */
         unsigned req_local_mem;
      } cs;
   };

   struct ir3_shader_variant *variants;
   mtx_t variants_lock;

   cache_key cache_key; /* shader disk-cache key */

   /* Bitmask of bits of the shader key used by this shader.  Used to avoid
    * recompiles for GL NOS that doesn't actually apply to the shader.
    */
   struct ir3_shader_key key_mask;

   bool shared_consts_enable;
};

/**
 * In order to use the same cmdstream, in particular constlen setup and const
 * emit, for both binning and draw pass (a6xx+), the binning pass reuses its
 * corresponding draw pass shader's const_state.
 */
static inline struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant *v)
{
   if (v->binning_pass)
      return v->nonbinning->const_state;
   return v->const_state;
}
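
/* One consequence (usage sketch): cmdstream built against the draw variant's
 * const state is equally valid for its binning variant, e.g.:
 *
 *    const struct ir3_const_state *cs = ir3_const_state(v);
 *    assert(!v->binning_pass || cs == ir3_const_state(v->nonbinning));
 */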

/* Given a variant, calculate the maximum constlen it can have.
 */
static inline unsigned
ir3_max_const(const struct ir3_shader_variant *v)
{
   const struct ir3_compiler *compiler = v->compiler;
   bool shared_consts_enable = ir3_const_state(v)->shared_consts_enable;

   /* Shared consts size for CS and FS matches with what's actually used,
    * but the size of shared consts for geometry stages doesn't.
    * So we use a hw quirk for geometry shared consts.
    */
   uint32_t shared_consts_size = shared_consts_enable ?
         compiler->shared_consts_size : 0;

   uint32_t shared_consts_size_geom = shared_consts_enable ?
         compiler->geom_shared_consts_size_quirk : 0;

   uint32_t safe_shared_consts_size = shared_consts_enable ?
      ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
                     DIV_ROUND_UP(shared_consts_size, 4)), 4) : 0;

   if ((v->type == MESA_SHADER_COMPUTE) ||
       (v->type == MESA_SHADER_KERNEL)) {
      return compiler->max_const_compute - shared_consts_size;
   } else if (v->key.safe_constlen) {
      return compiler->max_const_safe - safe_shared_consts_size;
   } else if (v->type == MESA_SHADER_FRAGMENT) {
      return compiler->max_const_frag - shared_consts_size;
   } else {
      return compiler->max_const_geom - shared_consts_size_geom;
   }
}

void *ir3_shader_assemble(struct ir3_shader_variant *v);
struct ir3_shader_variant *
ir3_shader_create_variant(struct ir3_shader *shader,
                          const struct ir3_shader_key *key,
                          bool keep_ir);
struct ir3_shader_variant *
ir3_shader_get_variant(struct ir3_shader *shader,
                       const struct ir3_shader_key *key, bool binning_pass,
                       bool keep_ir, bool *created);

struct ir3_shader_options {
   unsigned reserved_user_consts;
   enum ir3_wavesize_option api_wavesize, real_wavesize;
   bool shared_consts_enable;
};

struct ir3_shader *
ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
                    const struct ir3_shader_options *options,
                    struct ir3_stream_output_info *stream_output);
uint32_t ir3_trim_constlen(struct ir3_shader_variant **variants,
                           const struct ir3_compiler *compiler);
void ir3_shader_destroy(struct ir3_shader *shader);
void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
uint64_t ir3_shader_outputs(const struct ir3_shader *so);

int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);

/*
 * Helper/util:
 */

/* clears shader-key flags which don't apply to the given shader.
 */
static inline void
ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
{
   uint32_t *key_bits = (uint32_t *)key;
   uint32_t *key_mask = (uint32_t *)&shader->key_mask;
   STATIC_ASSERT(sizeof(*key) % 4 == 0);
   for (int i = 0; i < sizeof(*key) >> 2; i++)
      key_bits[i] &= key_mask[i];
}

static inline int
ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
{
   int j;

   for (j = 0; j < so->outputs_count; j++)
      if (so->outputs[j].slot == slot)
         return j;

   /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
    * in the vertex shader.. but the fragment shader doesn't know this
    * so it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
    * at link time if there is no matching OUT.BCOLOR[n], we must map
    * OUT.COLOR[n] to IN.BCOLOR[n].  And vice versa if there is only
    * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
    */
   if (slot == VARYING_SLOT_BFC0) {
      slot = VARYING_SLOT_COL0;
   } else if (slot == VARYING_SLOT_BFC1) {
      slot = VARYING_SLOT_COL1;
   } else if (slot == VARYING_SLOT_COL0) {
      slot = VARYING_SLOT_BFC0;
   } else if (slot == VARYING_SLOT_COL1) {
      slot = VARYING_SLOT_BFC1;
   } else {
      return -1;
   }

   for (j = 0; j < so->outputs_count; j++)
      if (so->outputs[j].slot == slot)
         return j;

   return -1;
}

static inline int
ir3_next_varying(const struct ir3_shader_variant *so, int i)
{
   while (++i < so->inputs_count)
      if (so->inputs[i].compmask && so->inputs[i].bary)
         break;
   return i;
}

struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    */
   uint32_t varmask[4];

   /* Map from VS output to location. */
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      uint8_t loc;
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays */
   uint8_t clip0_loc, clip1_loc;
};

static inline void
ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
             uint8_t compmask, uint8_t loc)
{
   for (int j = 0; j < util_last_bit(compmask); j++) {
      uint8_t comploc = loc + j;
      l->varmask[comploc / 32] |= 1 << (comploc % 32);
   }

   l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));

   if (regid_ != regid(63, 0)) {
      int i = l->cnt++;
      assert(i < ARRAY_SIZE(l->var));

      l->var[i].slot = slot;
      l->var[i].regid = regid_;
      l->var[i].compmask = compmask;
      l->var[i].loc = loc;
   }
}

static inline void
ir3_link_shaders(struct ir3_shader_linkage *l,
                 const struct ir3_shader_variant *vs,
                 const struct ir3_shader_variant *fs, bool pack_vs_out)
{
   /* On older platforms, varmask isn't programmed at all, and it appears
    * that the hardware generates a mask of used VPC locations using the VS
    * output map, and hangs if a FS bary instruction references a location
    * not in the list.  This means that we need to have a dummy entry in the
    * VS out map for things like gl_PointCoord which aren't written by the
    * VS.  Furthermore we can't use r63.x, so just pick a random register to
    * use if there is no VS output.
    */
   const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
   int j = -1, k;

   l->primid_loc = 0xff;
   l->viewid_loc = 0xff;
   l->clip0_loc = 0xff;
   l->clip1_loc = 0xff;

   while (l->cnt < ARRAY_SIZE(l->var)) {
      j = ir3_next_varying(fs, j);

      if (j >= fs->inputs_count)
         break;

      if (fs->inputs[j].inloc >= fs->total_in)
         continue;

      k = ir3_find_output(vs, fs->inputs[j].slot);

      if (k < 0 && fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
         l->primid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
         assert(k < 0);
         l->viewid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
         l->clip0_loc = fs->inputs[j].inloc;

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
         l->clip1_loc = fs->inputs[j].inloc;

      ir3_link_add(l, fs->inputs[j].slot,
                   k >= 0 ? vs->outputs[k].regid : default_regid,
                   fs->inputs[j].compmask, fs->inputs[j].inloc);
   }
}

static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
{
   int j;
   for (j = 0; j < so->outputs_count; j++)
      if (so->outputs[j].slot == slot) {
         uint32_t regid = so->outputs[j].regid;
         if (so->outputs[j].half)
            regid |= HALF_REG_ID;
         return regid;
      }
   return regid(63, 0);
}

void ir3_link_stream_out(struct ir3_shader_linkage *l,
                         const struct ir3_shader_variant *v);

#define VARYING_SLOT_GS_HEADER_IR3       (VARYING_SLOT_MAX + 0)
#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
#define VARYING_SLOT_TCS_HEADER_IR3      (VARYING_SLOT_MAX + 2)
#define VARYING_SLOT_REL_PATCH_ID_IR3    (VARYING_SLOT_MAX + 3)

static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
{
   int j;
   for (j = 0; j < so->inputs_count; j++)
      if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
         return so->inputs[j].regid;
   return regid(63, 0);
}

/* calculate register footprint in terms of half-regs (ie. one full
 * reg counts as two half-regs).
 */
static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant *v)
{
   return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
}

static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant *v)
{
   return v->num_ibos;
}

static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
{
   /* Dummy shader */
   if (!v->compiler)
      return 0;

   if (v->compiler->gen < 5)
      return v->branchstack;

   if (v->branchstack > 0) {
      uint32_t branchstack = v->branchstack / 2 + 1;
      return MIN2(branchstack, v->compiler->branchstack_size / 2);
   } else {
      return 0;
   }
}

#endif /* IR3_SHADER_H_ */