/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/* The compiler middle-end architecture: Explaining (non-)monolithic shaders
 * -------------------------------------------------------------------------
 *
 * Typically, there is one-to-one correspondence between API and HW shaders,
 * that is, for every API shader, there is exactly one shader binary in
 * the driver.
 *
 * The problem with that is that we also have to emulate some API states
 * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
 * to deal with it are:
 * - each shader has multiple variants for each combination of emulated states,
 *   and the variants are compiled on demand, possibly relying on a shader
 *   cache for good performance
 * - patch shaders at the binary level
 *
 * This driver uses something completely different. The emulated states are
 * usually implemented at the beginning or end of shaders. Therefore, we can
 * split the shader into 3 parts:
 * - prolog part (shader code dependent on states)
 * - main part (the API shader)
 * - epilog part (shader code dependent on states)
 *
 * Each part is compiled as a separate shader and the final binaries are
 * concatenated. This type of shader is called non-monolithic, because it
 * consists of multiple independent binaries. Creating a new shader variant
 * is therefore only a concatenation of shader parts (binaries) and doesn't
 * involve any compilation. The main shader parts are the only parts that are
 * compiled when applications create shader objects. The prolog and epilog
 * parts are compiled on the first use and saved, so that their binaries can
 * be reused by many other shaders.
 *
 * One of the roles of the prolog part is to compute vertex buffer addresses
 * for vertex shaders. A few of the roles of the epilog part are color buffer
 * format conversions in pixel shaders that we have to do manually, and write
 * tessellation factors in tessellation control shaders. The prolog and epilog
 * have many other important responsibilities in various shader stages.
 * They don't just "emulate legacy stuff".
 *
 * Monolithic shaders are shaders where the parts are combined before LLVM
 * compilation, and the whole thing is compiled and optimized as one unit with
 * one binary on the output. The result is the same as the non-monolithic
 * shader, but the final code can be better, because LLVM can optimize across
 * all shader parts. Monolithic shaders aren't usually used except for these
 * special cases:
 *
 * 1) Some rarely-used states require modification of the main shader part
 *    itself, and in such cases, only the monolithic shader variant is
 *    compiled, and that's always done on the first use.
 *
 * 2) When we do cross-stage optimizations for separate shader objects and
 *    e.g. eliminate unused shader varyings, the resulting optimized shader
 *    variants are always compiled as monolithic shaders, and always
 *    asynchronously (i.e. not stalling ongoing rendering). We call them
 *    "optimized monolithic" shaders. The important property here is that
 *    the non-monolithic unoptimized shader variant is always available for use
 *    when the asynchronous compilation of the optimized shader is not done
 *    yet.
 *
 * Starting with GFX9 chips, some shader stages are merged, and the number of
 * shader parts per shader increased. The complete new list of shader parts is:
 * - 1st shader: prolog part
 * - 1st shader: main part
 * - 2nd shader: prolog part
 * - 2nd shader: main part
 * - 2nd shader: epilog part
 */

/* How linking shader inputs and outputs between vertex, tessellation, and
 * geometry shaders works.
 *
 * Inputs and outputs between shaders are stored in a buffer. This buffer
 * lives in LDS (typical case for tessellation), but it can also live
 * in memory (ESGS). Each input or output has a fixed location within a vertex.
 * The highest used input or output determines the stride between vertices.
 *
 * Since GS and tessellation are only possible in the OpenGL core profile,
 * only these semantics are valid for per-vertex data:
 *
 *   Name             Location
 *
 *   POSITION         0
 *   PSIZE            1
 *   CLIPDIST0..1     2..3
 *   CULLDIST0..1     (not implemented)
 *   GENERIC0..31     4..35
 *
 * For example, a shader only writing GENERIC0 has the output stride of 5.
 *
 * Only these semantics are valid for per-patch data:
 *
 *   Name             Location
 *
 *   TESSOUTER        0
 *   TESSINNER        1
 *   PATCH0..29       2..31
 *
 * That's how independent shaders agree on input and output locations.
 * The si_shader_io_get_unique_index function assigns the locations.
 *
 * For tessellation, other required information for calculating the input and
 * output addresses like the vertex stride, the patch stride, and the offsets
 * where per-vertex and per-patch data start, is passed to the shader via
 * user data SGPRs. The offsets and strides are calculated at draw time and
 * aren't available at compile time.
 */

131#ifndef SI_SHADER_H
132#define SI_SHADER_H
133
134#include "ac_binary.h"
135#include "ac_llvm_build.h"
136#include "ac_llvm_util.h"
137#include "util/simple_mtx.h"
138#include "util/u_inlines.h"
139#include "util/u_live_shader_cache.h"
140#include "util/u_queue.h"
141#include "si_pm4.h"
142
143#include <stdio.h>
144
145#ifdef __cplusplus
146extern "C" {
147#endif
148
/* Use LDS symbols when supported by LLVM. Can be disabled for testing the old
 * path on newer LLVM for now. Should be removed in the long term.
 */
#define USE_LDS_SYMBOLS (true)

/* Opaque forward declarations of types that are defined elsewhere. */
struct nir_shader;
struct si_shader;
struct si_context;

/* Driver-wide limits and masks. */
#define SI_MAX_ATTRIBS    16
#define SI_MAX_VS_OUTPUTS 40
#define SI_USER_CLIP_PLANE_MASK  0x3F

/* Bits 9, 19 and 29 combined into a single mask. */
#define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29))

/* PS input control values. Both use OFFSET 0x20 and differ only in DEFAULT_VAL
 * (0 for "0000", 3 for "0001").
 */
#define SI_PS_INPUT_CNTL_0000          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
#define SI_PS_INPUT_CNTL_0001          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
#define SI_PS_INPUT_CNTL_UNUSED        SI_PS_INPUT_CNTL_0000
/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001

169/* SGPR user data indices */
170enum
171{
172   SI_SGPR_INTERNAL_BINDINGS,
173   SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
174   SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
175   SI_SGPR_SAMPLERS_AND_IMAGES,
176   SI_NUM_RESOURCE_SGPRS,
177
178   /* API VS, TES without GS, GS copy shader */
179   SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
180   SI_NUM_VS_STATE_RESOURCE_SGPRS,
181
182   /* all VS variants */
183   SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
184   SI_SGPR_DRAWID,
185   SI_SGPR_START_INSTANCE,
186   SI_VS_NUM_USER_SGPR,
187
188   SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
189
190   /* TES */
191   SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
192   SI_SGPR_TES_OFFCHIP_ADDR,
193   SI_TES_NUM_USER_SGPR,
194
195   /* GFX6-8: TCS only */
196   GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
197   GFX6_SGPR_TCS_OUT_OFFSETS,
198   GFX6_SGPR_TCS_OUT_LAYOUT,
199   GFX6_SGPR_TCS_IN_LAYOUT,
200   GFX6_TCS_NUM_USER_SGPR,
201
202   /* GFX9: Merged LS-HS (VS-TCS) only. */
203   GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
204   GFX9_SGPR_TCS_OUT_OFFSETS,
205   GFX9_SGPR_TCS_OUT_LAYOUT,
206   GFX9_TCS_NUM_USER_SGPR,
207
208   /* GS limits */
209   GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
210   SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
211
212   GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
213   GFX9_SGPR_ATTRIBUTE_RING_ADDR,
214   GFX9_GS_NUM_USER_SGPR,
215
216   /* PS only */
217   SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
218   SI_PS_NUM_USER_SGPR,
219
220   /* The value has to be 12, because the hw requires that descriptors
221    * are aligned to 4 SGPRs.
222    */
223   SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
224};
225
/* LLVM function parameter indices */
enum
{
   SI_NUM_RESOURCE_PARAMS = 4,

   /* PS only parameters */
   SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
   SI_PARAM_PRIM_MASK,
   SI_PARAM_PERSP_SAMPLE,
   SI_PARAM_PERSP_CENTER,
   SI_PARAM_PERSP_CENTROID,
   SI_PARAM_PERSP_PULL_MODEL,
   SI_PARAM_LINEAR_SAMPLE,
   SI_PARAM_LINEAR_CENTER,
   SI_PARAM_LINEAR_CENTROID,
   SI_PARAM_LINE_STIPPLE_TEX,
   SI_PARAM_POS_X_FLOAT,
   SI_PARAM_POS_Y_FLOAT,
   SI_PARAM_POS_Z_FLOAT,
   SI_PARAM_POS_W_FLOAT,
   SI_PARAM_FRONT_FACE,
   SI_PARAM_ANCILLARY,
   SI_PARAM_SAMPLE_COVERAGE,
   SI_PARAM_POS_FIXED_PT,

   /* +1 turns the last index into a count, +8 for COLOR[0..1]. */
   SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9,
};

/* Bit-field descriptions for the 32-bit vs_state_bits value. Each field is a
 * <name>__SHIFT / <name>__MASK pair, where the mask is given unshifted; the
 * pairs are consumed by ENCODE_FIELD, CLEAR_FIELD, SET_FIELD and GET_FIELD.
 */

/* These fields are only set in current_vs_state (except INDEXED) in si_context, and they are
 * accessible in the shader via vs_state_bits in VS, TES, and GS.
 */
#define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT   0
#define VS_STATE_CLAMP_VERTEX_COLOR__MASK    0x1 /* Shared by VS and GS */
#define VS_STATE_INDEXED__SHIFT              1
#define VS_STATE_INDEXED__MASK               0x1 /* Shared by VS and GS */

/* These fields are only set in current_vs_state in si_context, and they are accessible
 * in the shader via vs_state_bits in LS/HS.
 */
/* bit gap */
#define VS_STATE_LS_OUT_PATCH_SIZE__SHIFT    11
#define VS_STATE_LS_OUT_PATCH_SIZE__MASK     0x1fff
#define VS_STATE_LS_OUT_VERTEX_SIZE__SHIFT   24
#define VS_STATE_LS_OUT_VERTEX_SIZE__MASK    0xff

/* These fields are only set in current_gs_state in si_context, and they are accessible
 * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
 */
/* bit gap */
#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT 18
#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK  0xf
#define GS_STATE_SMALL_PRIM_PRECISION__SHIFT    22
#define GS_STATE_SMALL_PRIM_PRECISION__MASK     0xf
#define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 26
#define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK  0x1
#define GS_STATE_PROVOKING_VTX_INDEX__SHIFT     27
#define GS_STATE_PROVOKING_VTX_INDEX__MASK      0x3
#define GS_STATE_OUTPRIM__SHIFT                 29
#define GS_STATE_OUTPRIM__MASK                  0x3
#define GS_STATE_PIPELINE_STATS_EMU__SHIFT      31
#define GS_STATE_PIPELINE_STATS_EMU__MASK       0x1

/* Helpers for fields described by a <field>__SHIFT / <field>__MASK pair,
 * where the mask is given unshifted.
 */

/* Truncate "value" to the width of the field and move it into position. */
#define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT)

/* Mask that keeps every bit except those belonging to the field. */
#define CLEAR_FIELD(field) (~((unsigned)field##__MASK << field##__SHIFT))

/* This is called by functions that change states.
 *
 * Replaces the field inside "var" with "value" and asserts that the value
 * fits into the field. "value" is evaluated exactly once, regardless of
 * whether assertions are compiled in, so arguments with side effects behave
 * the same in debug and release builds. "value" is expected to be a
 * non-negative integer.
 */
#define SET_FIELD(var, field, value) do { \
   const unsigned long long si_set_field_value_ = (unsigned long long)(value); \
   assert(si_set_field_value_ == ((unsigned)si_set_field_value_ & field##__MASK)); \
   (var) &= CLEAR_FIELD(field); \
   (var) |= ENCODE_FIELD(field, si_set_field_value_); \
} while (0)

/* This is called during shader compilation and returns LLVMValueRef.
 *
 * Unpacks the field from (ctx)->vs_state_bits; the field width passed to
 * si_unpack_param is the number of bits set in the mask.
 */
#define GET_FIELD(ctx, field) si_unpack_param((ctx), (ctx)->vs_state_bits, field##__SHIFT, \
                                             util_bitcount(field##__MASK))

/* Blit vertex shader SGPR layouts. */
enum
{
   /* These represent the number of SGPRs the shader uses. */
   SI_VS_BLIT_SGPRS_POS = 3,
   SI_VS_BLIT_SGPRS_POS_COLOR = 7,
   SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
};

/* NGG culling flags. Bits 0..4 are individual flags; bits 5..12 hold an 8-bit
 * clip plane enable mask that is packed and unpacked by the two macros below.
 */
#define SI_NGG_CULL_TRIANGLES                (1 << 0)   /* this implies W, view.xy, and small prim culling */
#define SI_NGG_CULL_BACK_FACE                (1 << 1)   /* back faces */
#define SI_NGG_CULL_FRONT_FACE               (1 << 2)   /* front faces */
#define SI_NGG_CULL_LINES                    (1 << 3)   /* the primitive type is lines */
#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4)   /* cull small lines according to the diamond exit rule */
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 5) & 0xff)

/* SI_PROFILE_* flags; si_shader_info::options is a bitmask of these. */
#define SI_PROFILE_WAVE32                    (1 << 0)
#define SI_PROFILE_WAVE64                    (1 << 1)
#define SI_PROFILE_IGNORE_LLVM13_DISCARD_BUG (1 << 2)
#define SI_PROFILE_VS_NO_BINNING             (1 << 3)
#define SI_PROFILE_PS_NO_BINNING             (1 << 4)
#define SI_PROFILE_CLAMP_DIV_BY_ZERO         (1 << 5)

/**
 * For VS shader keys, describe any fixups required for vertex fetch.
 *
 * \ref log_size, \ref format, and the number of channels are interpreted as
 * by \ref ac_build_opencoded_load_format.
 *
 * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
 * impossible format and indicates that no fixup is needed (just use
 * buffer_load_format_xyzw).
 *
 * The bit-fields add up to 8 bits; \ref bits overlays them as a single byte.
 */
union si_vs_fix_fetch {
   struct {
      uint8_t log_size : 2;        /* log2 of bytes per channel: 1, 2, 4, or 8 bytes */
      uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
      uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
      uint8_t reverse : 1;         /* reverse XYZ channels */
   } u;
   uint8_t bits;
};

345struct si_shader;
346
347/* State of the context creating the shader object. */
348struct si_compiler_ctx_state {
349   /* Should only be used by si_init_shader_selector_async and
350    * si_build_shader_variant if thread_index == -1 (non-threaded). */
351   struct ac_llvm_compiler *compiler;
352
353   /* Used if thread_index == -1 or if debug.async is true. */
354   struct util_debug_callback debug;
355
356   /* Used for creating the log string for gallium/ddebug. */
357   bool is_debug_context;
358};
359
/* Color output type; si_shader_info::output_color_types stores one of these
 * per bit pair.
 */
enum si_color_output_type {
   SI_TYPE_ANY32,
   SI_TYPE_FLOAT16,
   SI_TYPE_INT16,
   SI_TYPE_UINT16,
};

367union si_input_info {
368   struct {
369      ubyte semantic;
370      ubyte interpolate;
371      ubyte fp16_lo_hi_valid;
372      ubyte usage_mask;
373   };
374   uint32_t _unused; /* this just forces 4-byte alignment */
375};
376
377struct si_shader_info {
378   shader_info base;
379
380   uint32_t options; /* bitmask of SI_PROFILE_* */
381
382   ubyte num_inputs;
383   ubyte num_outputs;
384   union si_input_info input[PIPE_MAX_SHADER_INPUTS];
385   ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS];
386   ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
387   ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS];
388   ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
389   ubyte output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
390
391   ubyte num_vs_inputs;
392   ubyte num_vbos_in_user_sgprs;
393   ubyte num_stream_output_components[4];
394   uint16_t enabled_streamout_buffer_mask;
395
396   uint64_t inputs_read; /* "get_unique_index" bits */
397   uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
398
399   uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
400   uint64_t outputs_written;           /* "get_unique_index" bits */
401   uint32_t patch_outputs_written;     /* "get_unique_index_patch" bits */
402
403   ubyte clipdist_mask;
404   ubyte culldist_mask;
405
406   uint16_t lshs_vertex_stride;
407   uint16_t esgs_itemsize; /* vertex stride */
408   uint16_t gsvs_vertex_size;
409   ubyte gs_input_verts_per_prim;
410   unsigned max_gsvs_emit_size;
411
412   /* Set 0xf or 0x0 (4 bits) per each written output.
413    * ANDed with spi_shader_col_format.
414    */
415   unsigned colors_written_4bit;
416
417   int constbuf0_num_slots;
418   uint num_memory_stores;
419   ubyte color_attr_index[2];
420   ubyte color_interpolate[2];
421   ubyte color_interpolate_loc[2];
422   ubyte colors_read; /**< which color components are read by the FS */
423   ubyte colors_written;
424   uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
425   bool vs_needs_prolog;
426   bool color0_writes_all_cbufs; /**< gl_FragColor */
427   bool reads_samplemask;   /**< does fragment shader read sample mask? */
428   bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
429   bool writes_z;           /**< does fragment shader write Z value? */
430   bool writes_stencil;     /**< does fragment shader write stencil value? */
431   bool writes_samplemask;  /**< does fragment shader write sample mask? */
432   bool writes_edgeflag;    /**< vertex shader outputs edgeflag */
433   bool uses_interp_color;
434   bool uses_persp_center_color;
435   bool uses_persp_centroid_color;
436   bool uses_persp_sample_color;
437   bool uses_persp_center;
438   bool uses_persp_centroid;
439   bool uses_persp_sample;
440   bool uses_linear_center;
441   bool uses_linear_centroid;
442   bool uses_linear_sample;
443   bool uses_interp_at_sample;
444   bool uses_instanceid;
445   bool uses_base_vertex;
446   bool uses_base_instance;
447   bool uses_drawid;
448   bool uses_primid;
449   bool uses_frontface;
450   bool uses_invocationid;
451   bool uses_thread_id[3];
452   bool uses_block_id[3];
453   bool uses_variable_block_size;
454   bool uses_grid_size;
455   bool uses_subgroup_info;
456   bool writes_position;
457   bool writes_psize;
458   bool writes_clipvertex;
459   bool writes_primid;
460   bool writes_viewport_index;
461   bool writes_layer;
462   bool uses_bindless_samplers;
463   bool uses_bindless_images;
464   bool uses_indirect_descriptor;
465   bool has_divergent_loop;
466
467   bool uses_vmem_sampler_or_bvh;
468   bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
469
470   /** Whether all codepaths write tess factors in all invocations. */
471   bool tessfactors_are_def_in_all_invocs;
472
473   /* A flag to check if vrs2x2 can be enabled to reduce number of
474    * fragment shader invocations if flat shading.
475    */
476   bool allow_flat_shading;
477
478   /* Optimization: if the texture bound to this texunit has been cleared to 1,
479    * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
480    * value is 0xff (undetermined) and can be later changed to 0 (= false) or
481    * texunit + 1.
482    */
483   uint8_t writes_1_if_tex_is_1;
484};
485
486/* A shader selector is a gallium CSO and contains shader variants and
487 * binaries for one NIR program. This can be shared by multiple contexts.
488 */
489struct si_shader_selector {
490   struct util_live_shader base;
491   struct si_screen *screen;
492   struct util_queue_fence ready;
493   struct si_compiler_ctx_state compiler_ctx_state;
494   gl_shader_stage stage;
495
496   simple_mtx_t mutex;
497   union si_shader_key *keys;
498   unsigned variants_count;
499   unsigned variants_max_count;
500   struct si_shader **variants;
501
502   /* The compiled NIR shader without a prolog and/or epilog (not
503    * uploaded to a buffer object).
504    */
505   struct si_shader *main_shader_part;
506   struct si_shader *main_shader_part_ls;     /* as_ls is set in the key */
507   struct si_shader *main_shader_part_es;     /* as_es is set in the key */
508   struct si_shader *main_shader_part_ngg;    /* as_ngg is set in the key */
509   struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
510
511   struct nir_shader *nir;
512   void *nir_binary;
513   unsigned nir_size;
514
515   struct si_shader_info info;
516
517   enum pipe_shader_type pipe_shader_type;
518   ubyte const_and_shader_buf_descriptors_index;
519   ubyte sampler_and_images_descriptors_index;
520   ubyte cs_shaderbufs_sgpr_index;
521   ubyte cs_num_shaderbufs_in_user_sgprs;
522   ubyte cs_images_sgpr_index;
523   ubyte cs_images_num_sgprs;
524   ubyte cs_num_images_in_user_sgprs;
525   unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
526   enum pipe_prim_type rast_prim;
527
528   /* GS parameters. */
529   bool tess_turns_off_ngg;
530
531   /* bitmasks of used descriptor slots */
532   uint64_t active_const_and_shader_buffers;
533   uint64_t active_samplers_and_images;
534};
535
/* Valid shader configurations:
 *
 * API shaders           VS | TCS | TES | GS |pass| PS
 * are compiled as:         |     |     |    |thru|
 *                          |     |     |    |    |
 * Only VS & PS:         VS |     |     |    |    | PS
 * GFX6     - with GS:   ES |     |     | GS | VS | PS
 *          - with tess: LS | HS  | VS  |    |    | PS
 *          - with both: LS | HS  | ES  | GS | VS | PS
 * GFX9     - with GS:   -> |     |     | GS | VS | PS
 *          - with tess: -> | HS  | VS  |    |    | PS
 *          - with both: -> | HS  | ->  | GS | VS | PS
 *                          |     |     |    |    |
 * NGG      - VS & PS:   GS |     |     |    |    | PS
 * (GFX10+) - with GS:   -> |     |     | GS |    | PS
 *          - with tess: -> | HS  | GS  |    |    | PS
 *          - with both: -> | HS  | ->  | GS |    | PS
 *
 * -> = merged with the next stage
 */

/* Use the byte alignment for all following structure members for optimal
 * shader key memory footprint.
 */
#pragma pack(push, 1)

/* Common VS bits between the shader key and the prolog key. */
struct si_vs_prolog_bits {
   /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
    *   divisor is 0.
    * - If "is_one" has a bit set, the instance divisor is 1.
    * - If "is_fetched" has a bit set, the instance divisor will be loaded
    *   from the constant buffer.
    */
   uint16_t instance_divisor_is_one;     /* bitmask of inputs */
   uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
   unsigned ls_vgpr_fix : 1;
};

/* Common TCS bits between the shader key and the epilog key. */
struct si_tcs_epilog_bits {
   unsigned prim_mode : 3;
   unsigned invoc0_tess_factors_are_def : 1;
   unsigned tes_reads_tess_factors : 1;
};

/* Common PS bits between the shader key and the prolog key. */
struct si_ps_prolog_bits {
   unsigned color_two_side : 1;
   unsigned flatshade_colors : 1;
   unsigned poly_stipple : 1;
   unsigned force_persp_sample_interp : 1;
   unsigned force_linear_sample_interp : 1;
   unsigned force_persp_center_interp : 1;
   unsigned force_linear_center_interp : 1;
   unsigned bc_optimize_for_persp : 1;
   unsigned bc_optimize_for_linear : 1;
   unsigned samplemask_log_ps_iter : 3;
};

/* Common PS bits between the shader key and the epilog key. */
struct si_ps_epilog_bits {
   unsigned spi_shader_col_format;
   unsigned color_is_int8 : 8;
   unsigned color_is_int10 : 8;
   unsigned last_cbuf : 3;
   unsigned alpha_func : 3;
   unsigned alpha_to_one : 1;
   unsigned alpha_to_coverage_via_mrtz : 1;  /* gfx11+ */
   unsigned clamp_color : 1;
   unsigned dual_src_blend_swizzle : 1;      /* gfx11+ */
};

609union si_shader_part_key {
610   struct {
611      struct si_vs_prolog_bits states;
612      unsigned wave32 : 1;
613      unsigned num_input_sgprs : 6;
614      /* For merged stages such as LS-HS, HS input VGPRs are first. */
615      unsigned num_merged_next_stage_vgprs : 3;
616      unsigned num_inputs : 5;
617      unsigned as_ls : 1;
618      unsigned as_es : 1;
619      unsigned as_ngg : 1;
620      unsigned load_vgprs_after_culling : 1;
621      /* Prologs for monolithic shaders shouldn't set EXEC. */
622      unsigned is_monolithic : 1;
623   } vs_prolog;
624   struct {
625      struct si_tcs_epilog_bits states;
626      unsigned wave32 : 1;
627      unsigned noop_s_barrier : 1;
628   } tcs_epilog;
629   struct {
630      struct si_ps_prolog_bits states;
631      unsigned wave32 : 1;
632      unsigned num_input_sgprs : 6;
633      unsigned num_input_vgprs : 5;
634      /* Color interpolation and two-side color selection. */
635      unsigned colors_read : 8;       /* color input components read */
636      unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
637      unsigned face_vgpr_index : 5;
638      unsigned ancillary_vgpr_index : 5;
639      unsigned sample_coverage_vgpr_index : 5;
640      unsigned wqm : 1;
641      char color_attr_index[2];
642      signed char color_interp_vgpr_index[2]; /* -1 == constant */
643   } ps_prolog;
644   struct {
645      struct si_ps_epilog_bits states;
646      unsigned wave32 : 1;
647      unsigned uses_discard : 1;
648      unsigned colors_written : 8;
649      unsigned color_types : 16;
650      unsigned writes_z : 1;
651      unsigned writes_stencil : 1;
652      unsigned writes_samplemask : 1;
653   } ps_epilog;
654};
655
656/* The shader key for geometry stages (VS, TCS, TES, GS) */
657struct si_shader_key_ge {
658   /* Prolog and epilog flags. */
659   union {
660      struct {
661         struct si_vs_prolog_bits prolog;
662      } vs;
663      struct {
664         struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
665         struct si_shader_selector *ls;      /* for merged LS-HS */
666         struct si_tcs_epilog_bits epilog;
667      } tcs; /* tessellation control shader */
668      struct {
669         struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
670         struct si_shader_selector *es;      /* for merged ES-GS */
671      } gs;
672   } part;
673
674   /* These three are initially set according to the NEXT_SHADER property,
675    * or guessed if the property doesn't seem correct.
676    */
677   unsigned as_es : 1;  /* whether it's a shader before GS */
678   unsigned as_ls : 1;  /* whether it's VS before TCS */
679   unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
680                           also set for the stage right before GS */
681
682   /* Flags for monolithic compilation only. */
683   struct {
684      /* Whether fetch should be opencoded according to vs_fix_fetch.
685       * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
686       * with minimal fixups is used. */
687      uint16_t vs_fetch_opencode;
688      union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
689
690      union {
691         /* When PS needs PrimID and GS is disabled. */
692         unsigned vs_export_prim_id : 1;    /* VS and TES only */
693         unsigned gs_tri_strip_adj_fix : 1; /* GS only */
694      } u;
695   } mono;
696
697   /* Optimization flags for asynchronous compilation only. */
698   struct {
699      /* For HW VS (it can be VS, TES, GS) */
700      uint64_t kill_outputs; /* "get_unique_index" bits */
701      unsigned kill_clip_distances : 8;
702      unsigned kill_pointsize : 1;
703      unsigned remove_streamout : 1;
704
705      /* For NGG VS and TES. */
706      unsigned ngg_culling : 13; /* SI_NGG_CULL_* */
707
708      /* For shaders where monolithic variants have better code.
709       *
710       * This is a flag that has no effect on code generation,
711       * but forces monolithic shaders to be used as soon as
712       * possible, because it's in the "opt" group.
713       */
714      unsigned prefer_mono : 1;
715
716      /* VS and TCS have the same number of patch vertices. */
717      unsigned same_patch_vertices:1;
718
719      unsigned inline_uniforms:1;
720
721      /* This must be kept last to limit the number of variants
722       * depending only on the uniform values.
723       */
724      uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
725   } opt;
726};
727
728struct si_shader_key_ps {
729   struct {
730      /* Prolog and epilog flags. */
731      struct si_ps_prolog_bits prolog;
732      struct si_ps_epilog_bits epilog;
733   } part;
734
735   /* Flags for monolithic compilation only. */
736   struct {
737      unsigned poly_line_smoothing : 1;
738      unsigned point_smoothing : 1;
739      unsigned interpolate_at_sample_force_center : 1;
740      unsigned fbfetch_msaa : 1;
741      unsigned fbfetch_is_1D : 1;
742      unsigned fbfetch_layered : 1;
743   } mono;
744
745   /* Optimization flags for asynchronous compilation only. */
746   struct {
747      /* For shaders where monolithic variants have better code.
748       *
749       * This is a flag that has no effect on code generation,
750       * but forces monolithic shaders to be used as soon as
751       * possible, because it's in the "opt" group.
752       */
753      unsigned prefer_mono : 1;
754      unsigned inline_uniforms:1;
755
756      /* This must be kept last to limit the number of variants
757       * depending only on the uniform values.
758       */
759      uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
760   } opt;
761};
762
763union si_shader_key {
764   struct si_shader_key_ge ge; /* geometry engine shaders */
765   struct si_shader_key_ps ps;
766};
767
768/* Restore the pack alignment to default. */
769#pragma pack(pop)
770
771/* GCN-specific shader info. */
772struct si_shader_binary_info {
773   ubyte vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
774   uint64_t vs_output_param_mask; /* which params to export, indexed by "base" */
775   uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
776   ubyte num_input_sgprs;
777   ubyte num_input_vgprs;
778   bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
779   bool uses_vmem_sampler_or_bvh;
780   signed char face_vgpr_index;
781   signed char ancillary_vgpr_index;
782   signed char sample_coverage_vgpr_index;
783   bool uses_instanceid;
784   ubyte nr_pos_exports;
785   ubyte nr_param_exports;
786   unsigned private_mem_vgprs;
787   unsigned max_simd_waves;
788};
789
/* Buffers that make up a shader binary: the ELF image, the uploaded code, and
 * the LLVM IR text. Each buffer pointer except the IR string is paired with
 * its size.
 */
struct si_shader_binary {
   const char *elf_buffer;
   size_t elf_size;

   char *uploaded_code;
   size_t uploaded_code_size;

   char *llvm_ir_string;
};

/* GFX9 geometry shader subgroup parameters. */
struct gfx9_gs_info {
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;
   unsigned esgs_ring_size; /* in bytes */
};

/* Width of the VGT stages key in bits, and the number of distinct key values. */
#define SI_NUM_VGT_STAGES_KEY_BITS 8
#define SI_NUM_VGT_STAGES_STATES   (1 << SI_NUM_VGT_STAGES_KEY_BITS)

/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
 * Some fields are set by state-change calls, most are set by draw_vbo.
 *
 * The eight 1-bit flags are overlaid with "index"; the two preprocessor
 * branches declare the flags in opposite order depending on endianness.
 */
union si_vgt_stages_key {
   struct {
#if UTIL_ARCH_LITTLE_ENDIAN
      uint8_t tess : 1;
      uint8_t gs : 1;
      uint8_t ngg_passthrough : 1;
      uint8_t ngg : 1;       /* gfx10+ */
      uint8_t streamout : 1; /* only used with NGG */
      uint8_t hs_wave32 : 1;
      uint8_t gs_wave32 : 1;
      uint8_t vs_wave32 : 1;
#else /* UTIL_ARCH_BIG_ENDIAN */
      uint8_t vs_wave32 : 1;
      uint8_t gs_wave32 : 1;
      uint8_t hs_wave32 : 1;
      uint8_t streamout : 1;
      uint8_t ngg : 1;
      uint8_t ngg_passthrough : 1;
      uint8_t gs : 1;
      uint8_t tess : 1;
#endif
   } u;
   uint8_t index;
};

839struct si_shader {
840   struct si_pm4_state pm4; /* base class */
841   struct si_compiler_ctx_state compiler_ctx_state;
842
843   struct si_shader_selector *selector;
844   struct si_shader_selector *previous_stage_sel; /* for refcounting */
845
846   struct si_shader_part *prolog;
847   struct si_shader *previous_stage; /* for GFX9 */
848   struct si_shader_part *epilog;
849   struct si_shader *gs_copy_shader;
850
851   struct si_resource *bo;
852   struct si_resource *scratch_bo;
853   union si_shader_key key;
854   struct util_queue_fence ready;
855   bool compilation_failed;
856   bool is_monolithic;
857   bool is_optimized;
858   bool is_binary_shared;
859   bool is_gs_copy_shader;
860   uint8_t wave_size;
861
862   /* The following data is all that's needed for binary shaders. */
863   struct si_shader_binary binary;
864   struct ac_shader_config config;
865   struct si_shader_binary_info info;
866
867   /* SI_SGPR_VS_STATE_BITS */
868   bool uses_vs_state_provoking_vertex;
869   bool uses_gs_state_outprim;
870
871   bool uses_base_instance;
872
873   struct {
874      uint16_t ngg_emit_size; /* in dwords */
875      uint16_t hw_max_esverts;
876      uint16_t max_gsprims;
877      uint16_t max_out_verts;
878      uint16_t prim_amp_factor;
879      bool max_vert_out_per_gs_instance;
880   } ngg;
881
882   /* Shader key + LLVM IR + disassembly + statistics.
883    * Generated for debug contexts only.
884    */
885   char *shader_log;
886   size_t shader_log_size;
887
888   struct gfx9_gs_info gs_info;
889
890   /* For save precompute context registers values. */
891   union {
892      struct {
893         unsigned vgt_gsvs_ring_offset_1;
894         unsigned vgt_gsvs_ring_offset_2;
895         unsigned vgt_gsvs_ring_offset_3;
896         unsigned vgt_gsvs_ring_itemsize;
897         unsigned vgt_gs_max_vert_out;
898         unsigned vgt_gs_vert_itemsize;
899         unsigned vgt_gs_vert_itemsize_1;
900         unsigned vgt_gs_vert_itemsize_2;
901         unsigned vgt_gs_vert_itemsize_3;
902         unsigned vgt_gs_instance_cnt;
903         unsigned vgt_gs_onchip_cntl;
904         unsigned vgt_gs_max_prims_per_subgroup;
905         unsigned vgt_esgs_ring_itemsize;
906         unsigned spi_shader_pgm_rsrc3_gs;
907         unsigned spi_shader_pgm_rsrc4_gs;
908      } gs;
909
910      struct {
911         unsigned ge_max_output_per_subgroup;
912         unsigned ge_ngg_subgrp_cntl;
913         unsigned vgt_primitiveid_en;
914         unsigned vgt_gs_onchip_cntl;
915         unsigned vgt_gs_instance_cnt;
916         unsigned vgt_esgs_ring_itemsize;
917         unsigned spi_vs_out_config;
918         unsigned spi_shader_idx_format;
919         unsigned spi_shader_pos_format;
920         unsigned pa_cl_vte_cntl;
921         unsigned pa_cl_ngg_cntl;
922         unsigned vgt_gs_max_vert_out; /* for API GS */
923         unsigned ge_pc_alloc;         /* uconfig register */
924         unsigned spi_shader_pgm_rsrc3_gs;
925         unsigned spi_shader_pgm_rsrc4_gs;
926         union si_vgt_stages_key vgt_stages;
927      } ngg;
928
929      struct {
930         unsigned vgt_gs_mode;
931         unsigned vgt_primitiveid_en;
932         unsigned vgt_reuse_off;
933         unsigned spi_vs_out_config;
934         unsigned spi_shader_pos_format;
935         unsigned pa_cl_vte_cntl;
936         unsigned ge_pc_alloc; /* uconfig register */
937      } vs;
938
939      struct {
940         unsigned spi_ps_input_ena;
941         unsigned spi_ps_input_addr;
942         unsigned spi_baryc_cntl;
943         unsigned spi_ps_in_control;
944         unsigned spi_shader_z_format;
945         unsigned spi_shader_col_format;
946         unsigned cb_shader_mask;
947         unsigned db_shader_control;
948         unsigned num_interp;
949      } ps;
950   } ctx_reg;
951
952   /*For save precompute registers value */
953   unsigned vgt_tf_param;                /* VGT_TF_PARAM */
954   unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
955   unsigned pa_cl_vs_out_cntl;
956   unsigned ge_cntl;
957};
958
959struct si_shader_part {
960   struct si_shader_part *next;
961   union si_shader_part_key key;
962   struct si_shader_binary binary;
963   struct ac_shader_config config;
964};
965
966/* si_shader.c */
967void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir);
968bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
969                       struct si_shader *shader, struct util_debug_callback *debug);
970bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
971                              struct si_shader *shader, struct util_debug_callback *debug);
972void si_shader_destroy(struct si_shader *shader);
973unsigned si_shader_io_get_unique_index_patch(unsigned semantic);
974unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying);
975bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
976                             uint64_t scratch_va);
977void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
978                    struct util_debug_callback *debug, FILE *f, bool check_debug_option);
979void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
980                                        struct util_debug_callback *debug);
981void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
982const char *si_get_shader_name(const struct si_shader *shader);
983void si_shader_binary_clean(struct si_shader_binary *binary);
984struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
985unsigned si_get_ps_num_interp(struct si_shader *ps);
986
987/* si_shader_info.c */
988void si_nir_scan_shader(struct si_screen *sscreen,  const struct nir_shader *nir,
989                        struct si_shader_info *info);
990
991/* si_shader_llvm_gs.c */
992struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
993                                             struct ac_llvm_compiler *compiler,
994                                             struct si_shader_selector *gs_selector,
995                                             const struct pipe_stream_output_info *so,
996                                             struct util_debug_callback *debug);
997
998/* si_shader_nir.c */
999void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
1000void si_nir_late_opts(nir_shader *nir);
1001char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
1002
1003/* si_state_shaders.cpp */
1004unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
1005void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
1006                      struct gfx9_gs_info *out);
1007bool gfx10_is_ngg_passthrough(struct si_shader *shader);
1008
/* Inline helpers. */
1010
1011/* Return the pointer to the main shader part's pointer. */
1012static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1013                                                         const union si_shader_key *key)
1014{
1015   if (sel->stage <= MESA_SHADER_GEOMETRY) {
1016      if (key->ge.as_ls)
1017         return &sel->main_shader_part_ls;
1018      if (key->ge.as_es && key->ge.as_ngg)
1019         return &sel->main_shader_part_ngg_es;
1020      if (key->ge.as_es)
1021         return &sel->main_shader_part_es;
1022      if (key->ge.as_ngg)
1023         return &sel->main_shader_part_ngg;
1024   }
1025   return &sel->main_shader_part;
1026}
1027
1028static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
1029{
1030   return selector ? selector->info.uses_bindless_samplers : false;
1031}
1032
1033static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector)
1034{
1035   return selector ? selector->info.uses_bindless_images : false;
1036}
1037
1038static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader)
1039{
1040   if (shader->selector->stage == MESA_SHADER_VERTEX &&
1041       !shader->selector->info.base.vs.blit_sgprs_amd &&
1042       !(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES))
1043      return true;
1044
1045   return false;
1046}
1047
1048static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
1049{
1050   return gfx10_edgeflags_have_effect(shader) &&
1051          shader->selector->info.writes_edgeflag;
1052}
1053
1054static inline bool si_shader_uses_streamout(struct si_shader *shader)
1055{
1056   return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1057          shader->selector->info.enabled_streamout_buffer_mask &&
1058          !shader->key.ge.opt.remove_streamout;
1059}
1060
1061static inline bool si_shader_uses_discard(struct si_shader *shader)
1062{
1063   /* Changes to this should also update ps_modifies_zs. */
1064   return shader->selector->info.base.fs.uses_discard ||
1065          shader->key.ps.part.prolog.poly_stipple ||
1066          shader->key.ps.mono.point_smoothing ||
1067          shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1068}
1069
1070#ifdef __cplusplus
1071}
1072#endif
1073
1074#endif
1075