/*
 * Copyright © 2019 Valve Corporation.
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_shader_args.h"
#include "radv_private.h"
#include "radv_shader.h"

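/* Record the location of a user data entry: its starting user SGPR index and
 * the number of SGPRs it occupies, then advance the running index.
 */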
static void
set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs)
{
   ud_info->sgpr_idx = *sgpr_idx;
   ud_info->num_sgprs = num_sgprs;
   *sgpr_idx += num_sgprs;
}

static void
set_loc_shader(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx, uint8_t num_sgprs)
{
   struct radv_userdata_info *ud_info = &args->user_sgprs_locs.shader_data[idx];
   assert(ud_info);

   set_loc(ud_info, sgpr_idx, num_sgprs);
}

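/* Most user data entries are 32-bit pointers and take a single SGPR; the few
 * entries excluded below carry full 64-bit addresses and take two.
 */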
static void
set_loc_shader_ptr(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx)
{
   bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS &&
                             idx != AC_UD_CS_TASK_RING_OFFSETS && idx != AC_UD_CS_SBT_DESCRIPTORS &&
                             idx != AC_UD_CS_RAY_LAUNCH_SIZE_ADDR;

   set_loc_shader(args, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
}

static void
set_loc_desc(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx)
{
   struct radv_userdata_locations *locs = &args->user_sgprs_locs;
   struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx];
   assert(ud_info);

   set_loc(ud_info, sgpr_idx, 1);

   locs->descriptor_sets_enabled |= 1u << idx;
}

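/* Scratch state used while budgeting user SGPRs:
 * - inline_push_constant_mask: push constants passed directly in SGPRs
 * - inlined_all_push_consts: the push constant pointer can be omitted entirely
 * - indirect_all_descriptor_sets: sets are reached through a single pointer
 *   instead of one SGPR per set
 * - remaining_sgprs: user SGPRs still free after the mandatory allocations
 */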
struct user_sgpr_info {
   uint64_t inline_push_constant_mask;
   bool inlined_all_push_consts;
   bool indirect_all_descriptor_sets;
   uint8_t remaining_sgprs;
};

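/* User SGPRs needed for the VS-specific inputs: the vertex offset plus, when
 * needed, the vertex buffer descriptor pointer, the draw ID and the base
 * instance.
 */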
static uint8_t
count_vs_user_sgprs(const struct radv_shader_info *info)
{
   uint8_t count = 1; /* vertex offset */

   if (info->vs.vb_desc_usage_mask)
      count++;
   if (info->vs.needs_draw_id)
      count++;
   if (info->vs.needs_base_instance)
      count++;

   return count;
}

static uint8_t
count_ms_user_sgprs(const struct radv_shader_info *info)
{
   uint8_t count = 1 + 3; /* firstTask + num_work_groups[3] */

   if (info->vs.needs_draw_id)
      count++;
   if (info->cs.uses_task_rings)
      count++;

   return count;
}

static unsigned
count_ngg_sgprs(const struct radv_shader_info *info, bool has_ngg_query)
{
   unsigned count = 0;

   if (has_ngg_query)
      count += 1; /* ngg_query_state */
   if (info->has_ngg_culling)
      count += 5; /* ngg_culling_settings + 4x ngg_viewport_* */

   return count;
}

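/* Decide which push constants are loaded from user SGPRs instead of memory.
 * If everything fits and nothing needs the push constant pointer (no dynamic
 * descriptor offsets), the pointer itself is dropped, which frees one more
 * SGPR; otherwise the inline mask is trimmed from the highest bit down until
 * it fits the remaining budget.
 */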
static void
allocate_inline_push_consts(const struct radv_shader_info *info,
                            struct user_sgpr_info *user_sgpr_info)
{
   uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;

   if (!info->inline_push_constant_mask)
      return;

   uint64_t mask = info->inline_push_constant_mask;
   uint8_t num_push_consts = util_bitcount64(mask);

   /* Disable the default push constants path if all constants can be inlined and if shaders don't
    * use dynamic descriptors.
    */
   if (num_push_consts <= MIN2(remaining_sgprs + 1, AC_MAX_INLINE_PUSH_CONSTS) &&
       info->can_inline_all_push_constants && !info->loads_dynamic_offsets) {
      user_sgpr_info->inlined_all_push_consts = true;
      remaining_sgprs++;
   } else {
      /* Clamp to the maximum number of allowed inlined push constants. */
      while (num_push_consts > MIN2(remaining_sgprs, AC_MAX_INLINE_PUSH_CONSTS_WITH_INDIRECT)) {
         num_push_consts--;
         mask &= ~BITFIELD64_BIT(util_last_bit64(mask) - 1);
      }
   }

   user_sgpr_info->remaining_sgprs = remaining_sgprs - util_bitcount64(mask);
   user_sgpr_info->inline_push_constant_mask = mask;
}

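/* Compute the user SGPR budget for this stage. The budget is 16 user SGPRs,
 * or 32 for merged graphics stages on GFX9+ (see available_sgprs below);
 * mandatory entries are counted first, then the leftovers are split between
 * descriptor sets and inline push constants. If one SGPR per set doesn't fit,
 * all sets are reached indirectly through a single pointer.
 */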
static void
allocate_user_sgprs(enum amd_gfx_level gfx_level, const struct radv_shader_info *info,
                    struct radv_shader_args *args, gl_shader_stage stage, bool has_previous_stage,
                    gl_shader_stage previous_stage, bool needs_view_index, bool has_ngg_query,
                    struct user_sgpr_info *user_sgpr_info)
{
   uint8_t user_sgpr_count = 0;

   memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));

   /* 2 user sgprs will always be allocated for scratch/rings */
   user_sgpr_count += 2;

   if (stage == MESA_SHADER_TASK)
      user_sgpr_count += 2; /* task descriptors */

   /* prolog inputs */
   if (info->vs.has_prolog)
      user_sgpr_count += 2;

   switch (stage) {
   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_TASK:
      if (info->cs.uses_sbt)
         user_sgpr_count += 2;
      if (info->cs.uses_grid_size)
         user_sgpr_count += args->load_grid_size_from_user_sgpr ? 3 : 2;
      if (info->cs.uses_ray_launch_size)
         user_sgpr_count += 2;
      if (info->vs.needs_draw_id)
         user_sgpr_count += 1;
      if (info->cs.uses_task_rings)
         user_sgpr_count += 4; /* ring_entry, 2x ib_addr, ib_stride */
      break;
   case MESA_SHADER_FRAGMENT:
      /* epilog continue PC */
      if (info->ps.has_epilog)
         user_sgpr_count += 1;
      break;
   case MESA_SHADER_VERTEX:
      if (!args->is_gs_copy_shader)
         user_sgpr_count += count_vs_user_sgprs(info);
      break;
   case MESA_SHADER_TESS_CTRL:
      if (has_previous_stage) {
         if (previous_stage == MESA_SHADER_VERTEX)
            user_sgpr_count += count_vs_user_sgprs(info);
      }
      break;
   case MESA_SHADER_TESS_EVAL:
      break;
   case MESA_SHADER_GEOMETRY:
      if (has_previous_stage) {
         if (info->is_ngg)
            user_sgpr_count += count_ngg_sgprs(info, has_ngg_query);

         if (previous_stage == MESA_SHADER_VERTEX) {
            user_sgpr_count += count_vs_user_sgprs(info);
         } else if (previous_stage == MESA_SHADER_MESH) {
            user_sgpr_count += count_ms_user_sgprs(info);
         }
      }
      break;
   default:
      break;
   }

   if (needs_view_index)
      user_sgpr_count++;

   if (info->force_vrs_per_vertex)
      user_sgpr_count++;

   if (info->loads_push_constants)
      user_sgpr_count++;

   if (info->so.num_outputs)
      user_sgpr_count++;

   uint32_t available_sgprs =
      gfx_level >= GFX9 && stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_TASK ? 32 : 16;
   uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
   uint32_t num_desc_set = util_bitcount(info->desc_set_used_mask);

   if (remaining_sgprs < num_desc_set) {
      user_sgpr_info->indirect_all_descriptor_sets = true;
      user_sgpr_info->remaining_sgprs = remaining_sgprs - 1;
   } else {
      user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set;
   }

   allocate_inline_push_consts(info, user_sgpr_info);
}

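/* Declare the SGPR arguments shared by all stages: descriptor sets (one SGPR
 * each, or a single indirect pointer), the push constant pointer, inline push
 * constants and the streamout buffer descriptor pointer.
 */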
static void
declare_global_input_sgprs(const struct radv_shader_info *info,
                           const struct user_sgpr_info *user_sgpr_info,
                           struct radv_shader_args *args)
{
   /* 1 for each descriptor set */
   if (!user_sgpr_info->indirect_all_descriptor_sets) {
      uint32_t mask = info->desc_set_used_mask;

      while (mask) {
         int i = u_bit_scan(&mask);

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->descriptor_sets[i]);
      }
   } else {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR_PTR, &args->descriptor_sets[0]);
   }

   if (info->loads_push_constants && !user_sgpr_info->inlined_all_push_consts) {
      /* 1 for push constants and dynamic descriptors */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->ac.push_constants);
   }

   for (unsigned i = 0; i < util_bitcount64(user_sgpr_info->inline_push_constant_mask); i++) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.inline_push_consts[i]);
   }
   args->ac.inline_push_const_mask = user_sgpr_info->inline_push_constant_mask;

   if (info->so.num_outputs) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->streamout_buffers);
   }
}

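/* VS-specific input SGPRs, also declared for merged stages whose first half
 * is a VS: prolog inputs, vertex buffers, base vertex, draw ID and base
 * instance.
 */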
static void
declare_vs_specific_input_sgprs(const struct radv_shader_info *info, struct radv_shader_args *args,
                                gl_shader_stage stage, bool has_previous_stage,
                                gl_shader_stage previous_stage)
{
   if (info->vs.has_prolog)
      ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->prolog_inputs);

   if (!args->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX ||
                                    (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
      if (info->vs.vb_desc_usage_mask) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);
      }
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
      if (info->vs.needs_draw_id) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
      }
      if (info->vs.needs_base_instance) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
      }
   }
}

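/* VS input VGPRs. The hardware-provided layout differs per generation and per
 * configuration (LS vs. ES/NGG vs. plain VS), hence the NULL placeholder
 * slots for unused or user VGPRs.
 */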
static void
declare_vs_input_vgprs(enum amd_gfx_level gfx_level, const struct radv_shader_info *info,
                       struct radv_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);
   if (!args->is_gs_copy_shader) {
      if (info->vs.as_ls) {

         if (gfx_level >= GFX11) {
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         } else if (gfx_level >= GFX10) {
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         } else {
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
         }
      } else {
         if (gfx_level >= GFX10) {
            if (info->is_ngg) {
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
            } else {
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
            }
         } else {
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
            ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
         }
      }
   }

   if (info->vs.dynamic_inputs) {
      assert(info->vs.use_per_attribute_vb_descs);
      unsigned num_attributes = util_last_bit(info->vs.vb_desc_usage_mask);
      for (unsigned i = 0; i < num_attributes; i++)
         ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_INT, &args->vs_inputs[i]);
      /* Ensure the main shader doesn't use fewer VGPRs than the prolog. The prolog requires one
       * VGPR more than the number of shader arguments in the case of non-trivial divisors on GFX8.
       */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
   }
}

static void
declare_streamout_sgprs(const struct radv_shader_info *info, struct radv_shader_args *args,
                        gl_shader_stage stage)
{
   int i;

   /* Streamout SGPRs. */
   if (info->so.num_outputs) {
      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);
   } else if (stage == MESA_SHADER_TESS_EVAL) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }

   /* A streamout buffer offset is loaded if the stride is non-zero. */
   for (i = 0; i < 4; i++) {
      if (!info->so.strides[i])
         continue;

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
   }
}

static void
declare_tes_input_vgprs(struct radv_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}

static void
declare_ms_input_sgprs(const struct radv_shader_info *info, struct radv_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
   if (info->vs.needs_draw_id) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
   }
   if (info->cs.uses_task_rings) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.task_ring_entry);
   }
}

static void
declare_ms_input_vgprs(struct radv_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* instance_id */
}

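/* PS input VGPRs in SPI_PS_INPUT_ADDR order. Every possible input is declared
 * first; when remap_spi_ps_input is set, inputs not enabled in spi_ps_input
 * are skipped and the rest are compacted to match the registers the hardware
 * actually loads.
 */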
static void
declare_ps_input_vgprs(const struct radv_shader_info *info, struct radv_shader_args *args)
{
   unsigned spi_ps_input = info->ps.spi_ps_input;

   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.pull_model);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); /* line stipple tex */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0]);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1]);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2]);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3]);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.sample_coverage);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* fixed pt */

   if (args->remap_spi_ps_input) {
      /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr itself and then
       * communicates the results back via the ELF binary. Mirror what LLVM does by re-mapping the
       * VGPR arguments here.
       */
      unsigned arg_count = 0;
      for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->ac.arg_count; i++) {
         if (args->ac.args[i].file != AC_ARG_VGPR) {
            arg_count++;
            continue;
         }

         if (!(spi_ps_input & (1 << vgpr_arg))) {
            args->ac.args[i].skip = true;
         } else {
            args->ac.args[i].offset = vgpr_reg;
            vgpr_reg += args->ac.args[i].size;
            arg_count++;
         }
         vgpr_arg++;
      }
   }

   if (info->ps.has_epilog) {
      /* FIXME: Ensure the main shader doesn't have fewer VGPRs than the epilog. */
      for (unsigned i = 0; i < MAX_RTS; i++)
         ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_INT, NULL);
   }
}

static void
declare_ngg_sgprs(const struct radv_shader_info *info, struct radv_shader_args *args,
                  bool has_ngg_query)
{
   if (has_ngg_query)
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_query_state);

   if (info->has_ngg_culling) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_culling_settings);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_scale[0]);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_scale[1]);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_translate[0]);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_translate[1]);
   }
}

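/* Record user SGPR locations for the global arguments, in the same order they
 * were declared by declare_global_input_sgprs().
 */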
static void
set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info *user_sgpr_info,
                      uint8_t *user_sgpr_idx)
{
   if (!user_sgpr_info->indirect_all_descriptor_sets) {
      for (unsigned i = 0; i < ARRAY_SIZE(args->descriptor_sets); i++) {
         if (args->descriptor_sets[i].used)
            set_loc_desc(args, i, user_sgpr_idx);
      }
   } else {
      set_loc_shader_ptr(args, AC_UD_INDIRECT_DESCRIPTOR_SETS, user_sgpr_idx);
   }

   if (args->ac.push_constants.used) {
      set_loc_shader_ptr(args, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
   }

   if (user_sgpr_info->inline_push_constant_mask) {
      set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
                     util_bitcount64(user_sgpr_info->inline_push_constant_mask));
   }

   if (args->streamout_buffers.used) {
      set_loc_shader_ptr(args, AC_UD_STREAMOUT_BUFFERS, user_sgpr_idx);
   }
}

static void
set_vs_specific_input_locs(struct radv_shader_args *args, gl_shader_stage stage,
                           bool has_previous_stage, gl_shader_stage previous_stage,
                           uint8_t *user_sgpr_idx)
{
   if (args->prolog_inputs.used)
      set_loc_shader(args, AC_UD_VS_PROLOG_INPUTS, user_sgpr_idx, 2);

   if (!args->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX ||
                                    (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
      if (args->ac.vertex_buffers.used) {
         set_loc_shader_ptr(args, AC_UD_VS_VERTEX_BUFFERS, user_sgpr_idx);
      }

      unsigned vs_num = args->ac.base_vertex.used + args->ac.draw_id.used +
                        args->ac.start_instance.used;
      set_loc_shader(args, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, vs_num);
   }
}

static void
set_ms_input_locs(struct radv_shader_args *args, uint8_t *user_sgpr_idx)
{
   unsigned vs_num =
      args->ac.base_vertex.used + 3 * args->ac.num_work_groups.used + args->ac.draw_id.used;
   set_loc_shader(args, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, vs_num);

   if (args->ac.task_ring_entry.used)
      set_loc_shader(args, AC_UD_TASK_RING_ENTRY, user_sgpr_idx, 1);
}

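/* Declare the full argument list (user SGPRs, system SGPRs and VGPRs) for one
 * hardware shader stage and record the final user SGPR locations, which the
 * driver uses later when emitting user data. Declaration order and location
 * assignment must stay in sync.
 */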
void
radv_declare_shader_args(enum amd_gfx_level gfx_level, const struct radv_pipeline_key *key,
                         const struct radv_shader_info *info, gl_shader_stage stage,
                         bool has_previous_stage, gl_shader_stage previous_stage,
                         struct radv_shader_args *args)
{
   struct user_sgpr_info user_sgpr_info;
   bool needs_view_index = info->uses_view_index;
   bool has_ngg_query = stage == MESA_SHADER_GEOMETRY || key->primitives_generated_query;

   if (gfx_level >= GFX10 && info->is_ngg && stage != MESA_SHADER_GEOMETRY) {
      /* Handle all NGG shaders as GS to simplify the code here. */
      previous_stage = stage;
      stage = MESA_SHADER_GEOMETRY;
      has_previous_stage = true;
   }

   for (int i = 0; i < MAX_SETS; i++)
      args->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
   for (int i = 0; i < AC_UD_MAX_UD; i++)
      args->user_sgprs_locs.shader_data[i].sgpr_idx = -1;

   allocate_user_sgprs(gfx_level, info, args, stage, has_previous_stage, previous_stage,
                       needs_view_index, has_ngg_query, &user_sgpr_info);

   if (args->explicit_scratch_args) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ring_offsets);
   }
   if (stage == MESA_SHADER_TASK) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->task_ring_offsets);
   }
   /* To ensure prologs match the main VS, VS-specific input SGPRs have to be placed before other
    * SGPRs.
    */

   switch (stage) {
   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_TASK:
      declare_global_input_sgprs(info, &user_sgpr_info, args);

      if (info->cs.uses_sbt) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_PTR, &args->ac.sbt_descriptors);
      }

      if (info->cs.uses_grid_size) {
         if (args->load_grid_size_from_user_sgpr)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
         else
            ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_PTR, &args->ac.num_work_groups);
      }

      if (info->cs.uses_ray_launch_size) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_PTR, &args->ac.ray_launch_size_addr);
      }

      if (info->vs.needs_draw_id) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
      }

      if (info->cs.uses_task_rings) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.task_ring_entry);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->task_ib_addr);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->task_ib_stride);
      }

      for (int i = 0; i < 3; i++) {
         if (info->cs.uses_block_id[i]) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
         }
      }

      if (info->cs.uses_local_invocation_idx) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);
      }

      if (args->explicit_scratch_args && gfx_level < GFX11) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }

      if (gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids);
      else
         ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.local_invocation_ids);
      break;
   case MESA_SHADER_VERTEX:
      /* NGG is handled by the GS case */
      assert(!info->is_ngg);

      declare_vs_specific_input_sgprs(info, args, stage, has_previous_stage, previous_stage);

      declare_global_input_sgprs(info, &user_sgpr_info, args);

      if (needs_view_index) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index);
      }

      if (info->force_vrs_per_vertex) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.force_vrs_rates);
      }

      if (info->vs.as_es) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
      } else if (info->vs.as_ls) {
         /* no extra parameters */
      } else {
         declare_streamout_sgprs(info, args, stage);
      }

      if (args->explicit_scratch_args) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }

      declare_vs_input_vgprs(gfx_level, info, args);
      break;
   case MESA_SHADER_TESS_CTRL:
      if (has_previous_stage) {
         // First 6 system regs
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);

         if (gfx_level >= GFX11) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
         } else {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown

         declare_vs_specific_input_sgprs(info, args, stage, has_previous_stage, previous_stage);

         declare_global_input_sgprs(info, &user_sgpr_info, args);

         if (needs_view_index) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index);
         }

         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);

         declare_vs_input_vgprs(gfx_level, info, args);
      } else {
         declare_global_input_sgprs(info, &user_sgpr_info, args);

         if (needs_view_index) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index);
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
         if (args->explicit_scratch_args) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
         }
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
      }
      break;
   case MESA_SHADER_TESS_EVAL:
      /* NGG is handled by the GS case */
      assert(!info->is_ngg);

      declare_global_input_sgprs(info, &user_sgpr_info, args);

      if (needs_view_index)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index);

      if (info->tes.as_es) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
      } else {
         declare_streamout_sgprs(info, args, stage);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      }
      if (args->explicit_scratch_args) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }
      declare_tes_input_vgprs(args);
      break;
   case MESA_SHADER_GEOMETRY:
      if (has_previous_stage) {
         // First 6 system regs
         if (info->is_ngg) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
         } else {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);

         if (gfx_level < GFX11) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown

         if (previous_stage == MESA_SHADER_VERTEX) {
            declare_vs_specific_input_sgprs(info, args, stage, has_previous_stage, previous_stage);
         } else if (previous_stage == MESA_SHADER_MESH) {
            declare_ms_input_sgprs(info, args);
         }

         declare_global_input_sgprs(info, &user_sgpr_info, args);

         if (needs_view_index) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index);
         }

         if (info->force_vrs_per_vertex) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.force_vrs_rates);
         }

         if (info->is_ngg) {
            declare_ngg_sgprs(info, args, has_ngg_query);
         }

         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);

         if (previous_stage == MESA_SHADER_VERTEX) {
            declare_vs_input_vgprs(gfx_level, info, args);
         } else if (previous_stage == MESA_SHADER_TESS_EVAL) {
            declare_tes_input_vgprs(args);
         } else if (previous_stage == MESA_SHADER_MESH) {
            declare_ms_input_vgprs(args);
         }
      } else {
         declare_global_input_sgprs(info, &user_sgpr_info, args);

         if (needs_view_index) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index);
         }

         if (info->force_vrs_per_vertex) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.force_vrs_rates);
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);
         if (args->explicit_scratch_args) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
         }
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
      }
      break;
   case MESA_SHADER_FRAGMENT:
      declare_global_input_sgprs(info, &user_sgpr_info, args);

      if (info->ps.has_epilog) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ps_epilog_pc);
      }

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask);
      if (args->explicit_scratch_args && gfx_level < GFX11) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }

      declare_ps_input_vgprs(info, args);
      break;
   default:
      unreachable("Shader stage not implemented");
   }

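   /* All arguments are declared; now assign each user data entry its SGPR
    * location, following the declaration order above.
    */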
   uint8_t user_sgpr_idx = 0;

   set_loc_shader_ptr(args, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx);
   if (stage == MESA_SHADER_TASK) {
      set_loc_shader_ptr(args, AC_UD_CS_TASK_RING_OFFSETS, &user_sgpr_idx);
   }

   /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
    * the rw_buffers at s0/s1). With user SGPR0 = s8, let's restart the count from 0. */
   if (has_previous_stage)
      user_sgpr_idx = 0;

   if (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))
      set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx);
   else if (has_previous_stage && previous_stage == MESA_SHADER_MESH)
      set_ms_input_locs(args, &user_sgpr_idx);

   set_global_input_locs(args, &user_sgpr_info, &user_sgpr_idx);

   switch (stage) {
   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_TASK:
      if (args->ac.sbt_descriptors.used) {
         set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx);
      }
      if (args->ac.num_work_groups.used) {
         set_loc_shader(args, AC_UD_CS_GRID_SIZE, &user_sgpr_idx,
                        args->load_grid_size_from_user_sgpr ? 3 : 2);
      }
      if (args->ac.ray_launch_size_addr.used) {
         set_loc_shader_ptr(args, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR, &user_sgpr_idx);
      }
      if (args->ac.draw_id.used) {
         set_loc_shader(args, AC_UD_CS_TASK_DRAW_ID, &user_sgpr_idx, 1);
      }
      if (args->ac.task_ring_entry.used) {
         set_loc_shader(args, AC_UD_TASK_RING_ENTRY, &user_sgpr_idx, 1);
      }
      if (args->task_ib_addr.used) {
         assert(args->task_ib_stride.used);
         set_loc_shader(args, AC_UD_CS_TASK_IB, &user_sgpr_idx, 3);
      }
      break;
   case MESA_SHADER_VERTEX:
      if (args->ac.view_index.used)
         set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
      if (args->ac.force_vrs_rates.used)
         set_loc_shader(args, AC_UD_FORCE_VRS_RATES, &user_sgpr_idx, 1);
      break;
   case MESA_SHADER_TESS_CTRL:
      if (args->ac.view_index.used)
         set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
      break;
   case MESA_SHADER_TESS_EVAL:
      if (args->ac.view_index.used)
         set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
      break;
   case MESA_SHADER_GEOMETRY:
      if (args->ac.view_index.used)
         set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);

      if (args->ac.force_vrs_rates.used)
         set_loc_shader(args, AC_UD_FORCE_VRS_RATES, &user_sgpr_idx, 1);

      if (args->ngg_query_state.used) {
         set_loc_shader(args, AC_UD_NGG_QUERY_STATE, &user_sgpr_idx, 1);
      }

      if (args->ngg_culling_settings.used) {
         set_loc_shader(args, AC_UD_NGG_CULLING_SETTINGS, &user_sgpr_idx, 1);
      }

      if (args->ngg_viewport_scale[0].used) {
         assert(args->ngg_viewport_scale[1].used &&
                args->ngg_viewport_translate[0].used &&
                args->ngg_viewport_translate[1].used);
         set_loc_shader(args, AC_UD_NGG_VIEWPORT, &user_sgpr_idx, 4);
      }
      break;
   case MESA_SHADER_FRAGMENT:
      if (args->ps_epilog_pc.used)
         set_loc_shader(args, AC_UD_PS_EPILOG_PC, &user_sgpr_idx, 1);
      break;
   default:
      unreachable("Shader stage not implemented");
   }

   args->num_user_sgprs = user_sgpr_idx;
}

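/* Declare the arguments of a PS epilog: ring offsets, the scratch offset
 * (pre-GFX11) and one 4-component VGPR argument per color export left in
 * place by the main shader. Render targets with format SPI_SHADER_ZERO
 * export nothing and get no argument.
 */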
void
radv_declare_ps_epilog_args(enum amd_gfx_level gfx_level, const struct radv_ps_epilog_key *key,
                            struct radv_shader_args *args)
{
   unsigned num_inputs = 0;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ring_offsets);
   if (gfx_level < GFX11)
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

   /* Declare VGPR arguments for color exports. */
   for (unsigned i = 0; i < MAX_RTS; i++) {
      unsigned col_format = (key->spi_shader_col_format >> (i * 4)) & 0xf;

      if (col_format == V_028714_SPI_SHADER_ZERO)
         continue;

      ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, &args->ps_epilog_inputs[num_inputs]);
      num_inputs++;
   }
}