1/*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28#include "radv_cs.h"
29#include "radv_debug.h"
30#include "radv_meta.h"
31#include "radv_private.h"
32#include "radv_radeon_winsys.h"
33#include "radv_shader.h"
34#include "sid.h"
35#include "vk_format.h"
36#include "vk_util.h"
37#include "vk_enum_defines.h"
38#include "vk_common_entrypoints.h"
39
40#include "ac_debug.h"
41#include "ac_shader_args.h"
42
43#include "util/fast_idiv_by_const.h"
44
45enum {
46   RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
47   RADV_PREFETCH_VS = (1 << 1),
48   RADV_PREFETCH_TCS = (1 << 2),
49   RADV_PREFETCH_TES = (1 << 3),
50   RADV_PREFETCH_GS = (1 << 4),
51   RADV_PREFETCH_PS = (1 << 5),
52   RADV_PREFETCH_MS = (1 << 6),
53   RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
54                            RADV_PREFETCH_GS | RADV_PREFETCH_PS | RADV_PREFETCH_MS)
55};
56
57static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
58                                         struct radv_image *image, VkImageLayout src_layout,
59                                         bool src_render_loop, VkImageLayout dst_layout,
60                                         bool dst_render_loop, uint32_t src_family_index,
61                                         uint32_t dst_family_index, const VkImageSubresourceRange *range,
62                                         struct radv_sample_locations_state *sample_locs);
63
64static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);
65
66const struct radv_dynamic_state default_dynamic_state = {
67   .viewport =
68      {
69         .count = 0,
70      },
71   .scissor =
72      {
73         .count = 0,
74      },
75   .line_width = 1.0f,
76   .depth_bias =
77      {
78         .bias = 0.0f,
79         .clamp = 0.0f,
80         .slope = 0.0f,
81      },
82   .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
83   .depth_bounds =
84      {
85         .min = 0.0f,
86         .max = 1.0f,
87      },
88   .stencil_compare_mask =
89      {
90         .front = ~0u,
91         .back = ~0u,
92      },
93   .stencil_write_mask =
94      {
95         .front = ~0u,
96         .back = ~0u,
97      },
98   .stencil_reference =
99      {
100         .front = 0u,
101         .back = 0u,
102      },
103   .line_stipple =
104      {
105         .factor = 0u,
106         .pattern = 0u,
107      },
108   .cull_mode = 0u,
109   .front_face = 0u,
110   .primitive_topology = 0u,
111   .fragment_shading_rate =
112      {
113         .size = {1u, 1u},
114         .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
115                          VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
116      },
117   .depth_bias_enable = 0u,
118   .primitive_restart_enable = 0u,
119   .rasterizer_discard_enable = 0u,
120   .logic_op = 0u,
121   .color_write_enable = 0xffffffffu,
122};
123
124static void
125radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
126{
127   struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
128   uint64_t copy_mask = src->mask;
129   uint64_t dest_mask = 0;
130
131   dest->discard_rectangle.count = src->discard_rectangle.count;
132   dest->sample_location.count = src->sample_location.count;
133
134   if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
135      if (dest->viewport.count != src->viewport.count) {
136         dest->viewport.count = src->viewport.count;
137         dest_mask |= RADV_DYNAMIC_VIEWPORT;
138      }
139
140      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
141                 src->viewport.count * sizeof(VkViewport))) {
142         typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
143         typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
144         dest_mask |= RADV_DYNAMIC_VIEWPORT;
145      }
146   }
147
148   if (copy_mask & RADV_DYNAMIC_SCISSOR) {
149      if (dest->scissor.count != src->scissor.count) {
150         dest->scissor.count = src->scissor.count;
151         dest_mask |= RADV_DYNAMIC_SCISSOR;
152      }
153
154      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
155                 src->scissor.count * sizeof(VkRect2D))) {
156         typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
157         dest_mask |= RADV_DYNAMIC_SCISSOR;
158      }
159   }
160
161   if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
162      if (dest->line_width != src->line_width) {
163         dest->line_width = src->line_width;
164         dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
165      }
166   }
167
168   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
169      if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
170         dest->depth_bias = src->depth_bias;
171         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
172      }
173   }
174
175   if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
176      if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
177         typed_memcpy(dest->blend_constants, src->blend_constants, 4);
178         dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
179      }
180   }
181
182   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
183      if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
184         dest->depth_bounds = src->depth_bounds;
185         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
186      }
187   }
188
189   if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
190      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
191                 sizeof(src->stencil_compare_mask))) {
192         dest->stencil_compare_mask = src->stencil_compare_mask;
193         dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
194      }
195   }
196
197   if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
198      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
199                 sizeof(src->stencil_write_mask))) {
200         dest->stencil_write_mask = src->stencil_write_mask;
201         dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
202      }
203   }
204
205   if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
206      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
207                 sizeof(src->stencil_reference))) {
208         dest->stencil_reference = src->stencil_reference;
209         dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
210      }
211   }
212
213   if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
214      if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
215                 src->discard_rectangle.count * sizeof(VkRect2D))) {
216         typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
217                      src->discard_rectangle.count);
218         dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
219      }
220   }
221
222   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
223      if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
224          dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
225          dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
226          memcmp(&dest->sample_location.locations, &src->sample_location.locations,
227                 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
228         dest->sample_location.per_pixel = src->sample_location.per_pixel;
229         dest->sample_location.grid_size = src->sample_location.grid_size;
230         typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
231                      src->sample_location.count);
232         dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
233      }
234   }
235
236   if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
237      if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
238         dest->line_stipple = src->line_stipple;
239         dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
240      }
241   }
242
243   if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
244      if (dest->cull_mode != src->cull_mode) {
245         dest->cull_mode = src->cull_mode;
246         dest_mask |= RADV_DYNAMIC_CULL_MODE;
247      }
248   }
249
250   if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
251      if (dest->front_face != src->front_face) {
252         dest->front_face = src->front_face;
253         dest_mask |= RADV_DYNAMIC_FRONT_FACE;
254      }
255   }
256
257   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
258      if (dest->primitive_topology != src->primitive_topology) {
259         dest->primitive_topology = src->primitive_topology;
260         dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
261      }
262   }
263
264   if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
265      if (dest->depth_test_enable != src->depth_test_enable) {
266         dest->depth_test_enable = src->depth_test_enable;
267         dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
268      }
269   }
270
271   if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
272      if (dest->depth_write_enable != src->depth_write_enable) {
273         dest->depth_write_enable = src->depth_write_enable;
274         dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
275      }
276   }
277
278   if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
279      if (dest->depth_compare_op != src->depth_compare_op) {
280         dest->depth_compare_op = src->depth_compare_op;
281         dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
282      }
283   }
284
285   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
286      if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
287         dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
288         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
289      }
290   }
291
292   if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
293      if (dest->stencil_test_enable != src->stencil_test_enable) {
294         dest->stencil_test_enable = src->stencil_test_enable;
295         dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
296      }
297   }
298
299   if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
300      if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
301         dest->stencil_op = src->stencil_op;
302         dest_mask |= RADV_DYNAMIC_STENCIL_OP;
303      }
304   }
305
306   if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
307      if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
308                 sizeof(src->fragment_shading_rate))) {
309         dest->fragment_shading_rate = src->fragment_shading_rate;
310         dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
311      }
312   }
313
314   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
315      if (dest->depth_bias_enable != src->depth_bias_enable) {
316         dest->depth_bias_enable = src->depth_bias_enable;
317         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
318      }
319   }
320
321   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
322      if (dest->primitive_restart_enable != src->primitive_restart_enable) {
323         dest->primitive_restart_enable = src->primitive_restart_enable;
324         dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
325      }
326   }
327
328   if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
329      if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
330         dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
331         dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
332      }
333   }
334
335   if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
336      if (dest->logic_op != src->logic_op) {
337         dest->logic_op = src->logic_op;
338         dest_mask |= RADV_DYNAMIC_LOGIC_OP;
339      }
340   }
341
342   if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
343      if (dest->color_write_enable != src->color_write_enable) {
344         dest->color_write_enable = src->color_write_enable;
345         dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
346      }
347   }
348
349   cmd_buffer->state.dirty |= dest_mask;
350}
351
352bool
353radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
354{
355   return cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
356          cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
357}
358
359enum amd_ip_type
360radv_queue_family_to_ring(struct radv_physical_device *physical_device,
361                          enum radv_queue_family f)
362{
363   switch (f) {
364   case RADV_QUEUE_GENERAL:
365      return AMD_IP_GFX;
366   case RADV_QUEUE_COMPUTE:
367      return AMD_IP_COMPUTE;
368   case RADV_QUEUE_TRANSFER:
369      return AMD_IP_SDMA;
370   default:
371      unreachable("Unknown queue family");
372   }
373}
374
375static void
376radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
377                            unsigned count, const uint32_t *data)
378{
379   struct radeon_cmdbuf *cs = cmd_buffer->cs;
380
381   radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
382
383   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
384   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
385   radeon_emit(cs, va);
386   radeon_emit(cs, va >> 32);
387   radeon_emit_array(cs, data, count);
388}
389
390static void
391radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
392                     unsigned size)
393{
394   uint32_t *zeroes = alloca(size);
395   memset(zeroes, 0, size);
396   radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
397}
398
399static void
400radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
401{
402   list_del(&cmd_buffer->pool_link);
403
404   util_dynarray_fini(&cmd_buffer->cached_vertex_formats);
405
406   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
407   {
408      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
409      list_del(&up->list);
410      free(up);
411   }
412
413   if (cmd_buffer->upload.upload_bo)
414      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);
415
416   if (cmd_buffer->state.own_render_pass) {
417      radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
418                             radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
419      cmd_buffer->state.own_render_pass = false;
420   }
421
422   if (cmd_buffer->cs)
423      cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
424   if (cmd_buffer->ace_internal.cs)
425      cmd_buffer->device->ws->cs_destroy(cmd_buffer->ace_internal.cs);
426
427   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
428      struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
429      free(set->mapped_ptr);
430      if (set->layout)
431         vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->layout->vk);
432      vk_object_base_finish(&set->base);
433   }
434
435   vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);
436
437   vk_command_buffer_finish(&cmd_buffer->vk);
438   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
439}
440
441static VkResult
442radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
443                       VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
444{
445   struct radv_cmd_buffer *cmd_buffer;
446   unsigned ring;
447   cmd_buffer = vk_zalloc(&pool->vk.alloc, sizeof(*cmd_buffer), 8,
448                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
449   if (cmd_buffer == NULL)
450      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
451
452   VkResult result =
453      vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, level);
454   if (result != VK_SUCCESS) {
455      vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
456      return result;
457   }
458
459   cmd_buffer->device = device;
460   cmd_buffer->pool = pool;
461
462   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
463   cmd_buffer->qf = vk_queue_to_radv(device->physical_device, pool->vk.queue_family_index);
464
465   ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
466
467   cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
468   if (!cmd_buffer->cs) {
469      radv_destroy_cmd_buffer(cmd_buffer);
470      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
471   }
472
473   vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
474                       VK_OBJECT_TYPE_DESCRIPTOR_SET);
475
476   util_dynarray_init(&cmd_buffer->cached_vertex_formats, NULL);
477
478   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
479      vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
480                          VK_OBJECT_TYPE_DESCRIPTOR_SET);
481
482   *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
483
484   list_inithead(&cmd_buffer->upload.list);
485
486   return VK_SUCCESS;
487}
488
489static VkResult
490radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
491{
492   vk_command_buffer_reset(&cmd_buffer->vk);
493
494   cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
495   if (cmd_buffer->ace_internal.cs)
496      cmd_buffer->device->ws->cs_reset(cmd_buffer->ace_internal.cs);
497
498   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
499   {
500      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
501      list_del(&up->list);
502      free(up);
503   }
504
505   if (cmd_buffer->state.own_render_pass) {
506      radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
507                             radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
508      cmd_buffer->state.own_render_pass = false;
509   }
510
511   cmd_buffer->push_constant_stages = 0;
512   cmd_buffer->scratch_size_per_wave_needed = 0;
513   cmd_buffer->scratch_waves_wanted = 0;
514   cmd_buffer->compute_scratch_size_per_wave_needed = 0;
515   cmd_buffer->compute_scratch_waves_wanted = 0;
516   cmd_buffer->esgs_ring_size_needed = 0;
517   cmd_buffer->gsvs_ring_size_needed = 0;
518   cmd_buffer->tess_rings_needed = false;
519   cmd_buffer->task_rings_needed = false;
520   cmd_buffer->mesh_scratch_ring_needed = false;
521   cmd_buffer->gds_needed = false;
522   cmd_buffer->gds_oa_needed = false;
523   cmd_buffer->sample_positions_needed = false;
524   cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
525   cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
526   cmd_buffer->ace_internal.sem.va = 0;
527
528   if (cmd_buffer->upload.upload_bo)
529      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
530   cmd_buffer->upload.offset = 0;
531
532   cmd_buffer->record_result = VK_SUCCESS;
533
534   memset(cmd_buffer->vertex_binding_buffers, 0, sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
535   cmd_buffer->used_vertex_bindings = 0;
536
537   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
538      cmd_buffer->descriptors[i].dirty = 0;
539      cmd_buffer->descriptors[i].valid = 0;
540      cmd_buffer->descriptors[i].push_dirty = false;
541   }
542
543   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
544      uint32_t pred_value = 0;
545      uint32_t pred_offset;
546      if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
547         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
548
549      cmd_buffer->mec_inv_pred_emitted = false;
550      cmd_buffer->mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
551   }
552
553   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
554       cmd_buffer->qf == RADV_QUEUE_GENERAL) {
555      unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
556      unsigned fence_offset, eop_bug_offset;
557      void *fence_ptr;
558
559      radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
560      memset(fence_ptr, 0, 8);
561
562      cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
563      cmd_buffer->gfx9_fence_va += fence_offset;
564
565      radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);
566
567      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
568         /* Allocate a buffer for the EOP bug on GFX9. */
569         radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
570         memset(fence_ptr, 0, 16 * num_db);
571         cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
572         cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
573
574         radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
575      }
576   }
577
578   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
579
580   return cmd_buffer->record_result;
581}
582
583static bool
584radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
585{
586   uint64_t new_size;
587   struct radeon_winsys_bo *bo = NULL;
588   struct radv_cmd_buffer_upload *upload;
589   struct radv_device *device = cmd_buffer->device;
590
591   new_size = MAX2(min_needed, 16 * 1024);
592   new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
593
594   VkResult result =
595      device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
596                                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
597                                   RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
598                                RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);
599
600   if (result != VK_SUCCESS) {
601      cmd_buffer->record_result = result;
602      return false;
603   }
604
605   radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
606   if (cmd_buffer->upload.upload_bo) {
607      upload = malloc(sizeof(*upload));
608
609      if (!upload) {
610         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
611         device->ws->buffer_destroy(device->ws, bo);
612         return false;
613      }
614
615      memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
616      list_add(&upload->list, &cmd_buffer->upload.list);
617   }
618
619   cmd_buffer->upload.upload_bo = bo;
620   cmd_buffer->upload.size = new_size;
621   cmd_buffer->upload.offset = 0;
622   cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
623
624   if (!cmd_buffer->upload.map) {
625      cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
626      return false;
627   }
628
629   return true;
630}
631
632bool
633radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
634                             unsigned *out_offset, void **ptr)
635{
636   assert(size % 4 == 0);
637
638   struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
639
640   /* Align to the scalar cache line size if it results in this allocation
641    * being placed in less of them.
642    */
643   unsigned offset = cmd_buffer->upload.offset;
644   unsigned line_size = rad_info->gfx_level >= GFX10 ? 64 : 32;
645   unsigned gap = align(offset, line_size) - offset;
646   if ((size & (line_size - 1)) > gap)
647      offset = align(offset, line_size);
648
649   if (offset + size > cmd_buffer->upload.size) {
650      if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
651         return false;
652      offset = 0;
653   }
654
655   *out_offset = offset;
656   *ptr = cmd_buffer->upload.map + offset;
657
658   cmd_buffer->upload.offset = offset + size;
659   return true;
660}
661
662bool
663radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
664                            unsigned *out_offset)
665{
666   uint8_t *ptr;
667
668   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
669      return false;
670   assert(ptr);
671
672   memcpy(ptr, data, size);
673   return true;
674}
675
676void
677radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
678{
679   struct radv_device *device = cmd_buffer->device;
680   struct radeon_cmdbuf *cs = cmd_buffer->cs;
681   uint64_t va;
682
683   va = radv_buffer_get_va(device->trace_bo);
684   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
685      va += 4;
686
687   ++cmd_buffer->state.trace_id;
688   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);
689
690   radeon_check_space(cmd_buffer->device->ws, cs, 2);
691
692   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
693   radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
694}
695
696static void
697radv_ace_internal_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
698                          VkPipelineStageFlags2 dst_stage_mask)
699{
700   /* Update flush bits from the main cmdbuf, except the stage flush. */
701   cmd_buffer->ace_internal.flush_bits |=
702      cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
703
704   /* Add stage flush only when necessary. */
705   if (src_stage_mask &
706       (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV | VK_PIPELINE_STAGE_2_TRANSFER_BIT |
707        VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
708      cmd_buffer->ace_internal.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
709
710   /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
711   if (src_stage_mask &
712       (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
713        VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
714        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
715      dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV : 0;
716
717   /* Increment the GFX/ACE semaphore when task shaders are blocked. */
718   if (dst_stage_mask &
719       (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
720        VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV))
721      cmd_buffer->ace_internal.sem.gfx2ace_value++;
722}
723
724static void
725radv_ace_internal_cache_flush(struct radv_cmd_buffer *cmd_buffer)
726{
727   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
728   const uint32_t flush_bits = cmd_buffer->ace_internal.flush_bits;
729   enum rgp_flush_bits sqtt_flush_bits = 0;
730
731   si_cs_emit_cache_flush(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
732                          true, flush_bits, &sqtt_flush_bits, 0);
733
734   cmd_buffer->ace_internal.flush_bits = 0;
735}
736
737static uint64_t
738radv_ace_internal_sem_create(struct radv_cmd_buffer *cmd_buffer)
739{
740   /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, ie. ACE waits for GFX)
741    * DWORD 1: ACE->GFX semaphore
742    */
743   uint64_t sem_init = 0;
744   uint32_t va_off = 0;
745   if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
746      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
747      return 0;
748   }
749
750   return radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
751}
752
753static bool
754radv_ace_internal_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
755{
756   return cmd_buffer->ace_internal.sem.gfx2ace_value !=
757          cmd_buffer->ace_internal.sem.emitted_gfx2ace_value;
758}
759
760ALWAYS_INLINE static bool
761radv_flush_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
762{
763   if (!radv_ace_internal_sem_dirty(cmd_buffer))
764      return false;
765
766   if (!cmd_buffer->ace_internal.sem.va) {
767      cmd_buffer->ace_internal.sem.va = radv_ace_internal_sem_create(cmd_buffer);
768      if (!cmd_buffer->ace_internal.sem.va)
769         return false;
770   }
771
772   /* GFX writes a value to the semaphore which ACE can wait for.*/
773   si_cs_emit_write_event_eop(
774      cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
775      radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
776      EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->ace_internal.sem.va,
777      cmd_buffer->ace_internal.sem.gfx2ace_value, cmd_buffer->gfx9_eop_bug_va);
778
779   cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = cmd_buffer->ace_internal.sem.gfx2ace_value;
780   return true;
781}
782
783ALWAYS_INLINE static void
784radv_wait_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
785{
786   assert(cmd_buffer->ace_internal.sem.va);
787   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
788   radeon_check_space(cmd_buffer->device->ws, ace_cs, 7);
789
790   /* ACE waits for the semaphore which GFX wrote. */
791   radv_cp_wait_mem(ace_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->ace_internal.sem.va,
792                    cmd_buffer->ace_internal.sem.gfx2ace_value, 0xffffffff);
793}
794
795static struct radeon_cmdbuf *
796radv_ace_internal_create(struct radv_cmd_buffer *cmd_buffer)
797{
798   assert(!cmd_buffer->ace_internal.cs);
799   struct radv_device *device = cmd_buffer->device;
800   struct radeon_cmdbuf *ace_cs = device->ws->cs_create(device->ws, AMD_IP_COMPUTE);
801
802   if (!ace_cs) {
803      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
804   }
805
806   return ace_cs;
807}
808
809static VkResult
810radv_ace_internal_finalize(struct radv_cmd_buffer *cmd_buffer)
811{
812   assert(cmd_buffer->ace_internal.cs);
813   struct radv_device *device = cmd_buffer->device;
814   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
815
816   /* Emit pending cache flush. */
817   radv_ace_internal_cache_flush(cmd_buffer);
818
819   /* Clear the ACE semaphore if it exists.
820    * This is necessary in case the same cmd buffer is submitted again in the future.
821    */
822   if (cmd_buffer->ace_internal.sem.va) {
823      struct radeon_cmdbuf *main_cs = cmd_buffer->cs;
824      uint64_t gfx2ace_va = cmd_buffer->ace_internal.sem.va;
825      uint64_t ace2gfx_va = cmd_buffer->ace_internal.sem.va + 4;
826
827      /* ACE: write 1 to the ACE->GFX semaphore. */
828      si_cs_emit_write_event_eop(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
829                                 true, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
830                                 EOP_DATA_SEL_VALUE_32BIT, ace2gfx_va, 1,
831                                 cmd_buffer->gfx9_eop_bug_va);
832
833      /* Wait for ACE to finish, otherwise we may risk writing 0 to the semaphore
834       * when ACE is still waiting for it. This may not happen in practice, but
835       * better safe than sorry.
836       */
837      radv_cp_wait_mem(main_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, ace2gfx_va, 1, 0xffffffff);
838
839      /* GFX: clear GFX->ACE and ACE->GFX semaphores. */
840      radv_emit_clear_data(cmd_buffer, V_370_ME, gfx2ace_va, 8);
841   }
842
843   device->ws->cs_add_buffers(ace_cs, cmd_buffer->cs);
844   return device->ws->cs_finalize(ace_cs);
845}
846
847static void
848radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
849{
850   if (unlikely(cmd_buffer->device->thread_trace.bo)) {
851      radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
852      radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
853   }
854
855   if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
856      enum rgp_flush_bits sqtt_flush_bits = 0;
857      assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
858
859      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
860
861      /* Force wait for graphics or compute engines to be idle. */
862      si_cs_emit_cache_flush(cmd_buffer->cs,
863                             cmd_buffer->device->physical_device->rad_info.gfx_level,
864                             &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
865                             radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
866                             cmd_buffer->gfx9_eop_bug_va);
867
868      if (cmd_buffer->state.graphics_pipeline && (flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) &&
869          radv_pipeline_has_stage(cmd_buffer->state.graphics_pipeline, MESA_SHADER_TASK)) {
870         /* Force wait for compute engines to be idle on the internal cmdbuf. */
871         si_cs_emit_cache_flush(cmd_buffer->ace_internal.cs,
872                                cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
873                                true, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
874      }
875   }
876
877   if (unlikely(cmd_buffer->device->trace_bo))
878      radv_cmd_buffer_trace_emit(cmd_buffer);
879}
880
881static void
882radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
883{
884   struct radv_device *device = cmd_buffer->device;
885   enum amd_ip_type ring;
886   uint32_t data[2];
887   uint64_t va;
888
889   va = radv_buffer_get_va(device->trace_bo);
890
891   ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
892
893   switch (ring) {
894   case AMD_IP_GFX:
895      va += 8;
896      break;
897   case AMD_IP_COMPUTE:
898      va += 16;
899      break;
900   default:
901      assert(!"invalid IP type");
902   }
903
904   uint64_t pipeline_address = (uintptr_t)pipeline;
905   data[0] = pipeline_address;
906   data[1] = pipeline_address >> 32;
907
908   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
909}
910
911static void
912radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
913{
914   struct radv_device *device = cmd_buffer->device;
915   uint32_t data[2];
916   uint64_t va;
917
918   va = radv_buffer_get_va(device->trace_bo);
919   va += 24;
920
921   data[0] = vb_ptr;
922   data[1] = vb_ptr >> 32;
923
924   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
925}
926
927static void
928radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
929{
930   struct radv_device *device = cmd_buffer->device;
931   uint32_t data[2];
932   uint64_t va;
933
934   va = radv_buffer_get_va(device->trace_bo);
935   va += 32;
936
937   uint64_t prolog_address = (uintptr_t)prolog;
938   data[0] = prolog_address;
939   data[1] = prolog_address >> 32;
940
941   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
942}
943
944void
945radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
946                        struct radv_descriptor_set *set, unsigned idx)
947{
948   struct radv_descriptor_state *descriptors_state =
949      radv_get_descriptors_state(cmd_buffer, bind_point);
950
951   descriptors_state->sets[idx] = set;
952
953   descriptors_state->valid |= (1u << idx); /* active descriptors */
954   descriptors_state->dirty |= (1u << idx);
955}
956
957static void
958radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
959{
960   struct radv_descriptor_state *descriptors_state =
961      radv_get_descriptors_state(cmd_buffer, bind_point);
962   struct radv_device *device = cmd_buffer->device;
963   uint32_t data[MAX_SETS * 2] = {0};
964   uint64_t va;
965   va = radv_buffer_get_va(device->trace_bo) + 40;
966
967   u_foreach_bit(i, descriptors_state->valid)
968   {
969      struct radv_descriptor_set *set = descriptors_state->sets[i];
970      data[i * 2] = (uint64_t)(uintptr_t)set;
971      data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
972   }
973
974   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
975}
976
977struct radv_userdata_info *
978radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
979{
980   struct radv_shader *shader = radv_get_shader(pipeline, stage);
981   return &shader->info.user_sgprs_locs.shader_data[idx];
982}
983
984static void
985radv_emit_userdata_address(struct radv_device *device, struct radeon_cmdbuf *cs,
986                           struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
987                           uint64_t va)
988{
989   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
990   uint32_t base_reg = pipeline->user_data_0[stage];
991   if (loc->sgpr_idx == -1)
992      return;
993
994   assert(loc->num_sgprs == 1);
995
996   radv_emit_shader_pointer(device, cs, base_reg + loc->sgpr_idx * 4, va, false);
997}
998
999static void
1000radv_emit_descriptor_pointers(struct radv_device *device, struct radeon_cmdbuf *cs,
1001                              struct radv_pipeline *pipeline,
1002                              struct radv_descriptor_state *descriptors_state,
1003                              gl_shader_stage stage)
1004{
1005   uint32_t sh_base = pipeline->user_data_0[stage];
1006   struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
1007   unsigned mask = locs->descriptor_sets_enabled;
1008
1009   mask &= descriptors_state->dirty & descriptors_state->valid;
1010
1011   while (mask) {
1012      int start, count;
1013
1014      u_bit_scan_consecutive_range(&mask, &start, &count);
1015
1016      struct radv_userdata_info *loc = &locs->descriptor_sets[start];
1017      unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
1018
1019      radv_emit_shader_pointer_head(cs, sh_offset, count, true);
1020      for (int i = 0; i < count; i++) {
1021         struct radv_descriptor_set *set = descriptors_state->sets[start + i];
1022
1023         radv_emit_shader_pointer_body(device, cs, set->header.va, true);
1024      }
1025   }
1026}
1027
1028/**
1029 * Convert the user sample locations to hardware sample locations (the values
1030 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
1031 */
1032static void
1033radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
1034                              VkOffset2D *sample_locs)
1035{
1036   uint32_t x_offset = x % state->grid_size.width;
1037   uint32_t y_offset = y % state->grid_size.height;
1038   uint32_t num_samples = (uint32_t)state->per_pixel;
1039   VkSampleLocationEXT *user_locs;
1040   uint32_t pixel_offset;
1041
1042   pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
1043
1044   assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
1045   user_locs = &state->locations[pixel_offset];
1046
1047   for (uint32_t i = 0; i < num_samples; i++) {
1048      float shifted_pos_x = user_locs[i].x - 0.5;
1049      float shifted_pos_y = user_locs[i].y - 0.5;
1050
1051      int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
1052      int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
1053
1054      sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
1055      sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
1056   }
1057}
1058
1059/**
1060 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
1061 * locations.
1062 */
1063static void
1064radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
1065                               uint32_t *sample_locs_pixel)
1066{
1067   for (uint32_t i = 0; i < num_samples; i++) {
1068      uint32_t sample_reg_idx = i / 4;
1069      uint32_t sample_loc_idx = i % 4;
1070      int32_t pos_x = sample_locs[i].x;
1071      int32_t pos_y = sample_locs[i].y;
1072
1073      uint32_t shift_x = 8 * sample_loc_idx;
1074      uint32_t shift_y = shift_x + 4;
1075
1076      sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
1077      sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
1078   }
1079}
1080
1081/**
1082 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
1083 * sample locations.
1084 */
1085static uint64_t
1086radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
1087                               uint32_t num_samples)
1088{
1089   uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
1090   uint32_t sample_mask = num_samples - 1;
1091   uint32_t *distances = alloca(num_samples * sizeof(*distances));
1092   uint64_t centroid_priority = 0;
1093
1094   /* Compute the distances from center for each sample. */
1095   for (int i = 0; i < num_samples; i++) {
1096      distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
1097   }
1098
1099   /* Compute the centroid priorities by looking at the distances array. */
1100   for (int i = 0; i < num_samples; i++) {
1101      uint32_t min_idx = 0;
1102
1103      for (int j = 1; j < num_samples; j++) {
1104         if (distances[j] < distances[min_idx])
1105            min_idx = j;
1106      }
1107
1108      centroid_priorities[i] = min_idx;
1109      distances[min_idx] = 0xffffffff;
1110   }
1111
1112   /* Compute the final centroid priority. */
1113   for (int i = 0; i < 8; i++) {
1114      centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
1115   }
1116
1117   return centroid_priority << 32 | centroid_priority;
1118}
1119
1120/**
1121 * Emit the sample locations that are specified with VK_EXT_sample_locations.
1122 */
1123static void
1124radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
1125{
1126   struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
1127   uint32_t num_samples = (uint32_t)sample_location->per_pixel;
1128   struct radeon_cmdbuf *cs = cmd_buffer->cs;
1129   uint32_t sample_locs_pixel[4][2] = {0};
1130   VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
1131   uint32_t max_sample_dist = 0;
1132   uint64_t centroid_priority;
1133
1134   if (!cmd_buffer->state.dynamic.sample_location.count)
1135      return;
1136
1137   /* Convert the user sample locations to hardware sample locations. */
1138   radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
1139   radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
1140   radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
1141   radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
1142
1143   /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
1144   for (uint32_t i = 0; i < 4; i++) {
1145      radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
1146   }
1147
1148   /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
1149   centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
1150
1151   /* Compute the maximum sample distance from the specified locations. */
1152   for (unsigned i = 0; i < 4; ++i) {
1153      for (uint32_t j = 0; j < num_samples; j++) {
1154         VkOffset2D offset = sample_locs[i][j];
1155         max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
1156      }
1157   }
1158
1159   /* Emit the specified user sample locations. */
1160   switch (num_samples) {
1161   case 2:
1162   case 4:
1163      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
1164                             sample_locs_pixel[0][0]);
1165      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
1166                             sample_locs_pixel[1][0]);
1167      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
1168                             sample_locs_pixel[2][0]);
1169      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
1170                             sample_locs_pixel[3][0]);
1171      break;
1172   case 8:
1173      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
1174                             sample_locs_pixel[0][0]);
1175      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
1176                             sample_locs_pixel[1][0]);
1177      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
1178                             sample_locs_pixel[2][0]);
1179      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
1180                             sample_locs_pixel[3][0]);
1181      radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
1182                             sample_locs_pixel[0][1]);
1183      radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
1184                             sample_locs_pixel[1][1]);
1185      radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
1186                             sample_locs_pixel[2][1]);
1187      radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
1188                             sample_locs_pixel[3][1]);
1189      break;
1190   default:
1191      unreachable("invalid number of samples");
1192   }
1193
1194   /* Emit the maximum sample distance and the centroid priority. */
1195   radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
1196                              S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);
1197
1198   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1199   radeon_emit(cs, centroid_priority);
1200   radeon_emit(cs, centroid_priority >> 32);
1201
1202   cmd_buffer->state.context_roll_without_scissor_emitted = true;
1203}
1204
1205static void
1206radv_emit_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
1207                             struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
1208                             uint32_t *values)
1209{
1210   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
1211   uint32_t base_reg = pipeline->user_data_0[stage];
1212   if (loc->sgpr_idx == -1)
1213      return;
1214
1215   radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);
1216
1217   radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
1218   radeon_emit_array(cs, values, loc->num_sgprs);
1219}
1220
1221static void
1222radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
1223                              struct radv_graphics_pipeline *pipeline)
1224{
1225   int num_samples = pipeline->ms.num_samples;
1226   struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;
1227
1228   if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
1229      cmd_buffer->sample_positions_needed = true;
1230
1231   if (old_pipeline && num_samples == old_pipeline->ms.num_samples)
1232      return;
1233
1234   radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
1235
1236   cmd_buffer->state.context_roll_without_scissor_emitted = true;
1237}
1238
1239static void
1240radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer,
1241                          struct radv_graphics_pipeline *pipeline)
1242{
1243   const struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;
1244
1245   if (pipeline->base.device->physical_device->rad_info.gfx_level < GFX9)
1246      return;
1247
1248   if (old_pipeline &&
1249       old_pipeline->binning.pa_sc_binner_cntl_0 ==
1250          pipeline->binning.pa_sc_binner_cntl_0)
1251      return;
1252
1253   bool binning_flush = false;
1254   if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
1255       cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
1256       cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
1257       cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
1258      binning_flush = !old_pipeline ||
1259                      G_028C44_BINNING_MODE(old_pipeline->binning.pa_sc_binner_cntl_0) !=
1260                         G_028C44_BINNING_MODE(pipeline->binning.pa_sc_binner_cntl_0);
1261   }
1262
1263   radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
1264                          pipeline->binning.pa_sc_binner_cntl_0 |
1265                             S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));
1266
1267   cmd_buffer->state.context_roll_without_scissor_emitted = true;
1268}
1269
1270static void
1271radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
1272{
1273   uint64_t va;
1274
1275   if (!shader)
1276      return;
1277
1278   va = radv_shader_get_va(shader);
1279
1280   si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1281}
1282
1283static void
1284radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
1285                      struct radv_graphics_pipeline *pipeline, bool first_stage_only)
1286{
1287   struct radv_cmd_state *state = &cmd_buffer->state;
1288   uint32_t mask = state->prefetch_L2_mask;
1289
1290   /* Fast prefetch path for starting draws as soon as possible. */
1291   if (first_stage_only)
1292      mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;
1293
1294   if (mask & RADV_PREFETCH_VS)
1295      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_VERTEX]);
1296
1297   if (mask & RADV_PREFETCH_MS)
1298      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_MESH]);
1299
1300   if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1301      si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);
1302
1303   if (mask & RADV_PREFETCH_TCS)
1304      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_CTRL]);
1305
1306   if (mask & RADV_PREFETCH_TES)
1307      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_EVAL]);
1308
1309   if (mask & RADV_PREFETCH_GS) {
1310      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_GEOMETRY]);
1311      if (radv_pipeline_has_gs_copy_shader(&pipeline->base))
1312         radv_emit_shader_prefetch(cmd_buffer, pipeline->base.gs_copy_shader);
1313   }
1314
1315   if (mask & RADV_PREFETCH_PS)
1316      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_FRAGMENT]);
1317
1318   state->prefetch_L2_mask &= ~mask;
1319}
1320
1321static void
1322radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1323{
1324   if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
1325      return;
1326
1327   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1328   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1329
1330   unsigned sx_ps_downconvert = 0;
1331   unsigned sx_blend_opt_epsilon = 0;
1332   unsigned sx_blend_opt_control = 0;
1333
1334   for (unsigned i = 0; i < subpass->color_count; ++i) {
1335      unsigned format, swap;
1336      bool has_alpha, has_rgb;
1337      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1338         /* We don't set the DISABLE bits, because the HW can't have holes,
1339          * so the SPI color format is set to 32-bit 1-component. */
1340         sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1341         continue;
1342      }
1343
1344      int idx = subpass->color_attachments[i].attachment;
1345      if (cmd_buffer->state.attachments) {
1346         struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;
1347
1348         format = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
1349                     ? G_028C70_FORMAT_GFX11(cb->cb_color_info)
1350                     : G_028C70_FORMAT_GFX6(cb->cb_color_info);
1351         swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1352         has_alpha = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
1353                        ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->cb_color_attrib)
1354                        : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->cb_color_attrib);
1355      } else {
1356         VkFormat fmt = cmd_buffer->state.pass->attachments[idx].format;
1357         format = radv_translate_colorformat(fmt);
1358         swap = radv_translate_colorswap(fmt, false);
1359         has_alpha = vk_format_description(fmt)->swizzle[3] != PIPE_SWIZZLE_1;
1360      }
1361
1362      uint32_t spi_format = (pipeline->col_format >> (i * 4)) & 0xf;
1363      uint32_t colormask = (pipeline->cb_target_mask >> (i * 4)) & 0xf;
1364
1365      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
1366         has_rgb = !has_alpha;
1367      else
1368         has_rgb = true;
1369
1370      /* Check the colormask and export format. */
1371      if (!(colormask & 0x7))
1372         has_rgb = false;
1373      if (!(colormask & 0x8))
1374         has_alpha = false;
1375
1376      if (spi_format == V_028714_SPI_SHADER_ZERO) {
1377         has_rgb = false;
1378         has_alpha = false;
1379      }
1380
1381      /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1382       * optimization, even though it has no alpha. */
1383      if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1384         has_alpha = true;
1385
1386      /* Disable value checking for disabled channels. */
1387      if (!has_rgb)
1388         sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1389      if (!has_alpha)
1390         sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1391
1392      /* Enable down-conversion for 32bpp and smaller formats. */
1393      switch (format) {
1394      case V_028C70_COLOR_8:
1395      case V_028C70_COLOR_8_8:
1396      case V_028C70_COLOR_8_8_8_8:
1397         /* For 1 and 2-channel formats, use the superset thereof. */
1398         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
1399             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1400             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1401            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1402            sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1403         }
1404         break;
1405
1406      case V_028C70_COLOR_5_6_5:
1407         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1408            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1409            sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1410         }
1411         break;
1412
1413      case V_028C70_COLOR_1_5_5_5:
1414         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1415            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1416            sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1417         }
1418         break;
1419
1420      case V_028C70_COLOR_4_4_4_4:
1421         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1422            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1423            sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1424         }
1425         break;
1426
1427      case V_028C70_COLOR_32:
1428         if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1429            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1430         else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1431            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1432         break;
1433
1434      case V_028C70_COLOR_16:
1435      case V_028C70_COLOR_16_16:
1436         /* For 1-channel formats, use the superset thereof. */
1437         if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1438             spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1439             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1440             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1441            if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1442               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1443            else
1444               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1445         }
1446         break;
1447
1448      case V_028C70_COLOR_10_11_11:
1449         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1450            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1451         break;
1452
1453      case V_028C70_COLOR_2_10_10_10:
1454         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1455            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1456            sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1457         }
1458         break;
1459      case V_028C70_COLOR_5_9_9_9:
1460         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1461            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1462         break;
1463      }
1464   }
1465
1466   /* Do not set the DISABLE bits for the unused attachments, as that
1467    * breaks dual source blending in SkQP and does not seem to improve
1468    * performance. */
1469
1470   if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
1471       sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
1472       sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
1473      return;
1474
1475   radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1476   radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1477   radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1478   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1479
1480   cmd_buffer->state.context_roll_without_scissor_emitted = true;
1481
1482   cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
1483   cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
1484   cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
1485}
1486
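/* Flush the primitive binner with a BREAK_BATCH event when a new fragment
 * shader or a new color write mask is bound while the binning settings keep
 * more than one context/persistent state per bin.
 */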
1487static void
1488radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
1489{
1490   if (!cmd_buffer->device->pbb_allowed)
1491      return;
1492
1493   struct radv_binning_settings settings =
1494      radv_get_binning_settings(cmd_buffer->device->physical_device);
1495   bool break_for_new_ps =
1496      (!cmd_buffer->state.emitted_graphics_pipeline ||
1497       cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
1498          cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
1499      (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
1500   bool break_for_new_cb_target_mask =
1501      (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
1502      settings.context_states_per_bin > 1;
1503
1504   if (!break_for_new_ps && !break_for_new_cb_target_mask)
1505      return;
1506
1507   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1508   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1509}
1510
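/* Emit the pre-built command streams of the bound graphics pipeline, update
 * the scratch requirements, and mark dirty any dynamic state whose register
 * values depend on the new pipeline (viewport, scissor, culling, depth/stencil
 * control, logic op, color write enable, ...).
 */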
1511static void
1512radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1513{
1514   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1515
1516   if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
1517      return;
1518
1519   radv_update_multisample_state(cmd_buffer, pipeline);
1520   radv_update_binning_state(cmd_buffer, pipeline);
1521
1522   cmd_buffer->scratch_size_per_wave_needed =
1523      MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
1524   cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->base.max_waves);
1525
1526   if (!cmd_buffer->state.emitted_graphics_pipeline ||
1527       cmd_buffer->state.emitted_graphics_pipeline->negative_one_to_one != pipeline->negative_one_to_one ||
1528       cmd_buffer->state.emitted_graphics_pipeline->depth_clamp_mode != pipeline->depth_clamp_mode)
1529      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
1530
1531   if (!cmd_buffer->state.emitted_graphics_pipeline ||
1532       radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) != radv_rast_prim_is_points_or_lines(pipeline->rast_prim) ||
1533       cmd_buffer->state.emitted_graphics_pipeline->line_width != pipeline->line_width)
1534      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
1535
1536   if (!cmd_buffer->state.emitted_graphics_pipeline ||
1537       cmd_buffer->state.emitted_graphics_pipeline->pa_su_sc_mode_cntl != pipeline->pa_su_sc_mode_cntl)
1538      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
1539                                 RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
1540                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
1541
1542   if (!cmd_buffer->state.emitted_graphics_pipeline ||
1543       cmd_buffer->state.emitted_graphics_pipeline->pa_cl_clip_cntl != pipeline->pa_cl_clip_cntl)
1544      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1545
1546   if (!cmd_buffer->state.emitted_graphics_pipeline ||
1547       cmd_buffer->state.emitted_graphics_pipeline->cb_color_control != pipeline->cb_color_control)
1548      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
1549
1550   if (!cmd_buffer->state.emitted_graphics_pipeline)
1551      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
1552                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
1553                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
1554                                 RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
1555
1556   if (!cmd_buffer->state.emitted_graphics_pipeline ||
1557       cmd_buffer->state.emitted_graphics_pipeline->db_depth_control != pipeline->db_depth_control)
1558      cmd_buffer->state.dirty |=
1559         RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
1560         RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
1561         RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1562
1563   if (!cmd_buffer->state.emitted_graphics_pipeline)
1564      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1565
1566   if (!cmd_buffer->state.emitted_graphics_pipeline ||
1567       cmd_buffer->state.emitted_graphics_pipeline->cb_target_mask != pipeline->cb_target_mask) {
1568      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
1569   }
1570
1571   radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
1572
1573   if (pipeline->has_ngg_culling &&
1574       pipeline->last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
1575       !cmd_buffer->state.last_nggc_settings) {
1576      /* The already emitted RSRC2 contains the LDS required for NGG culling.
1577       * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
1578       * API GS always needs LDS, so this isn't useful there.
1579       */
1580      struct radv_shader *v = pipeline->base.shaders[pipeline->last_vgt_api_stage];
1581      radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
1582                        (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
1583                        S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
1584   }
1585
1586   if (!cmd_buffer->state.emitted_graphics_pipeline ||
1587       cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.cdw != pipeline->base.ctx_cs.cdw ||
1588       cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs_hash != pipeline->base.ctx_cs_hash ||
1589       memcmp(cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.buf,
1590              pipeline->base.ctx_cs.cdw * 4)) {
1591      radeon_emit_array(cmd_buffer->cs, pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.cdw);
1592      cmd_buffer->state.context_roll_without_scissor_emitted = true;
1593   }
1594
1595   radv_emit_batch_break_on_new_ps(cmd_buffer);
1596
1597   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo);
1598
1599   if (unlikely(cmd_buffer->device->trace_bo))
1600      radv_save_pipeline(cmd_buffer, &pipeline->base);
1601
1602   cmd_buffer->state.emitted_graphics_pipeline = pipeline;
1603
1604   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1605}
1606
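/* Emit the viewport transforms. When VK_EXT_depth_clip_control selects a
 * [-1, 1] clip space (negative_one_to_one), the Z scale/translate are remapped
 * because the hardware viewport transform assumes a [0, 1] depth range;
 * PA_SC_VPORT_ZMIN/ZMAX programs the per-viewport depth clamp range.
 */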
1607static void
1608radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1609{
1610   const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1611   const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport;
1612   int i;
1613   const unsigned count = viewport->count;
1614
1615   assert(count);
1616   radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6);
1617
1618   for (i = 0; i < count; i++) {
1619      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0]));
1620      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0]));
1621      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1]));
1622      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1]));
1623
1624      double scale_z, translate_z;
1625      if (pipeline->negative_one_to_one) {
1626         scale_z = viewport->xform[i].scale[2] * 0.5f;
1627         translate_z = (viewport->xform[i].translate[2] + viewport->viewports[i].maxDepth) * 0.5f;
1628      } else {
1629         scale_z = viewport->xform[i].scale[2];
1630         translate_z = viewport->xform[i].translate[2];
1632      }
1633      radeon_emit(cmd_buffer->cs, fui(scale_z));
1634      radeon_emit(cmd_buffer->cs, fui(translate_z));
1635   }
1636
1637   radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2);
1638   for (i = 0; i < count; i++) {
1639      float zmin, zmax;
1640
1641      if (pipeline->depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) {
1642         zmin = 0.0f;
1643         zmax = 1.0f;
1644      } else {
1645         zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1646         zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1647      }
1648
1649      radeon_emit(cmd_buffer->cs, fui(zmin));
1650      radeon_emit(cmd_buffer->cs, fui(zmax));
1651   }
1652}
1653
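/* Write the scissor rectangles. The rasterized primitive type and the line
 * width are passed along because si_write_scissors() derives the guard band
 * from them.
 */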
1654void
1655radv_write_scissors(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs)
1656{
1657   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1658   uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1659   unsigned rast_prim;
1660
1661   if (!(pipeline->dynamic_states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) ||
1662       (pipeline->active_stages & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
1663                                   VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
1664                                   VK_SHADER_STAGE_GEOMETRY_BIT |
1665                                   VK_SHADER_STAGE_MESH_BIT_NV))) {
1666      /* Ignore dynamic primitive topology for TES/GS/MS stages. */
1667      rast_prim = pipeline->rast_prim;
1668   } else {
1669      rast_prim = si_conv_prim_to_gs_out(cmd_buffer->state.dynamic.primitive_topology);
1670   }
1671
1672   si_write_scissors(cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors,
1673                     cmd_buffer->state.dynamic.viewport.viewports, rast_prim,
1674                     cmd_buffer->state.dynamic.line_width);
1675}
1676
1677static void
1678radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1679{
1680   radv_write_scissors(cmd_buffer, cmd_buffer->cs);
1681
1682   cmd_buffer->state.context_roll_without_scissor_emitted = false;
1683}
1684
1685static void
1686radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1687{
1688   if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1689      return;
1690
1691   radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1692                              cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1693   for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1694      VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1695      radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1696      radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1697                                     S_028214_BR_Y(rect.offset.y + rect.extent.height));
1698   }
1699}
1700
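/* PA_SU_LINE_CNTL.WIDTH is a fixed-point value in 1/8th pixel units, hence the
 * 8x factor.
 */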
1701static void
1702radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1703{
1704   unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1705
1706   radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1707                          S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
1708}
1709
1710static void
1711radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1712{
1713   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1714
1715   radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1716   radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1717}
1718
1719static void
1720radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1721{
1722   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1723
1724   radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
1725   radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1726                                  S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1727                                  S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1728                                  S_028430_STENCILOPVAL(1));
1729   radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1730                                  S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1731                                  S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1732                                  S_028434_STENCILOPVAL_BF(1));
1733}
1734
1735static void
1736radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1737{
1738   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1739
1740   radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
1741   radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min));
1742   radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max));
1743}
1744
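/* Emit the depth bias (polygon offset) state. The same scale/offset pair is
 * used for front and back faces, and the slope factor is pre-scaled by 16 as
 * PA_SU_POLY_OFFSET_FRONT/BACK_SCALE expect.
 */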
1745static void
1746radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1747{
1748   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1749   unsigned slope = fui(d->depth_bias.slope * 16.0f);
1750
1751   radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1752   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1753   radeon_emit(cmd_buffer->cs, slope);                    /* FRONT SCALE */
1754   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* FRONT OFFSET */
1755   radeon_emit(cmd_buffer->cs, slope);                    /* BACK SCALE */
1756   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* BACK OFFSET */
1757}
1758
1759static void
1760radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
1761{
1762   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1763   uint32_t auto_reset_cntl = 1;
1764
1765   if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
1766      auto_reset_cntl = 2;
1767
1768   radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
1769                          S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
1770                             S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
1771                             S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
1772}
1773
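/* Combine the pipeline's static PA_SU_SC_MODE_CNTL value with the dynamic cull
 * mode, front face and depth bias enable state.
 */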
1774uint32_t
1775radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer)
1776{
1777   unsigned pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
1778   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1779
1780   pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
1781                         C_028814_CULL_BACK &
1782                         C_028814_FACE &
1783                         C_028814_POLY_OFFSET_FRONT_ENABLE &
1784                         C_028814_POLY_OFFSET_BACK_ENABLE &
1785                         C_028814_POLY_OFFSET_PARA_ENABLE;
1786
1787   pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
1788                         S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
1789                         S_028814_FACE(d->front_face) |
1790                         S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
1791                         S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
1792                         S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
1793   return pa_su_sc_mode_cntl;
1794}
1795
1796static void
1797radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1798{
1799   unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer);
1800
1801   radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
1802}
1803
1804static void
1805radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
1806{
1807   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1808
1809   assert(!cmd_buffer->state.mesh_shading);
1810
1811   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
1812      radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
1813                                 R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
1814   } else {
1815      radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
1816   }
1817}
1818
1819static void
1820radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1821{
1822   unsigned db_depth_control = cmd_buffer->state.graphics_pipeline->db_depth_control;
1823   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1824
1825   db_depth_control &= C_028800_Z_ENABLE &
1826                       C_028800_Z_WRITE_ENABLE &
1827                       C_028800_ZFUNC &
1828                       C_028800_DEPTH_BOUNDS_ENABLE &
1829                       C_028800_STENCIL_ENABLE &
1830                       C_028800_BACKFACE_ENABLE &
1831                       C_028800_STENCILFUNC &
1832                       C_028800_STENCILFUNC_BF;
1833
1834   db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
1835                       S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
1836                       S_028800_ZFUNC(d->depth_compare_op) |
1837                       S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
1838                       S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
1839                       S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
1840                       S_028800_STENCILFUNC(d->stencil_op.front.compare_op) |
1841                       S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op);
1842
1843   radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
1844}
1845
1846static void
1847radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
1848{
1849   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1850
1851   radeon_set_context_reg(
1852      cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
1853      S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
1854         S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
1855         S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
1856         S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
1857         S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
1858         S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
1859}
1860
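/* Program the per-draw VRS rate (GE_VRS_RATE) and the combiner modes between
 * the draw, vertex/primitive and HTILE rates (PA_CL_VRS_CNTL). This requires
 * GFX10.3+.
 */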
1861static void
1862radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
1863{
1864   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1865   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1866   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1867   uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1;
1868   uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1;
1869   uint32_t pa_cl_vrs_cntl = pipeline->vrs.pa_cl_vrs_cntl;
1870   uint32_t pipeline_comb_mode = d->fragment_shading_rate.combiner_ops[0];
1871   uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1];
1872
1873   assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
1874
1875   if (subpass && !subpass->vrs_attachment) {
1876      /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
1877       * can cheat by tweaking the different combiner modes.
1878       */
1879      switch (htile_comb_mode) {
1880      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
1881         /* The result of min(A, 1x1) is always 1x1. */
1882         FALLTHROUGH;
1883      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
1884         /* Force the per-draw VRS rate to 1x1. */
1885         rate_x = rate_y = 0;
1886
1887         /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate
1888          * combiner mode as passthrough.
1889          */
1890         pipeline_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU;
1891         break;
1892      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
1893         /* The result of max(A, 1x1) is always A. */
1894         FALLTHROUGH;
1895      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
1896         /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
1897         break;
1898      default:
1899         break;
1900      }
1901   }
1902
1903   /* Emit per-draw VRS rate which is the first combiner. */
1904   radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE,
1905                          S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
1906
1907   /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
1908    * draw rate and the vertex rate.
1909    */
1910   if (cmd_buffer->state.mesh_shading) {
1911      pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU) |
1912                        S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode);
1913   } else {
1914      pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) |
1915                        S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
1916   }
1917
1918   /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
1919    * rate.
1920    */
1921   pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
1922
1923   radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
1924}
1925
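/* The primitive restart enable moved from a context register (GFX6-8) to a
 * uconfig register on GFX9+, and the register was renamed
 * GE_MULTI_PRIM_IB_RESET_EN on GFX11.
 */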
1926static void
1927radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
1928{
1929   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1930
1931   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1932      radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN,
1933                             d->primitive_restart_enable);
1934   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
1935      radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
1936                             d->primitive_restart_enable);
1937   } else {
1938      radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
1939                             d->primitive_restart_enable);
1940   }
1941}
1942
1943static void
1944radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer)
1945{
1946   unsigned pa_cl_clip_cntl = cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl;
1947   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1948
1949   pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL;
1950   pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable);
1951
1952   radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl);
1953}
1954
1955static void
1956radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
1957{
1958   unsigned cb_color_control = cmd_buffer->state.graphics_pipeline->cb_color_control;
1959   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1960
1961   cb_color_control &= C_028808_ROP3;
1962   cb_color_control |= S_028808_ROP3(d->logic_op);
1963
1964   radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
1965}
1966
1967static void
1968radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer)
1969{
1970   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1971   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1972
1973   radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK,
1974                          pipeline->cb_target_mask & d->color_write_enable);
1975}
1976
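/* Emit the CB_COLOR* registers for one color attachment. DCC/FMASK compression
 * is disabled on the fly when the current layout (or a running FMASK/DCC
 * decompression) doesn't allow it.
 */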
1977static void
1978radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index,
1979                         struct radv_color_buffer_info *cb, struct radv_image_view *iview,
1980                         VkImageLayout layout, bool in_render_loop)
1981{
1982   bool is_vi = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8;
1983   uint32_t cb_fdcc_control = cb->cb_dcc_control;
1984   uint32_t cb_color_info = cb->cb_color_info;
1985   struct radv_image *image = iview->image;
1986
1987   if (!radv_layout_dcc_compressed(
1988          cmd_buffer->device, image, iview->vk.base_mip_level, layout, in_render_loop,
1989          radv_image_queue_family_mask(image, cmd_buffer->qf,
1990                                       cmd_buffer->qf))) {
1991      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1992         cb_fdcc_control &= C_028C78_FDCC_ENABLE;
1993      } else {
1994         cb_color_info &= C_028C70_DCC_ENABLE;
1995      }
1996   }
1997
1998   if (!radv_layout_fmask_compressed(
1999          cmd_buffer->device, image, layout,
2000          radv_image_queue_family_mask(image, cmd_buffer->qf,
2001                                       cmd_buffer->qf))) {
2002      cb_color_info &= C_028C70_COMPRESSION;
2003   }
2004
2005   if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
2006                                                radv_is_dcc_decompress_pipeline(cmd_buffer))) {
2007      /* If this bit is set, the FMASK decompression operation
2008       * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
2009       */
2010      cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
2011   }
2012
2013   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2014      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4);
2015      radeon_emit(cmd_buffer->cs, cb->cb_color_view);                      /* CB_COLOR0_VIEW */
2016      radeon_emit(cmd_buffer->cs, cb->cb_color_info);                      /* CB_COLOR0_INFO */
2017      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);                    /* CB_COLOR0_ATTRIB */
2018      radeon_emit(cmd_buffer->cs, cb_fdcc_control);                        /* CB_COLOR0_FDCC_CONTROL */
2019
2020      radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->cb_color_base);
2021      radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32);
2022      radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2023      radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32);
2024      radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2);
2025      radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3);
2026   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2027      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2028      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2029      radeon_emit(cmd_buffer->cs, 0);
2030      radeon_emit(cmd_buffer->cs, 0);
2031      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2032      radeon_emit(cmd_buffer->cs, cb_color_info);
2033      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2034      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2035      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2036      radeon_emit(cmd_buffer->cs, 0);
2037      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2038      radeon_emit(cmd_buffer->cs, 0);
2039
2040      radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2041
2042      radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
2043                             cb->cb_color_base >> 32);
2044      radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
2045                             cb->cb_color_cmask >> 32);
2046      radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
2047                             cb->cb_color_fmask >> 32);
2048      radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
2049                             cb->cb_dcc_base >> 32);
2050      radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
2051                             cb->cb_color_attrib2);
2052      radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
2053                             cb->cb_color_attrib3);
2054   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2055      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2056      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2057      radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
2058      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
2059      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2060      radeon_emit(cmd_buffer->cs, cb_color_info);
2061      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2062      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2063      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2064      radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
2065      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2066      radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
2067
2068      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
2069      radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
2070      radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
2071
2072      radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
2073                             cb->cb_mrt_epitch);
2074   } else {
2075      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2076      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2077      radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
2078      radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
2079      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2080      radeon_emit(cmd_buffer->cs, cb_color_info);
2081      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2082      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2083      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2084      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
2085      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2086      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
2087
2088      if (is_vi) { /* DCC BASE */
2089         radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c,
2090                                cb->cb_dcc_base);
2091      }
2092   }
2093
2094   if (G_028C70_DCC_ENABLE(cb_color_info)) {
2095      /* Drawing with DCC enabled also compresses colorbuffers. */
2096      VkImageSubresourceRange range = {
2097         .aspectMask = iview->vk.aspects,
2098         .baseMipLevel = iview->vk.base_mip_level,
2099         .levelCount = iview->vk.level_count,
2100         .baseArrayLayer = iview->vk.base_array_layer,
2101         .layerCount = iview->vk.layer_count,
2102      };
2103
2104      radv_update_dcc_metadata(cmd_buffer, image, &range, true);
2105   }
2106}
2107
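/* Work around the TC-compat ZRANGE_PRECISION bug: when the last fast depth
 * clear value was 0.0f, DB_Z_INFO.ZRANGE_PRECISION must be reprogrammed to 0.
 * If the last clear value isn't known at emit time, the register write is
 * guarded by a COND_EXEC packet that reads the per-level metadata.
 */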
2108static void
2109radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
2110                             const struct radv_image_view *iview, VkImageLayout layout,
2111                             bool in_render_loop, bool requires_cond_exec)
2112{
2113   const struct radv_image *image = iview->image;
2114   uint32_t db_z_info = ds->db_z_info;
2115   uint32_t db_z_info_reg;
2116
2117   if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
2118       !radv_image_is_tc_compat_htile(image))
2119      return;
2120
2121   if (!radv_layout_is_htile_compressed(
2122          cmd_buffer->device, image, layout, in_render_loop,
2123          radv_image_queue_family_mask(image, cmd_buffer->qf,
2124                                       cmd_buffer->qf))) {
2125      db_z_info &= C_028040_TILE_SURFACE_ENABLE;
2126   }
2127
2128   db_z_info &= C_028040_ZRANGE_PRECISION;
2129
2130   if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2131      db_z_info_reg = R_028038_DB_Z_INFO;
2132   } else {
2133      db_z_info_reg = R_028040_DB_Z_INFO;
2134   }
2135
2136   /* When the last fast clear value isn't known at emit time, emit a
2137    * conditional packet that skips the following SET_CONTEXT_REG packet
2138    * unless the per-level metadata says the last clear value was 0.0f.
2139    */
2140   if (requires_cond_exec) {
2141      uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level);
2142
2143      radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
2144      radeon_emit(cmd_buffer->cs, va);
2145      radeon_emit(cmd_buffer->cs, va >> 32);
2146      radeon_emit(cmd_buffer->cs, 0);
2147      radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
2148   }
2149
2150   radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
2151}
2152
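/* Emit the DB_* registers for the bound depth/stencil attachment, disabling
 * HTILE when the current layout doesn't keep the surface compressed and
 * dropping the VRS HTILE encoding when the subpass has no VRS attachment.
 */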
2153static void
2154radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
2155                      struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop)
2156{
2157   const struct radv_image *image = iview->image;
2158   uint32_t db_z_info = ds->db_z_info;
2159   uint32_t db_stencil_info = ds->db_stencil_info;
2160   uint32_t db_htile_surface = ds->db_htile_surface;
2161
2162   if (!radv_layout_is_htile_compressed(
2163          cmd_buffer->device, image, layout, in_render_loop,
2164          radv_image_queue_family_mask(image, cmd_buffer->qf,
2165                                       cmd_buffer->qf))) {
2166      db_z_info &= C_028040_TILE_SURFACE_ENABLE;
2167      db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
2168   }
2169
2170   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3 &&
2171       !cmd_buffer->state.subpass->vrs_attachment) {
2172      db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING;
2173   }
2174
2175   radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
2176   radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
2177
2178   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2179      radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
2180      radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
2181
2182      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2183         radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6);
2184      } else {
2185         radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
2186         radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
2187      }
2188      radeon_emit(cmd_buffer->cs, db_z_info);
2189      radeon_emit(cmd_buffer->cs, db_stencil_info);
2190      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2191      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2192      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2193      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2194
2195      radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
2196      radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
2197      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
2198      radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
2199      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
2200      radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
2201   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2202      radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
2203      radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
2204      radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
2205      radeon_emit(cmd_buffer->cs, ds->db_depth_size);
2206
2207      radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
2208      radeon_emit(cmd_buffer->cs, db_z_info);          /* DB_Z_INFO */
2209      radeon_emit(cmd_buffer->cs, db_stencil_info);    /* DB_STENCIL_INFO */
2210      radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
2211      radeon_emit(cmd_buffer->cs,
2212                  S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
2213      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);   /* DB_STENCIL_READ_BASE */
2214      radeon_emit(cmd_buffer->cs,
2215                  S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
2216      radeon_emit(cmd_buffer->cs, ds->db_z_write_base);              /* DB_Z_WRITE_BASE */
2217      radeon_emit(cmd_buffer->cs,
2218                  S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
2219      radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);   /* DB_STENCIL_WRITE_BASE */
2220      radeon_emit(cmd_buffer->cs,
2221                  S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
2222
2223      radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
2224      radeon_emit(cmd_buffer->cs, ds->db_z_info2);
2225      radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
2226   } else {
2227      radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
2228
2229      radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
2230      radeon_emit(cmd_buffer->cs, ds->db_depth_info);         /* R_02803C_DB_DEPTH_INFO */
2231      radeon_emit(cmd_buffer->cs, db_z_info);                 /* R_028040_DB_Z_INFO */
2232      radeon_emit(cmd_buffer->cs, db_stencil_info);           /* R_028044_DB_STENCIL_INFO */
2233      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);        /* R_028048_DB_Z_READ_BASE */
2234      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);  /* R_02804C_DB_STENCIL_READ_BASE */
2235      radeon_emit(cmd_buffer->cs, ds->db_z_write_base);       /* R_028050_DB_Z_WRITE_BASE */
2236      radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
2237      radeon_emit(cmd_buffer->cs, ds->db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
2238      radeon_emit(cmd_buffer->cs, ds->db_depth_slice);        /* R_02805C_DB_DEPTH_SLICE */
2239   }
2240
2241   /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
2242   radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true);
2243
2244   radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
2245                          ds->pa_su_poly_offset_db_fmt_cntl);
2246}
2247
2248/**
2249 * Update the fast clear depth/stencil values if the image is bound as a
2250 * depth/stencil buffer.
2251 */
2252static void
2253radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
2254                                const struct radv_image_view *iview,
2255                                VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2256{
2257   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2258   const struct radv_image *image = iview->image;
2259   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2260   uint32_t att_idx;
2261
2262   if (!cmd_buffer->state.attachments || !subpass)
2263      return;
2264
2265   if (!subpass->depth_stencil_attachment)
2266      return;
2267
2268   att_idx = subpass->depth_stencil_attachment->attachment;
2269   if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2270      return;
2271
2272   if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2273      radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
2274      radeon_emit(cs, ds_clear_value.stencil);
2275      radeon_emit(cs, fui(ds_clear_value.depth));
2276   } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2277      radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
2278   } else {
2279      assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2280      radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
2281   }
2282
2283   /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
2284    * only needed when clearing Z to 0.0.
2285    */
2286   if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
2287      VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2288      bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2289
2290      radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview,
2291                                   layout, in_render_loop, false);
2292   }
2293
2294   cmd_buffer->state.context_roll_without_scissor_emitted = true;
2295}
2296
2297/**
2298 * Set the clear depth/stencil values to the image's metadata.
2299 */
2300static void
2301radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2302                           const VkImageSubresourceRange *range,
2303                           VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2304{
2305   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2306   uint32_t level_count = radv_get_levelCount(image, range);
2307
2308   if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2309      uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
2310
2311      /* Use the fastest way when both aspects are used. */
2312      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
2313      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2314      radeon_emit(cs, va);
2315      radeon_emit(cs, va >> 32);
2316
2317      for (uint32_t l = 0; l < level_count; l++) {
2318         radeon_emit(cs, ds_clear_value.stencil);
2319         radeon_emit(cs, fui(ds_clear_value.depth));
2320      }
2321   } else {
2322      /* Otherwise we need one WRITE_DATA packet per level. */
2323      for (uint32_t l = 0; l < level_count; l++) {
2324         uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
2325         unsigned value;
2326
2327         if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2328            value = fui(ds_clear_value.depth);
2329            va += 4;
2330         } else {
2331            assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2332            value = ds_clear_value.stencil;
2333         }
2334
2335         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
2336         radeon_emit(cs,
2337                     S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2338         radeon_emit(cs, va);
2339         radeon_emit(cs, va >> 32);
2340         radeon_emit(cs, value);
2341      }
2342   }
2343}
2344
2345/**
2346 * Update the TC-compat metadata value for this image.
2347 */
2348static void
2349radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2350                                   const VkImageSubresourceRange *range, uint32_t value)
2351{
2352   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2353
2354   if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
2355      return;
2356
2357   uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
2358   uint32_t level_count = radv_get_levelCount(image, range);
2359
2360   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
2361   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2362   radeon_emit(cs, va);
2363   radeon_emit(cs, va >> 32);
2364
2365   for (uint32_t l = 0; l < level_count; l++)
2366      radeon_emit(cs, value);
2367}
2368
2369static void
2370radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
2371                                      const struct radv_image_view *iview,
2372                                      VkClearDepthStencilValue ds_clear_value)
2373{
2374   VkImageSubresourceRange range = {
2375      .aspectMask = iview->vk.aspects,
2376      .baseMipLevel = iview->vk.base_mip_level,
2377      .levelCount = iview->vk.level_count,
2378      .baseArrayLayer = iview->vk.base_array_layer,
2379      .layerCount = iview->vk.layer_count,
2380   };
2381   uint32_t cond_val;
2382
2383   /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
2384    * depth clear value is 0.0f.
2385    */
2386   cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
2387
2388   radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
2389}
2390
2391/**
2392 * Update the clear depth/stencil values for this image.
2393 */
2394void
2395radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2396                              const struct radv_image_view *iview,
2397                              VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2398{
2399   VkImageSubresourceRange range = {
2400      .aspectMask = iview->vk.aspects,
2401      .baseMipLevel = iview->vk.base_mip_level,
2402      .levelCount = iview->vk.level_count,
2403      .baseArrayLayer = iview->vk.base_array_layer,
2404      .layerCount = iview->vk.layer_count,
2405   };
2406   struct radv_image *image = iview->image;
2407
2408   assert(radv_htile_enabled(image, range.baseMipLevel));
2409
2410   radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
2411
2412   if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
2413      radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
2414   }
2415
2416   radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
2417}
2418
2419/**
2420 * Load the clear depth/stencil values from the image's metadata.
2421 */
2422static void
2423radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
2424{
2425   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2426   const struct radv_image *image = iview->image;
2427   VkImageAspectFlags aspects = vk_format_aspects(image->vk.format);
2428   uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level);
2429   unsigned reg_offset = 0, reg_count = 0;
2430
2431   assert(radv_image_has_htile(image));
2432
2433   if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
2434      ++reg_count;
2435   } else {
2436      ++reg_offset;
2437      va += 4;
2438   }
2439   if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2440      ++reg_count;
2441
2442   uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
2443
2444   if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2445      radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
2446      radeon_emit(cs, va);
2447      radeon_emit(cs, va >> 32);
2448      radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2449      radeon_emit(cs, reg_count);
2450   } else {
2451      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2452      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2453                         (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
2454      radeon_emit(cs, va);
2455      radeon_emit(cs, va >> 32);
2456      radeon_emit(cs, reg >> 2);
2457      radeon_emit(cs, 0);
2458
2459      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
2460      radeon_emit(cs, 0);
2461   }
2462}
2463
2464/*
2465 * With DCC some colors don't require CMASK elimination before being
2466 * used as a texture. This sets a predicate value to determine if the
2467 * cmask eliminate is required.
2468 */
2469void
2470radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2471                         const VkImageSubresourceRange *range, bool value)
2472{
2473   if (!image->fce_pred_offset)
2474      return;
2475
2476   uint64_t pred_val = value;
2477   uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
2478   uint32_t level_count = radv_get_levelCount(image, range);
2479   uint32_t count = 2 * level_count;
2480
2481   radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2482   radeon_emit(cmd_buffer->cs,
2483               S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2484   radeon_emit(cmd_buffer->cs, va);
2485   radeon_emit(cmd_buffer->cs, va >> 32);
2486
2487   for (uint32_t l = 0; l < level_count; l++) {
2488      radeon_emit(cmd_buffer->cs, pred_val);
2489      radeon_emit(cmd_buffer->cs, pred_val >> 32);
2490   }
2491}
2492
2493/**
2494 * Update the DCC predicate to reflect the compression state.
2495 */
2496void
2497radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2498                         const VkImageSubresourceRange *range, bool value)
2499{
2500   if (image->dcc_pred_offset == 0)
2501      return;
2502
2503   uint64_t pred_val = value;
2504   uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
2505   uint32_t level_count = radv_get_levelCount(image, range);
2506   uint32_t count = 2 * level_count;
2507
2508   assert(radv_dcc_enabled(image, range->baseMipLevel));
2509
2510   radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2511   radeon_emit(cmd_buffer->cs,
2512               S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2513   radeon_emit(cmd_buffer->cs, va);
2514   radeon_emit(cmd_buffer->cs, va >> 32);
2515
2516   for (uint32_t l = 0; l < level_count; l++) {
2517      radeon_emit(cmd_buffer->cs, pred_val);
2518      radeon_emit(cmd_buffer->cs, pred_val >> 32);
2519   }
2520}
2521
2522/**
2523 * Update the fast clear color values if the image is bound as a color buffer.
2524 */
2525static void
2526radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2527                                   int cb_idx, uint32_t color_values[2])
2528{
2529   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2530   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2531   uint32_t att_idx;
2532
2533   if (!cmd_buffer->state.attachments || !subpass)
2534      return;
2535
2536   att_idx = subpass->color_attachments[cb_idx].attachment;
2537   if (att_idx == VK_ATTACHMENT_UNUSED)
2538      return;
2539
2540   if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2541      return;
2542
2543   radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
2544   radeon_emit(cs, color_values[0]);
2545   radeon_emit(cs, color_values[1]);
2546
2547   cmd_buffer->state.context_roll_without_scissor_emitted = true;
2548}
2549
2550/**
2551 * Set the clear color values to the image's metadata.
2552 */
2553static void
2554radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2555                              const VkImageSubresourceRange *range, uint32_t color_values[2])
2556{
2557   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2558   uint32_t level_count = radv_get_levelCount(image, range);
2559   uint32_t count = 2 * level_count;
2560
2561   assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
2562
2563   if (radv_image_has_clear_value(image)) {
2564      uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
2565
2566      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
2567      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2568      radeon_emit(cs, va);
2569      radeon_emit(cs, va >> 32);
2570
2571      for (uint32_t l = 0; l < level_count; l++) {
2572         radeon_emit(cs, color_values[0]);
2573         radeon_emit(cs, color_values[1]);
2574      }
2575   } else {
2576      /* The image has no clear value slot; only the default (zero) clear value is supported. */
2577      assert(color_values[0] == 0 && color_values[1] == 0);
2578   }
2579}
2580
2581/**
2582 * Update the clear color values for this image.
2583 */
2584void
2585radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2586                                 const struct radv_image_view *iview, int cb_idx,
2587                                 uint32_t color_values[2])
2588{
2589   struct radv_image *image = iview->image;
2590   VkImageSubresourceRange range = {
2591      .aspectMask = iview->vk.aspects,
2592      .baseMipLevel = iview->vk.base_mip_level,
2593      .levelCount = iview->vk.level_count,
2594      .baseArrayLayer = iview->vk.base_array_layer,
2595      .layerCount = iview->vk.layer_count,
2596   };
2597
2598   assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level));
2599
2600   /* There is no need to update the clear value for images that are fast cleared with the comp-to-single
2601    * mode because the hardware gets the value from the image directly.
2602    */
2603   if (iview->image->support_comp_to_single)
2604      return;
2605
2606   radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
2607
2608   radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2609}
2610
2611/**
2612 * Load the clear color values from the image's metadata.
2613 */
2614static void
2615radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview,
2616                               int cb_idx)
2617{
2618   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2619   struct radv_image *image = iview->image;
2620
2621   if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level))
2622      return;
2623
2624   if (iview->image->support_comp_to_single)
2625      return;
2626
2627   if (!radv_image_has_clear_value(image)) {
2628      uint32_t color_values[2] = {0, 0};
2629      radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2630      return;
2631   }
2632
2633   uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level);
2634   uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
2635
2636   if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2637      radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
2638      radeon_emit(cs, va);
2639      radeon_emit(cs, va >> 32);
2640      radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2641      radeon_emit(cs, 2);
2642   } else {
2643      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
2644      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2645                         COPY_DATA_COUNT_SEL);
2646      radeon_emit(cs, va);
2647      radeon_emit(cs, va >> 32);
2648      radeon_emit(cs, reg >> 2);
2649      radeon_emit(cs, 0);
2650
2651      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
2652      radeon_emit(cs, 0);
2653   }
2654}
2655
2656/* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
2657 * broken if the CB caches data of multiple mips of the same image at the
2658 * same time.
2659 *
2660 * Insert some flushes to avoid this.
2661 */
2662static void
2663radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
2664{
2665   struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2666   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2667   bool color_mip_changed = false;
2668
2669   /* Entire workaround is not applicable before GFX9 */
2670   if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
2671      return;
2672
2673   if (!framebuffer)
2674      return;
2675
2676   for (int i = 0; i < subpass->color_count; ++i) {
2677      int idx = subpass->color_attachments[i].attachment;
2678      if (idx == VK_ATTACHMENT_UNUSED)
2679         continue;
2680
2681      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2682
2683      if ((radv_image_has_CB_metadata(iview->image) ||
2684           radv_dcc_enabled(iview->image, iview->vk.base_mip_level) ||
2685           radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
2686          cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level)
2687         color_mip_changed = true;
2688
2689      cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level;
2690   }
2691
2692   if (color_mip_changed) {
2693      cmd_buffer->state.flush_bits |=
2694         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2695   }
2696}
2697
2698/* This function emits the flushes for mip changes if the mip level is non-zero for
2699 * any render target. This way we can assume at the start of the next cmd_buffer
2700 * that rendering to mip 0 doesn't need any flushes. Since that is the most common
2701 * case, this saves some flushes. */
2702static void
2703radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
2704{
2705   /* Entire workaround is not applicable before GFX9 */
2706   if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
2707      return;
2708
2709   bool need_color_mip_flush = false;
2710   for (unsigned i = 0; i < 8; ++i) {
2711      if (cmd_buffer->state.cb_mip[i]) {
2712         need_color_mip_flush = true;
2713         break;
2714      }
2715   }
2716
2717   if (need_color_mip_flush) {
2718      cmd_buffer->state.flush_bits |=
2719         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2720   }
2721
2722   memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
2723}
2724
2725static struct radv_image *
2726radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
2727{
2728   struct radv_device *device = cmd_buffer->device;
2729
2730   if (!device->vrs.image) {
2731      VkResult result;
2732
2733      /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
2734      result = radv_device_init_vrs_state(device);
2735      if (result != VK_SUCCESS) {
2736         cmd_buffer->record_result = result;
2737         return NULL;
2738      }
2739   }
2740
2741   return device->vrs.image;
2742}
2743
2744static void
2745radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
2746{
2747   int i;
2748   struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2749   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2750   bool disable_constant_encode_ac01 = false;
2751   unsigned color_invalid = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
2752                            ? S_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)
2753                            : S_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID);
2754
2755   for (i = 0; i < subpass->color_count; ++i) {
2756      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
2757         radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
2758         continue;
2759      }
2760
2761      int idx = subpass->color_attachments[i].attachment;
2762      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2763      VkImageLayout layout = subpass->color_attachments[i].layout;
2764      bool in_render_loop = subpass->color_attachments[i].in_render_loop;
2765
2766      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[0].bo);
2767
2768      assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
2769                                   VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
2770
2771      if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
2772         for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) {
2773            radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2774                  iview->image->bindings[plane_id].bo);
2775         }
2776      } else {
2777         uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
2778         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2779               iview->image->bindings[plane_id].bo);
2780      }
2781
2782      radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout,
2783                               in_render_loop);
2784
2785      radv_load_color_clear_metadata(cmd_buffer, iview, i);
2786
2787      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
2788          iview->image->dcc_sign_reinterpret) {
2789         /* Disable constant encoding with the clear value of "1" when the DCC signedness is
2790          * reinterpreted, because the hardware would return "1" instead of the actual clear value.
2791          */
2792         disable_constant_encode_ac01 = true;
2793      }
2794   }
2795   for (; i < cmd_buffer->state.last_subpass_color_count; i++) {
2796      radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
2797   }
2798   cmd_buffer->state.last_subpass_color_count = subpass->color_count;
2799
2800   if (subpass->depth_stencil_attachment) {
2801      int idx = subpass->depth_stencil_attachment->attachment;
2802      VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2803      bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2804      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2805      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2806                         cmd_buffer->state.attachments[idx].iview->image->bindings[0].bo);
2807
2808      radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout,
2809                            in_render_loop);
2810
2811      if (radv_layout_is_htile_compressed(
2812             cmd_buffer->device, iview->image, layout, in_render_loop,
2813             radv_image_queue_family_mask(iview->image, cmd_buffer->qf,
2814                                          cmd_buffer->qf))) {
2815         /* Only load the depth/stencil fast clear values when
2816          * compressed rendering is enabled.
2817          */
2818         radv_load_ds_clear_metadata(cmd_buffer, iview);
2819      }
2820   } else if (subpass->vrs_attachment && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
2821      /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
2822       * bind our internal depth buffer that contains the VRS data as part of HTILE.
2823       */
2824      VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
2825      struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
2826      struct radv_image *image = cmd_buffer->device->vrs.image;
2827      struct radv_ds_buffer_info ds;
2828      struct radv_image_view iview;
2829
2830      radv_image_view_init(&iview, cmd_buffer->device,
2831                           &(VkImageViewCreateInfo){
2832                              .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2833                              .image = radv_image_to_handle(image),
2834                              .viewType = radv_meta_get_view_type(image),
2835                              .format = image->vk.format,
2836                              .subresourceRange =
2837                                 {
2838                                    .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
2839                                    .baseMipLevel = 0,
2840                                    .levelCount = 1,
2841                                    .baseArrayLayer = 0,
2842                                    .layerCount = 1,
2843                                 },
2844                           },
2845                           0, NULL);
2846
2847      radv_initialise_vrs_surface(image, htile_buffer, &ds);
2848
2849      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);
2850
2851      radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false);
2852
2853      radv_image_view_finish(&iview);
2854   } else {
2855      unsigned num_samples = 0;
2856
2857      /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match the framebuffer samples. It affects
2858       * VRS and occlusion queries if depth and stencil are not bound.
2859       */
2860      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX11)
2861         num_samples = util_logbase2(subpass->max_sample_count);
2862
2863      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9)
2864         radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
2865      else
2866         radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
2867
2868      radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID) |       /* DB_Z_INFO */
2869                                  S_028040_NUM_SAMPLES(num_samples));
2870      radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
2871   }
2872   radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
2873                          S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));
2874
2875   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8) {
2876      bool disable_constant_encode =
2877         cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
2878      enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
2879      uint8_t watermark = gfx_level >= GFX10 ? 6 : 4;
2880
2881      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2882         radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL,
2883                                S_028424_SAMPLE_MASK_TRACKER_WATERMARK(watermark));
2884      } else {
2885         radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
2886                                S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) |
2887                                S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
2888                                S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) |
2889                                S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
2890      }
2891   }
2892
2893   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
2894}
2895
2896static void
2897radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
2898{
2899   struct radeon_cmdbuf *cs = cmd_buffer->cs;
2900   struct radv_cmd_state *state = &cmd_buffer->state;
2901
2902   /* With indirectly generated commands the index buffer bind may be part of the
2903    * indirect command buffer, in which case the app may not have bound an index buffer yet. */
2904   if (state->index_type < 0)
2905      return;
2906
2907   /* For the direct indexed draws we use DRAW_INDEX_2, which includes
2908    * the index_va and max_index_count already. */
2909   if (!indirect)
2910      return;
2911
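   /* On chips with has_zero_index_buffer_bug, emitting an index buffer with
    * max_index_count == 0 is presumably problematic, so the INDEX_BASE /
    * INDEX_BUFFER_SIZE packets are skipped entirely in that case.
    */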
2912   if (state->max_index_count ||
2913       !cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) {
2914      radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2915      radeon_emit(cs, state->index_va);
2916      radeon_emit(cs, state->index_va >> 32);
2917
2918      radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2919      radeon_emit(cs, state->max_index_count);
2920   }
2921
2922   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2923}
2924
2925void
2926radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer, bool enable_occlusion_queries)
2927{
2928   bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
2929   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
2930   uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->ms.pa_sc_mode_cntl_1 : 0;
2931   uint32_t db_count_control;
2932
2933   if (!enable_occlusion_queries) {
2934      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
2935         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2936             pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2937            /* Re-enable out-of-order rasterization if the
2938             * bound pipeline supports it and if it has
2939             * been disabled before starting any perfect
2940             * occlusion queries.
2941             */
2942            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2943         }
2944      }
2945      db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
2946   } else {
2947      const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2948      uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
2949      bool gfx10_perfect =
2950         cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10 && has_perfect_queries;
2951
2952      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
2953         /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
2954          * covered tiles, discards, and early depth testing. For more details,
2955          * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
2956         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
2957                            S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
2958                            S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
2959                            S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
2960
2961         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2962             pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2963            /* If the bound pipeline has enabled
2964             * out-of-order rasterization, we should
2965             * disable it before starting any perfect
2966             * occlusion queries.
2967             */
2968            pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
2969
2970            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2971         }
2972      } else {
2973         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
2974      }
2975   }
2976
2977   radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
2978
2979   cmd_buffer->state.context_roll_without_scissor_emitted = true;
2980}
2981
2982unsigned
2983radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
2984{
2985   /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes, i.e. a
2986    * single array sorted in ascending order by:
2987    * - total number of attributes
2988    * - number of instanced attributes
2989    * - index of first instanced attribute
2990    */
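   /* Worked example (following the arithmetic below): num_attributes = 4 and
    * instance_rate_inputs = 0b0110 gives start_index = total_to_offset[3] = 10,
    * count = 2, offset_from_start_index = count_to_offset_total16[1] - (16 - 4) * 1
    * = 16 - 12 = 4 and first = 1, so the prolog index is 10 + 4 + 1 = 15.
    */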
2991
2992   /* From total number of attributes to offset. */
2993   static const uint16_t total_to_offset[16] = {0,   1,   4,   10,  20,  35,  56,  84,
2994                                                120, 165, 220, 286, 364, 455, 560, 680};
2995   unsigned start_index = total_to_offset[num_attributes - 1];
2996
2997   /* From number of instanced attributes to offset. This would require a different LUT depending on
2998    * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
2999    * attributes.
3000    */
3001   static const uint8_t count_to_offset_total16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
3002                                                       100, 108, 115, 121, 126, 130, 133, 135};
3003   unsigned count = util_bitcount(instance_rate_inputs);
3004   unsigned offset_from_start_index =
3005      count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
3006
3007   unsigned first = ffs(instance_rate_inputs) - 1;
3008   return start_index + offset_from_start_index + first;
3009}
3010
3011union vs_prolog_key_header {
3012   struct {
3013      uint32_t key_size : 8;
3014      uint32_t num_attributes : 6;
3015      uint32_t as_ls : 1;
3016      uint32_t is_ngg : 1;
3017      uint32_t wave32 : 1;
3018      uint32_t next_stage : 3;
3019      uint32_t instance_rate_inputs : 1;
3020      uint32_t alpha_adjust_lo : 1;
3021      uint32_t alpha_adjust_hi : 1;
3022      uint32_t misaligned_mask : 1;
3023      uint32_t post_shuffle : 1;
3024      uint32_t nontrivial_divisors : 1;
3025      uint32_t zero_divisors : 1;
3026      /* We need this to ensure the padding is zero. It's useful even if it's unused. */
3027      uint32_t padding0 : 5;
3028   };
3029   uint32_t v;
3030};
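/* A VS prolog key is a variable-length array of dwords: key_words[0] holds this
 * header (with key_size giving the total key size in bytes) and the remaining
 * words are appended only when the corresponding header bit is set, in the order
 * they are built in lookup_vs_prolog(). radv_hash_vs_prolog() and
 * radv_cmp_vs_prolog() below rely on key_size to hash/compare the whole key.
 */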
3031
3032uint32_t
3033radv_hash_vs_prolog(const void *key_)
3034{
3035   const uint32_t *key = key_;
3036   union vs_prolog_key_header header;
3037   header.v = key[0];
3038   return _mesa_hash_data(key, header.key_size);
3039}
3040
3041bool
3042radv_cmp_vs_prolog(const void *a_, const void *b_)
3043{
3044   const uint32_t *a = a_;
3045   const uint32_t *b = b_;
3046   if (a[0] != b[0])
3047      return false;
3048
3049   union vs_prolog_key_header header;
3050   header.v = a[0];
3051   return memcmp(a, b, header.key_size) == 0;
3052}
3053
3054static struct radv_shader_part *
3055lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
3056                 uint32_t *nontrivial_divisors)
3057{
3058   STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
3059   assert(vs_shader->info.vs.dynamic_inputs);
3060
3061   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
3062   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3063   struct radv_device *device = cmd_buffer->device;
3064
3065   unsigned num_attributes = pipeline->last_vertex_attrib_bit;
3066   uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
3067
3068   uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
3069   uint32_t zero_divisors = state->zero_divisors & attribute_mask;
3070   *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
3071   uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask;
3072   if (cmd_buffer->state.vbo_misaligned_mask_invalid) {
3073      assert(device->physical_device->rad_info.gfx_level == GFX6 ||
3074             device->physical_device->rad_info.gfx_level >= GFX10);
3075
3076      u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) {
3077         uint8_t binding = state->bindings[index];
3078         if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding)))
3079            continue;
3080         uint8_t req = state->format_align_req_minus_1[index];
3081         struct radv_vertex_binding *vb = &cmd_buffer->vertex_bindings[binding];
3082         VkDeviceSize offset = vb->offset + state->offsets[index];
3083         if ((offset & req) || (vb->stride & req))
3084            misaligned_mask |= BITFIELD_BIT(index);
3085      }
3086      cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
3087      cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
3088   }
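   /* Example of the check above: a format with a 4-byte alignment requirement has
    * format_align_req_minus_1 == 3, so the attribute is flagged as misaligned when
    * either (vb->offset + attribute offset) or the binding stride has any of the
    * low two bits set.
    */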
3089
3090   /* try to use a pre-compiled prolog first */
3091   struct radv_shader_part *prolog = NULL;
3092   if (pipeline->can_use_simple_input &&
3093       (!vs_shader->info.vs.as_ls || !instance_rate_inputs) &&
3094       !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
3095      if (!instance_rate_inputs) {
3096         prolog = device->simple_vs_prologs[num_attributes - 1];
3097      } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors &&
3098                 util_bitcount(instance_rate_inputs) ==
3099                    (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
3100         unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
3101         prolog = device->instance_rate_vs_prologs[index];
3102      }
3103   }
3104   if (prolog)
3105      return prolog;
3106
3107   /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
3108   uint32_t key_words[17];
3109   unsigned key_size = 1;
3110
3111   struct radv_vs_prolog_key key;
3112   key.state = state;
3113   key.num_attributes = num_attributes;
3114   key.misaligned_mask = misaligned_mask;
3115   /* The instance ID input VGPR is placed differently when as_ls=true. */
3116   key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
3117   key.is_ngg = vs_shader->info.is_ngg;
3118   key.wave32 = vs_shader->info.wave_size == 32;
3119   key.next_stage = pipeline->next_vertex_stage;
3120
3121   union vs_prolog_key_header header;
3122   header.v = 0;
3123   header.num_attributes = num_attributes;
3124   header.as_ls = key.as_ls;
3125   header.is_ngg = key.is_ngg;
3126   header.wave32 = key.wave32;
3127   header.next_stage = key.next_stage;
3128
3129   if (instance_rate_inputs & ~*nontrivial_divisors) {
3130      header.instance_rate_inputs = true;
3131      key_words[key_size++] = instance_rate_inputs;
3132   }
3133   if (*nontrivial_divisors) {
3134      header.nontrivial_divisors = true;
3135      key_words[key_size++] = *nontrivial_divisors;
3136   }
3137   if (zero_divisors) {
3138      header.zero_divisors = true;
3139      key_words[key_size++] = zero_divisors;
3140   }
3141   if (misaligned_mask) {
3142      header.misaligned_mask = true;
3143      key_words[key_size++] = misaligned_mask;
3144
3145      uint8_t *formats = (uint8_t *)&key_words[key_size];
3146      unsigned num_formats = 0;
3147      u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
3148      while (num_formats & 0x3)
3149         formats[num_formats++] = 0;
3150      key_size += num_formats / 4u;
3151
3152      if (state->post_shuffle & attribute_mask) {
3153         header.post_shuffle = true;
3154         key_words[key_size++] = state->post_shuffle & attribute_mask;
3155      }
3156   }
3157   if (state->alpha_adjust_lo & attribute_mask) {
3158      header.alpha_adjust_lo = true;
3159      key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
3160   }
3161   if (state->alpha_adjust_hi & attribute_mask) {
3162      header.alpha_adjust_hi = true;
3163      key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
3164   }
3165
3166   header.key_size = key_size * sizeof(key_words[0]);
3167   key_words[0] = header.v;
3168
3169   uint32_t hash = radv_hash_vs_prolog(key_words);
3170
3171   if (cmd_buffer->state.emitted_vs_prolog &&
3172       cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
3173       radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
3174      return cmd_buffer->state.emitted_vs_prolog;
3175
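   /* Double-checked lookup: search the cache under the read lock first, and only if
    * the key is missing take the write lock, search again (another thread may have
    * inserted it meanwhile) and then compile and insert the new prolog.
    */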
3176   u_rwlock_rdlock(&device->vs_prologs_lock);
3177   struct hash_entry *prolog_entry =
3178      _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
3179   u_rwlock_rdunlock(&device->vs_prologs_lock);
3180
3181   if (!prolog_entry) {
3182      u_rwlock_wrlock(&device->vs_prologs_lock);
3183      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
3184      if (prolog_entry) {
3185         u_rwlock_wrunlock(&device->vs_prologs_lock);
3186         return prolog_entry->data;
3187      }
3188
3189      prolog = radv_create_vs_prolog(device, &key);
3190      uint32_t *key2 = malloc(key_size * 4);
3191      if (!prolog || !key2) {
3192         radv_shader_part_destroy(device, prolog);
3193         free(key2);
3194         u_rwlock_wrunlock(&device->vs_prologs_lock);
3195         return NULL;
3196      }
3197      memcpy(key2, key_words, key_size * 4);
3198      _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);
3199
3200      u_rwlock_wrunlock(&device->vs_prologs_lock);
3201      return prolog;
3202   }
3203
3204   return prolog_entry->data;
3205}
3206
3207static void
3208emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
3209                 struct radv_shader_part *prolog, bool pipeline_is_dirty)
3210{
3211   /* no need to re-emit anything in this case */
3212   if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
3213      return;
3214
3215   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
3216   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3217   uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;
3218
3219   assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline);
3220
3221   uint32_t rsrc1 = vs_shader->config.rsrc1;
3222   if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
3223      rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
3224
3225   /* The main shader must not use less VGPRs than the prolog, otherwise shared vgprs might not
3226    * work.
3227    */
3228   assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));
3229
3230   unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
3231   unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
3232   if (vs_shader->info.is_ngg || pipeline->base.shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
3233      pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
3234      rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
3235   } else if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
3236      pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
3237      rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
3238   } else if (vs_shader->info.vs.as_ls) {
3239      pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
3240      rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
3241   } else if (vs_shader->info.vs.as_es) {
3242      pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
3243      rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
3244   }
3245
3246   radeon_set_sh_reg(cmd_buffer->cs, pgm_lo_reg, prolog_va >> 8);
3247
3248   if (chip < GFX10)
3249      radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
3250   else
3251      assert(rsrc1 == vs_shader->config.rsrc1);
3252
3253   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
3254}
3255
3256static void
3257emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
3258                   uint32_t nontrivial_divisors, bool pipeline_is_dirty)
3259{
3260   /* no need to re-emit anything in this case */
3261   if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog &&
3262       !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
3263      return;
3264
3265   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
3266   uint64_t input_va = radv_shader_get_va(vs_shader);
3267
3268   if (nontrivial_divisors) {
3269      unsigned inputs_offset;
3270      uint32_t *inputs;
3271      unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
3272      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
3273         return;
3274
3275      *(inputs++) = input_va;
3276      *(inputs++) = input_va >> 32;
3277
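      /* The rest of the inputs buffer holds two dwords per nontrivial divisor:
       * a zero divisor is encoded as {0, 1}, a power-of-two divisor as
       * {log2(div) | (1 << 8), 0xffffffff}, and any other divisor as the
       * util_fast_udiv parameters (pre_shift/increment/post_shift packed into the
       * first dword, the magic multiplier in the second).
       */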
3278      u_foreach_bit(index, nontrivial_divisors)
3279      {
3280         uint32_t div = state->divisors[index];
3281         if (div == 0) {
3282            *(inputs++) = 0;
3283            *(inputs++) = 1;
3284         } else if (util_is_power_of_two_or_zero(div)) {
3285            *(inputs++) = util_logbase2(div) | (1 << 8);
3286            *(inputs++) = 0xffffffffu;
3287         } else {
3288            struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
3289            *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
3290            *(inputs++) = info.multiplier;
3291         }
3292      }
3293
3294      input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
3295   }
3296
3297   struct radv_userdata_info *loc =
3298      &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
3299   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->base.user_data_0[MESA_SHADER_VERTEX];
3300   assert(loc->sgpr_idx != -1);
3301   assert(loc->num_sgprs == 2);
3302   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3303                            input_va, true);
3304}
3305
3306static void
3307radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3308{
3309   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3310   struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
3311
3312   assert(!cmd_buffer->state.mesh_shading);
3313
3314   if (!vs_shader->info.vs.has_prolog)
3315      return;
3316
3317   uint32_t nontrivial_divisors;
3318   struct radv_shader_part *prolog =
3319      lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
3320   if (!prolog) {
3321      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3322      return;
3323   }
3324   emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
3325   emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
3326
3327   cmd_buffer->state.emitted_vs_prolog = prolog;
3328
3329   if (unlikely(cmd_buffer->device->trace_bo))
3330      radv_save_vs_prolog(cmd_buffer, prolog);
3331}
3332
3333static void
3334radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3335{
3336   uint64_t states =
3337      cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state;
3338
3339   if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
3340      radv_emit_viewport(cmd_buffer);
3341
3342   if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
3343       !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
3344      radv_emit_scissor(cmd_buffer);
3345
3346   if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
3347      radv_emit_line_width(cmd_buffer);
3348
3349   if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
3350      radv_emit_blend_constants(cmd_buffer);
3351
3352   if (states &
3353       (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
3354        RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
3355      radv_emit_stencil(cmd_buffer);
3356
3357   if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
3358      radv_emit_depth_bounds(cmd_buffer);
3359
3360   if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
3361      radv_emit_depth_bias(cmd_buffer);
3362
3363   if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
3364      radv_emit_discard_rectangle(cmd_buffer);
3365
3366   if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
3367      radv_emit_sample_locations(cmd_buffer);
3368
3369   if (states & (RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE))
3370      radv_emit_line_stipple(cmd_buffer);
3371
3372   if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
3373                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE))
3374      radv_emit_culling(cmd_buffer, states);
3375
3376   if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
3377      radv_emit_primitive_topology(cmd_buffer);
3378
3379   if (states &
3380       (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
3381        RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
3382        RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
3383      radv_emit_depth_control(cmd_buffer, states);
3384
3385   if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
3386      radv_emit_stencil_control(cmd_buffer);
3387
3388   if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
3389      radv_emit_fragment_shading_rate(cmd_buffer);
3390
3391   if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
3392      radv_emit_primitive_restart_enable(cmd_buffer);
3393
3394   if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
3395      radv_emit_rasterizer_discard_enable(cmd_buffer);
3396
3397   if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP)
3398      radv_emit_logic_op(cmd_buffer);
3399
3400   if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
3401      radv_emit_color_write_enable(cmd_buffer);
3402
3403   if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
3404      radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty);
3405
3406   cmd_buffer->state.dirty &= ~states;
3407}
3408
3409static void
3410radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
3411{
3412   struct radv_descriptor_state *descriptors_state =
3413      radv_get_descriptors_state(cmd_buffer, bind_point);
3414   struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
3415   unsigned bo_offset;
3416
3417   if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr,
3418                                    &bo_offset))
3419      return;
3420
3421   set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3422   set->header.va += bo_offset;
3423}
3424
3425static void
3426radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
3427                                    struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3428{
3429   struct radv_descriptor_state *descriptors_state =
3430      radv_get_descriptors_state(cmd_buffer, bind_point);
3431   uint32_t size = MAX_SETS * 4;
3432   uint32_t offset;
3433   void *ptr;
3434
3435   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
3436      return;
3437
3438   for (unsigned i = 0; i < MAX_SETS; i++) {
3439      uint32_t *uptr = ((uint32_t *)ptr) + i;
3440      uint64_t set_va = 0;
3441      struct radv_descriptor_set *set = descriptors_state->sets[i];
3442      if (descriptors_state->valid & (1u << i))
3443         set_va = set->header.va;
3444      uptr[0] = set_va & 0xffffffff;
3445   }
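   /* Only the lower 32 bits of each set VA are stored; descriptor sets are
    * presumably allocated in 32-bit addressable memory so the upper half is a
    * known constant on the shader side.
    */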
3446
3447   struct radeon_cmdbuf *cs = cmd_buffer->cs;
3448   struct radv_device *device = cmd_buffer->device;
3449   uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3450   va += offset;
3451
3452   if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
3453      struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
3454
3455      if (pipeline->shaders[MESA_SHADER_VERTEX])
3456         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_VERTEX,
3457                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3458
3459      if (pipeline->shaders[MESA_SHADER_FRAGMENT])
3460         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_FRAGMENT,
3461                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3462
3463      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH))
3464         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_MESH,
3465                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3466
3467      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK))
3468         radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
3469                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3470
3471      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_GEOMETRY))
3472         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_GEOMETRY,
3473                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3474
3475      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
3476         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_CTRL,
3477                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3478
3479      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_EVAL))
3480         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_EVAL,
3481                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3482   } else {
3483      radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_COMPUTE,
3484                                 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3485   }
3486}
3487
3488static void
3489radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3490                       struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3491{
3492   struct radv_descriptor_state *descriptors_state =
3493      radv_get_descriptors_state(cmd_buffer, bind_point);
3494   struct radv_device *device = cmd_buffer->device;
3495   struct radeon_cmdbuf *cs = cmd_buffer->cs;
3496   bool flush_indirect_descriptors;
3497
3498   if (!descriptors_state->dirty)
3499      return;
3500
3501   if (descriptors_state->push_dirty)
3502      radv_flush_push_descriptors(cmd_buffer, bind_point);
3503
3504   flush_indirect_descriptors = pipeline->need_indirect_descriptor_sets;
3505
3506   if (flush_indirect_descriptors)
3507      radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point);
3508
3509   ASSERTED unsigned cdw_max =
3510      radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4);
3511
3512   if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
3513      radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
3514   } else {
3515      radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
3516      {
3517         if (!cmd_buffer->state.graphics_pipeline->base.shaders[stage])
3518            continue;
3519
3520         radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, stage);
3521      }
3522
3523      if (stages & VK_SHADER_STAGE_TASK_BIT_NV) {
3524         radv_emit_descriptor_pointers(device, cmd_buffer->ace_internal.cs, pipeline,
3525                                       descriptors_state, MESA_SHADER_TASK);
3526      }
3527   }
3528
3529   descriptors_state->dirty = 0;
3530   descriptors_state->push_dirty = false;
3531
3532   assert(cmd_buffer->cs->cdw <= cdw_max);
3533
3534   if (unlikely(cmd_buffer->device->trace_bo))
3535      radv_save_descriptors(cmd_buffer, bind_point);
3536}
3537
3538static bool
3539radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage)
3540{
3541   struct radv_userdata_info *loc =
3542      radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS);
3543   return loc->sgpr_idx != -1;
3544}
3545
3546static void
3547radv_emit_all_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
3548                                 struct radv_pipeline *pipeline, gl_shader_stage stage,
3549                                 uint32_t *values, bool *need_push_constants)
3550{
3551   const struct radv_shader *shader = radv_get_shader(pipeline, stage);
3552   if (!shader)
3553      return;
3554
3555   *need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);
3556
3557   const uint64_t mask = shader->info.inline_push_constant_mask;
3558   if (!mask)
3559      return;
3560
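   /* For example, a mask of 0b111100 is a single consecutive run (base = 2), so the
    * values can be emitted directly starting at values[2]; a mask like 0b1001 takes
    * the sparse path below and gathers the selected dwords into a temporary array.
    */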
3561   const uint8_t base = ffs(mask) - 1;
3562   if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
3563      /* consecutive inline push constants */
3564      radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
3565                                   values + base);
3566   } else {
3567      /* sparse inline push constants */
3568      uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
3569      unsigned num_consts = 0;
3570      u_foreach_bit64 (idx, mask)
3571         consts[num_consts++] = values[idx];
3572      radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
3573                                   consts);
3574   }
3575}
3576
3577static void
3578radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3579                     struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3580{
3581   struct radv_device *device = cmd_buffer->device;
3582   struct radeon_cmdbuf *cs = cmd_buffer->cs;
3583   struct radv_descriptor_state *descriptors_state =
3584      radv_get_descriptors_state(cmd_buffer, bind_point);
3585   struct radv_shader *shader, *prev_shader;
3586   bool need_push_constants = false;
3587   unsigned offset;
3588   void *ptr;
3589   uint64_t va;
3590   uint32_t internal_stages;
3591   uint32_t dirty_stages = 0;
3592
3593   stages &= cmd_buffer->push_constant_stages;
3594   if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count))
3595      return;
3596
3597   internal_stages = stages;
3598   switch (bind_point) {
3599   case VK_PIPELINE_BIND_POINT_GRAPHICS:
3600      break;
3601   case VK_PIPELINE_BIND_POINT_COMPUTE:
3602      dirty_stages = RADV_RT_STAGE_BITS;
3603      break;
3604   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
3605      internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3606      dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3607      break;
3608   default:
3609      unreachable("Unhandled bind point");
3610   }
3611
3612   radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
3613   {
3614      radv_emit_all_inline_push_consts(
3615         device, cs, pipeline, stage, (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
3616   }
3617
3618   if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
3619      radv_emit_all_inline_push_consts(device, cmd_buffer->ace_internal.cs, pipeline,
3620                                       MESA_SHADER_TASK, (uint32_t *)cmd_buffer->push_constants,
3621                                       &need_push_constants);
3622   }
3623
3624   if (need_push_constants) {
3625      if (!radv_cmd_buffer_upload_alloc(
3626             cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset,
3627             &ptr))
3628         return;
3629
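      /* Upload layout: the raw push constants first, followed by 16 bytes per
       * dynamic buffer descriptor, so both are reachable from the single
       * AC_UD_PUSH_CONSTANTS pointer emitted below.
       */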
3630      memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size);
3631      memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers,
3632             16 * pipeline->dynamic_offset_count);
3633
3634      va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3635      va += offset;
3636
3637      ASSERTED unsigned cdw_max =
3638         radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4);
3639
3640      prev_shader = NULL;
3641      radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
3642      {
3643         shader = radv_get_shader(pipeline, stage);
3644
3645         /* Avoid redundantly emitting the address for merged stages. */
3646         if (shader && shader != prev_shader) {
3647            radv_emit_userdata_address(device, cs, pipeline, stage, AC_UD_PUSH_CONSTANTS, va);
3648
3649            prev_shader = shader;
3650         }
3651      }
3652
3653      if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
3654         radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
3655                                    AC_UD_PUSH_CONSTANTS, va);
3656      }
3657
3658      assert(cmd_buffer->cs->cdw <= cdw_max);
3659   }
3660
3661   cmd_buffer->push_constant_stages &= ~stages;
3662   cmd_buffer->push_constant_stages |= dirty_stages;
3663}
3664
3665enum radv_dst_sel {
3666   DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
3667                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3668   DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
3669                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3670   DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3671                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3672   DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3673                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3674   DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3675                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
3676   DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3677                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
3678};
3679
3680static const uint32_t data_format_dst_sel[] = {
3681   [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
3682   [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
3683   [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
3684   [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
3685   [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
3686   [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
3687   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
3688   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
3689   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
3690   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
3691   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
3692   [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
3693   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
3694   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
3695   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
3696};
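/* The table above fills the components missing from the data format with 0 and the
 * alpha channel with 1, e.g. a two-channel format such as 16_16 reads X/Y from
 * memory and returns (x, y, 0, 1) to the shader.
 */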
3697
3698void
3699radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
3700                              const struct radv_graphics_pipeline *pipeline,
3701                              bool full_null_descriptors, void *vb_ptr)
3702{
3703   struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
3704   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
3705   unsigned desc_index = 0;
3706   uint32_t mask = pipeline->vb_desc_usage_mask;
3707   uint64_t va;
3708   const struct radv_vs_input_state *vs_state =
3709      vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
3710   assert(!vs_state || pipeline->use_per_attribute_vb_descs);
3711
3712   while (mask) {
3713      unsigned i = u_bit_scan(&mask);
3714      uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
3715      uint32_t offset, rsrc_word3;
3716      unsigned binding =
3717         vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
3718                  : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
3719      struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
3720      unsigned num_records;
3721      unsigned stride;
3722
3723      if (vs_state) {
3724         unsigned format = vs_state->formats[i];
3725         unsigned dfmt = format & 0xf;
3726         unsigned nfmt = (format >> 4) & 0x7;
3727
3728         rsrc_word3 = vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
3729
3730         if (chip >= GFX10)
3731            rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
3732         else
3733            rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
3734      } else {
3735         if (chip >= GFX10)
3736            rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
3737         else
3738            rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
3739                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3740      }
3741
3742      if (pipeline->uses_dynamic_stride) {
3743         stride = cmd_buffer->vertex_bindings[binding].stride;
3744      } else {
3745         stride = pipeline->binding_stride[binding];
3746      }
3747
3748      if (!buffer) {
3749         if (full_null_descriptors) {
3750            /* Put all the info in for the DGC generation shader in case the VBO gets overridden. */
3751            desc[0] = 0;
3752            desc[1] = S_008F04_STRIDE(stride);
3753            desc[2] = 0;
3754            desc[3] = rsrc_word3;
3755         } else if (vs_state) {
3756            /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
3757             * to include the format/word3 so that the alpha channel is 1 for formats without an
3758             * alpha channel.
3759             */
3760            desc[0] = 0;
3761            desc[1] = S_008F04_STRIDE(16);
3762            desc[2] = 0;
3763            desc[3] = rsrc_word3;
3764         } else {
3765            memset(desc, 0, 4 * 4);
3766         }
3767
3768         continue;
3769      }
3770
3771      va = radv_buffer_get_va(buffer->bo);
3772
3773      offset = cmd_buffer->vertex_bindings[binding].offset;
3774      va += offset + buffer->offset;
3775      if (vs_state)
3776         va += vs_state->offsets[i];
3777
3778      if (cmd_buffer->vertex_bindings[binding].size) {
3779         num_records = cmd_buffer->vertex_bindings[binding].size;
3780      } else {
3781         num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
3782      }
3783
3784      if (pipeline->use_per_attribute_vb_descs) {
3785         uint32_t attrib_end =
3786            vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : pipeline->attrib_ends[i];
3787
3788         if (num_records < attrib_end) {
3789            num_records = 0; /* not enough space for one vertex */
3790         } else if (stride == 0) {
3791            num_records = 1; /* only one vertex */
3792         } else {
3793            num_records = (num_records - attrib_end) / stride + 1;
3794            /* If attrib_offset>stride, then the compiler will increase the vertex index by
3795             * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
3796             * only allowed with static strides.
3797             */
3798            num_records += pipeline->attrib_index_offset[i];
3799         }
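         /* Rough worked example of the math above (assuming attrib_index_offset is 0):
          * with a 100-byte binding, attrib_end == 12 and stride == 16, num_records
          * becomes (100 - 12) / 16 + 1 == 6 vertices before the per-chip conversion
          * below.
          */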
3800
3801         /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements into
3802          * bytes in that case. GFX8 always uses bytes.
3803          */
3804         if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
3805            num_records = (num_records - 1) * stride + attrib_end;
3806         } else if (!num_records) {
3807            /* On GFX9, it seems bounds checking is disabled if both
3808             * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
3809             * GFX10.3 but it doesn't hurt.
3810             */
3811            if (full_null_descriptors) {
3812               /* Put all the info in for the DGC generation shader in case the VBO gets overridden.
3813                */
3814               desc[0] = 0;
3815               desc[1] = S_008F04_STRIDE(stride);
3816               desc[2] = 0;
3817               desc[3] = rsrc_word3;
3818            } else if (vs_state) {
3819               desc[0] = 0;
3820               desc[1] = S_008F04_STRIDE(16);
3821               desc[2] = 0;
3822               desc[3] = rsrc_word3;
3823            } else {
3824               memset(desc, 0, 16);
3825            }
3826
3827            continue;
3828         }
3829      } else {
3830         if (chip != GFX8 && stride)
3831            num_records = DIV_ROUND_UP(num_records, stride);
3832      }
3833
3834      if (chip >= GFX10) {
3835         /* OOB_SELECT chooses the out-of-bounds check:
3836          * - 1: index >= NUM_RECORDS (Structured)
3837          * - 3: offset >= NUM_RECORDS (Raw)
3838          */
3839         int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
3840         rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
3841      }
3842
3843      desc[0] = va;
3844      desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
3845      desc[2] = num_records;
3846      desc[3] = rsrc_word3;
3847   }
3848}
3849
3850static void
3851radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3852{
3853   if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
3854       cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) {
3855      /* Mesh shaders don't have vertex descriptors. */
3856      assert(!cmd_buffer->state.mesh_shading);
3857
3858      struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3859      unsigned vb_offset;
3860      void *vb_ptr;
3861      uint64_t va;
3862
3863      /* allocate some descriptor state for vertex buffers */
3864      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset,
3865                                        &vb_ptr))
3866         return;
3867
3868      radv_write_vertex_descriptors(cmd_buffer, pipeline, false, vb_ptr);
3869
3870      va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3871      va += vb_offset;
3872
3873      radv_emit_userdata_address(cmd_buffer->device, cmd_buffer->cs, &pipeline->base,
3874                                 MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, va);
3875
3876      cmd_buffer->state.vb_va = va;
3877      cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
3878
3879      if (unlikely(cmd_buffer->device->trace_bo))
3880         radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
3881   }
3882   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
3883}
3884
3885static void
3886radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
3887{
3888   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3889   struct radv_userdata_info *loc;
3890   uint32_t base_reg;
3891
3892   for (unsigned stage = 0; stage < MESA_VULKAN_SHADER_STAGES; ++stage) {
3893      if (!radv_get_shader(&pipeline->base, stage))
3894         continue;
3895
3896      loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_STREAMOUT_BUFFERS);
3897      if (loc->sgpr_idx == -1)
3898         continue;
3899
3900      base_reg = pipeline->base.user_data_0[stage];
3901
3902      radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
3903                               false);
3904   }
3905
3906   if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
3907      loc = &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
3908      if (loc->sgpr_idx != -1) {
3909         base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
3910
3911         radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3912                                  va, false);
3913      }
3914   }
3915}
3916
3917static void
3918radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
3919{
3920   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
3921      struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
3922      struct radv_streamout_state *so = &cmd_buffer->state.streamout;
3923      unsigned so_offset;
3924      void *so_ptr;
3925      uint64_t va;
3926
3927      /* Allocate some descriptor state for streamout buffers. */
3928      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
3929         return;
3930
3931      for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
3932         struct radv_buffer *buffer = sb[i].buffer;
3933         uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
3934
3935         if (!(so->enabled_mask & (1 << i)))
3936            continue;
3937
3938         va = radv_buffer_get_va(buffer->bo) + buffer->offset;
3939
3940         va += sb[i].offset;
3941
3942         /* Set the descriptor.
3943          *
3944          * On GFX8, the format must be non-INVALID, otherwise
3945          * the buffer will be considered not bound and store
3946          * instructions will be no-ops.
3947          */
3948         uint32_t size = 0xffffffff;
3949
3950         /* Compute the correct buffer size for NGG streamout
3951          * because it's used to determine the max emit per
3952          * buffer.
3953          */
3954         if (cmd_buffer->device->physical_device->use_ngg_streamout)
3955            size = buffer->vk.size - sb[i].offset;
3956
3957         uint32_t rsrc_word3 =
3958            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3959            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3960
3961         if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
3962            rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
3963                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
3964         } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
3965            rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
3966                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
3967         } else {
3968            rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3969         }
3970
3971         desc[0] = va;
3972         desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3973         desc[2] = size;
3974         desc[3] = rsrc_word3;
3975      }
3976
3977      va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3978      va += so_offset;
3979
3980      radv_emit_streamout_buffers(cmd_buffer, va);
3981   }
3982
3983   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
3984}
3985
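/* Write the NGG query enable state to a user SGPR of the last pre-rasterization
 * API stage (enabled when GDS pipeline queries are active or inherited).
 */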
3986static void
3987radv_flush_ngg_query_state(struct radv_cmd_buffer *cmd_buffer)
3988{
3989   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3990   const unsigned stage = pipeline->last_vgt_api_stage;
3991   struct radv_userdata_info *loc;
3992   uint32_t ngg_query_state = 0;
3993   uint32_t base_reg;
3994
3995   loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_NGG_QUERY_STATE);
3996   if (loc->sgpr_idx == -1)
3997      return;
3998
3999   assert(pipeline->is_ngg);
4000
   /* By default, NGG queries are disabled, but they are enabled if the command buffer has active
    * GDS queries or if it's a secondary command buffer that inherits the number of generated
    * primitives.
    */
4005   if (cmd_buffer->state.active_pipeline_gds_queries ||
4006       (cmd_buffer->state.inherited_pipeline_statistics &
4007        VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
4008      ngg_query_state = 1;
4009
4010   base_reg = pipeline->base.user_data_0[stage];
4011   assert(loc->sgpr_idx != -1);
4012
4013   radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_query_state);
4014}
4015
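/* Program the forced per-vertex VRS rates through a user SGPR of the last
 * pre-rasterization API stage. GFX11 uses the VRS_SHADING_RATE_* enum while
 * earlier GPUs use the packed X/Y encoding below; the SGPR is only re-emitted
 * when the rates or its index change.
 */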
4016static void
4017radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
4018{
4019   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
4020   enum amd_gfx_level gfx_level = pipeline->base.device->physical_device->rad_info.gfx_level;
4021   const unsigned stage = pipeline->last_vgt_api_stage;
4022   struct radv_userdata_info *loc;
4023   uint32_t vrs_rates = 0;
4024   uint32_t base_reg;
4025
4026   if (!pipeline->force_vrs_per_vertex) {
4027      /* Un-set the SGPR index so we know to re-emit it later. */
4028      cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
4029      return;
4030   }
4031
4032   loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_FORCE_VRS_RATES);
4033   assert(loc->sgpr_idx != -1);
4034
4035   base_reg = pipeline->base.user_data_0[stage];
4036
4037   switch (cmd_buffer->device->force_vrs) {
4038   case RADV_FORCE_VRS_2x2:
4039      vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4);
4040      break;
4041   case RADV_FORCE_VRS_2x1:
4042      vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4);
4043      break;
4044   case RADV_FORCE_VRS_1x2:
4045      vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4);
4046      break;
4047   default:
4048      break;
4049   }
4050
4051   if (cmd_buffer->state.last_vrs_rates != vrs_rates ||
4052       cmd_buffer->state.last_vrs_rates_sgpr_idx != loc->sgpr_idx) {
4053      radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, vrs_rates);
4054   }
4055
4056   cmd_buffer->state.last_vrs_rates = vrs_rates;
4057   cmd_buffer->state.last_vrs_rates_sgpr_idx = loc->sgpr_idx;
4058}
4059
4060static void
4061radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
4062{
4063   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
4064
4065   radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
4066   radv_flush_streamout_descriptors(cmd_buffer);
4067
4068   VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_NV;
4069   radv_flush_descriptors(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
4070   radv_flush_constants(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
4071   radv_flush_ngg_query_state(cmd_buffer);
4072   radv_flush_force_vrs_state(cmd_buffer);
4073}
4074
4075struct radv_draw_info {
4076   /**
4077    * Number of vertices.
4078    */
4079   uint32_t count;
4080
4081   /**
4082    * First instance id.
4083    */
4084   uint32_t first_instance;
4085
4086   /**
4087    * Number of instances.
4088    */
4089   uint32_t instance_count;
4090
4091   /**
4092    * Whether it's an indexed draw.
4093    */
4094   bool indexed;
4095
4096   /**
4097    * Indirect draw parameters resource.
4098    */
4099   struct radv_buffer *indirect;
4100   uint64_t indirect_offset;
4101   uint32_t stride;
4102
4103   /**
4104    * Draw count parameters resource.
4105    */
4106   struct radv_buffer *count_buffer;
4107   uint64_t count_buffer_offset;
4108
4109   /**
4110    * Stream output parameters resource.
4111    */
4112   struct radv_buffer *strmout_buffer;
4113   uint64_t strmout_buffer_offset;
4114};
4115
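/* The primitive restart index is the all-ones value for the currently bound
 * index type.
 */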
4116static uint32_t
4117radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
4118{
4119   uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type);
4120   switch (index_type) {
4121   case V_028A7C_VGT_INDEX_8:
4122      return 0xffu;
4123   case V_028A7C_VGT_INDEX_16:
4124      return 0xffffu;
4125   case V_028A7C_VGT_INDEX_32:
4126      return 0xffffffffu;
4127   default:
4128      unreachable("invalid index type");
4129   }
4130}
4131
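/* Emit IA_MULTI_VGT_PARAM when it changes. The register lives in a different
 * space depending on the generation: uconfig on GFX9, indexed context register
 * on GFX7+, plain context register before that.
 */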
4132static void
4133si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
4134                           bool indirect_draw, bool count_from_stream_output,
4135                           uint32_t draw_vertex_count)
4136{
4137   struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
4138   struct radv_cmd_state *state = &cmd_buffer->state;
4139   unsigned topology = state->dynamic.primitive_topology;
4140   bool prim_restart_enable = state->dynamic.primitive_restart_enable;
4141   struct radeon_cmdbuf *cs = cmd_buffer->cs;
4142   unsigned ia_multi_vgt_param;
4143
4144   ia_multi_vgt_param =
4145      si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
4146                                draw_vertex_count, topology, prim_restart_enable);
4147
4148   if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
4149      if (info->gfx_level == GFX9) {
4150         radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
4151                                    R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
4152      } else if (info->gfx_level >= GFX7) {
4153         radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
4154      } else {
4155         radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
4156      }
4157      state->last_ia_multi_vgt_param = ia_multi_vgt_param;
4158   }
4159}
4160
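/* Emit the draw-related registers: IA_MULTI_VGT_PARAM (pre-GFX10), the
 * primitive restart index, the VGT_STRMOUT_DRAW_OPAQUE_* state when drawing
 * from a streamout counter, and the index type (including the GFX10.3 instance
 * packing workaround).
 */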
4161static void
4162radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
4163{
4164   struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
4165   struct radv_cmd_state *state = &cmd_buffer->state;
4166   struct radeon_cmdbuf *cs = cmd_buffer->cs;
4167   uint32_t topology = state->dynamic.primitive_topology;
4168   bool disable_instance_packing = false;
4169
4170   /* Draw state. */
4171   if (info->gfx_level < GFX10) {
4172      si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
4173                                 !!draw_info->strmout_buffer,
4174                                 draw_info->indirect ? 0 : draw_info->count);
4175   }
4176
4177   if (state->dynamic.primitive_restart_enable) {
4178      uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
4179
4180      if (primitive_reset_index != state->last_primitive_reset_index) {
4181         radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
4182         state->last_primitive_reset_index = primitive_reset_index;
4183      }
4184   }
4185
4186   if (draw_info->strmout_buffer) {
4187      uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
4188
4189      va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
4190
4191      radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
4192
4193      if (info->gfx_level >= GFX10) {
         /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
          * (shadow memory), but for unknown reasons it can lead to GPU hangs on GFX10+, so use
          * LOAD_CONTEXT_REG_INDEX instead.
          */
4197         radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
4198         radeon_emit(cs, 0);
4199
4200         radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
4201         radeon_emit(cs, va);
4202         radeon_emit(cs, va >> 32);
4203         radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
4204         radeon_emit(cs, 1); /* 1 DWORD */
4205      } else {
4206         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
4207         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
4208                         COPY_DATA_WR_CONFIRM);
4209         radeon_emit(cs, va);
4210         radeon_emit(cs, va >> 32);
4211         radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
4212         radeon_emit(cs, 0); /* unused */
4213      }
4214
4215      radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
4216   }
4217
   /* RDNA2 is affected by a hardware bug: when instance packing is enabled for adjacent primitive
    * topologies and instance_count > 1, the pipeline stats generated by the GE are incorrect. The
    * workaround needs to be applied for both indexed and non-indexed draws.
    */
4222   if (info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
4223       (draw_info->instance_count > 1 || draw_info->indirect) &&
4224       (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
4225        topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
4226      disable_instance_packing = true;
4227   }
4228
4229   if ((draw_info->indexed && state->index_type != state->last_index_type) ||
4230       (info->gfx_level == GFX10_3 &&
4231        (state->last_index_type == -1 ||
4232         disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
4233      uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);
4234
4235      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
4236         radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
4237                                    R_03090C_VGT_INDEX_TYPE, 2, index_type);
4238      } else {
4239         radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
4240         radeon_emit(cs, index_type);
4241      }
4242
4243      state->last_index_type = index_type;
4244   }
4245}
4246
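/* Translate the source stage mask of a barrier into the partial flushes
 * (CS/PS/VS_PARTIAL_FLUSH) that must be emitted before the dependency is
 * satisfied.
 */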
4247static void
4248radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
4249{
4250   /* For simplicity, if the barrier wants to wait for the task shader,
4251    * just make it wait for the mesh shader too.
4252    */
4253   if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV)
4254      src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV;
4255
4256   if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT |
4257                         VK_PIPELINE_STAGE_2_RESOLVE_BIT |
4258                         VK_PIPELINE_STAGE_2_BLIT_BIT |
4259                         VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
4260      /* Be conservative for now. */
4261      src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
4262   }
4263
4264   if (src_stage_mask &
4265       (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
4266        VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
4267        VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
4268        VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4269        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
4270      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
4271   }
4272
4273   if (src_stage_mask &
4274       (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
4275        VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
4276        VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4277        VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
4278      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
4279   } else if (src_stage_mask &
4280              (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
4281               VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
4282               VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
4283               VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
4284               VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
4285               VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
4286               VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
4287               VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
4288      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
4289   }
4290}
4291
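/* Buffer L2 flushes can be skipped when the render backends are coherent with
 * L2: always on GFX9, and on GFX10+ unless the TCC is non-coherent with the RB.
 */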
4292static bool
4293can_skip_buffer_l2_flushes(struct radv_device *device)
4294{
4295   return device->physical_device->rad_info.gfx_level == GFX9 ||
4296          (device->physical_device->rad_info.gfx_level >= GFX10 &&
4297           !device->physical_device->rad_info.tcc_rb_non_coherent);
4298}
4299
/*
 * In Vulkan, barriers have two kinds of operations:
 *
 * - availability (implemented with radv_src_access_flush)
 * - visibility (implemented with radv_dst_access_flush)
 *
 * For a memory operation to observe the result of a previous memory operation,
 * one needs an availability operation on the source memory followed by a
 * visibility operation for the target memory.
 *
 * The complication is that the availability and visibility operations do not
 * need to be in the same barrier.
 *
 * The cleanest way to implement this is to define the availability operation to
 * bring the caches to a "state of rest", in which none of the caches below that
 * level are dirty.
 *
 * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
 *
 * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
 * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
 * images. However, given the existence of memory barriers which do not specify
 * the image/buffer, it often devolves to just VRAM/GTT anyway.
 *
 * To help reduce the invalidations for GPUs that have L2 coherency between the
 * RB and the shader caches, we always invalidate L2 on the src side, as we can
 * use our knowledge of past usage to optimize flushes away.
 */
4328
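/* Example of how a typical barrier maps to flush bits: a
 * COLOR_ATTACHMENT_WRITE -> SHADER_READ dependency flushes CB (and CB metadata)
 * on the source side via radv_src_access_flush, and invalidates the vector
 * cache (plus the scalar cache for buffer reads with ACO, and L2 for
 * non-coherent images) on the destination side via radv_dst_access_flush.
 */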
4329enum radv_cmd_flush_bits
4330radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 src_flags,
4331                      const struct radv_image *image)
4332{
4333   bool has_CB_meta = true, has_DB_meta = true;
4334   bool image_is_coherent = image ? image->l2_coherent : false;
4335   enum radv_cmd_flush_bits flush_bits = 0;
4336
4337   if (image) {
4338      if (!radv_image_has_CB_metadata(image))
4339         has_CB_meta = false;
4340      if (!radv_image_has_htile(image))
4341         has_DB_meta = false;
4342   }
4343
4344   u_foreach_bit64(b, src_flags)
4345   {
      switch ((VkAccessFlags2)(1ull << b)) {
4347      case VK_ACCESS_2_SHADER_WRITE_BIT:
4348      case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
         /* Since the STORAGE bit isn't set, we know that this is a meta operation.
          * On the dst flush side we skip CB/DB flushes for images without the STORAGE
          * bit, so set the CB/DB flush bits here. */
4352         if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
4353            if (vk_format_is_depth_or_stencil(image->vk.format)) {
4354               flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4355            } else {
4356               flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4357            }
4358         }
4359
4360         if (!image_is_coherent)
4361            flush_bits |= RADV_CMD_FLAG_INV_L2;
4362         break;
4363      case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
4364      case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
4365      case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
4366         if (!image_is_coherent)
4367            flush_bits |= RADV_CMD_FLAG_WB_L2;
4368         break;
4369      case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
4370         flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4371         if (has_CB_meta)
4372            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4373         break;
4374      case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
4375         flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4376         if (has_DB_meta)
4377            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4378         break;
4379      case VK_ACCESS_2_TRANSFER_WRITE_BIT:
4380         flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4381
4382         if (!image_is_coherent)
4383            flush_bits |= RADV_CMD_FLAG_INV_L2;
4384         if (has_CB_meta)
4385            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4386         if (has_DB_meta)
4387            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4388         break;
4389      case VK_ACCESS_2_MEMORY_WRITE_BIT:
4390         flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4391
4392         if (!image_is_coherent)
4393            flush_bits |= RADV_CMD_FLAG_INV_L2;
4394         if (has_CB_meta)
4395            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4396         if (has_DB_meta)
4397            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4398         break;
4399      default:
4400         break;
4401      }
4402   }
4403   return flush_bits;
4404}
4405
4406enum radv_cmd_flush_bits
4407radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_flags,
4408                      const struct radv_image *image)
4409{
4410   bool has_CB_meta = true, has_DB_meta = true;
4411   enum radv_cmd_flush_bits flush_bits = 0;
4412   bool flush_CB = true, flush_DB = true;
4413   bool image_is_coherent = image ? image->l2_coherent : false;
4414
4415   if (image) {
4416      if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
4417         flush_CB = false;
4418         flush_DB = false;
4419      }
4420
4421      if (!radv_image_has_CB_metadata(image))
4422         has_CB_meta = false;
4423      if (!radv_image_has_htile(image))
4424         has_DB_meta = false;
4425   }
4426
   /* None of the L2 invalidations below apply to the CB/DB. So if there are no incoherent images
    * in the L2 cache in CB/DB mode, the data is already usable by all the other L2 clients. */
4429   image_is_coherent |=
4430      can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
4431
4432   u_foreach_bit64(b, dst_flags)
4433   {
      switch ((VkAccessFlags2)(1ull << b)) {
4435      case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
4436         /* SMEM loads are used to read compute dispatch size in shaders */
4437         if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
4438            flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
4439
4440         /* Ensure the DGC meta shader can read the commands. */
4441         if (cmd_buffer->device->uses_device_generated_commands) {
4442            flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
4443
4444            if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
4445               flush_bits |= RADV_CMD_FLAG_INV_L2;
4446         }
4447
4448         break;
4449      case VK_ACCESS_2_INDEX_READ_BIT:
4450      case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
4451         break;
4452      case VK_ACCESS_2_UNIFORM_READ_BIT:
4453         flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
4454         break;
4455      case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
4456      case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
4457      case VK_ACCESS_2_TRANSFER_READ_BIT:
4458      case VK_ACCESS_2_TRANSFER_WRITE_BIT:
4459         flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4460
4461         if (has_CB_meta || has_DB_meta)
4462            flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
4463         if (!image_is_coherent)
4464            flush_bits |= RADV_CMD_FLAG_INV_L2;
4465         break;
4466      case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
4467      case VK_ACCESS_2_SHADER_READ_BIT:
4468      case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
4469         flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4470         /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
4471          * invalidate the scalar cache. */
4472         if (!cmd_buffer->device->physical_device->use_llvm && !image)
4473            flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
4474
4475         if (has_CB_meta || has_DB_meta)
4476            flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
4477         if (!image_is_coherent)
4478            flush_bits |= RADV_CMD_FLAG_INV_L2;
4479         break;
4480      case VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR:
4481         flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4482         if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
4483            flush_bits |= RADV_CMD_FLAG_INV_L2;
4484         break;
4485      case VK_ACCESS_2_SHADER_WRITE_BIT:
4486      case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
4487      case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
4488         break;
4489      case VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT:
4490      case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
4491         if (flush_CB)
4492            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4493         if (has_CB_meta)
4494            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4495         break;
4496      case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
4497      case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
4498         if (flush_DB)
4499            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4500         if (has_DB_meta)
4501            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4502         break;
4503      case VK_ACCESS_2_MEMORY_READ_BIT:
4504      case VK_ACCESS_2_MEMORY_WRITE_BIT:
4505         flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
4506         if (!image_is_coherent)
4507            flush_bits |= RADV_CMD_FLAG_INV_L2;
4508         if (flush_CB)
4509            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4510         if (has_CB_meta)
4511            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4512         if (flush_DB)
4513            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4514         if (has_DB_meta)
4515            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4516         break;
4517      default:
4518         break;
4519      }
4520   }
4521   return flush_bits;
4522}
4523
4524void
4525radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
4526                          const struct radv_subpass_barrier *barrier)
4527{
4528   struct radv_render_pass *pass = cmd_buffer->state.pass;
4529
4530   for (uint32_t i = 0; i < pass->attachment_count; i++) {
4531      struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;
4532
4533      cmd_buffer->state.flush_bits |=
4534         radv_src_access_flush(cmd_buffer, barrier->src_access_mask, iview->image);
4535   }
4536
4537   radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
4538
4539   for (uint32_t i = 0; i < pass->attachment_count; i++) {
4540      struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;
4541
4542      cmd_buffer->state.flush_bits |=
4543         radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
4544   }
4545
4546   radv_ace_internal_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
4547}
4548
4549uint32_t
4550radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
4551{
4552   struct radv_cmd_state *state = &cmd_buffer->state;
4553   uint32_t subpass_id = state->subpass - state->pass->subpasses;
4554
4555   /* The id of this subpass shouldn't exceed the number of subpasses in
4556    * this render pass minus 1.
4557    */
4558   assert(subpass_id < state->pass->subpass_count);
4559   return subpass_id;
4560}
4561
4562static struct radv_sample_locations_state *
4563radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
4564                                     bool begin_subpass)
4565{
4566   struct radv_cmd_state *state = &cmd_buffer->state;
4567   uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
4568   struct radv_image_view *view = state->attachments[att_idx].iview;
4569
4570   if (view->image->info.samples == 1)
4571      return NULL;
4572
4573   if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
4574      /* Return the initial sample locations if this is the initial
       * layout transition of the given subpass attachment.
4576       */
4577      if (state->attachments[att_idx].sample_location.count > 0)
4578         return &state->attachments[att_idx].sample_location;
4579   } else {
4580      /* Otherwise return the subpass sample locations if defined. */
4581      if (state->subpass_sample_locs) {
4582         /* Because the driver sets the current subpass before
4583          * initial layout transitions, we should use the sample
4584          * locations from the previous subpass to avoid an
4585          * off-by-one problem. Otherwise, use the sample
4586          * locations for the current subpass for final layout
4587          * transitions.
4588          */
4589         if (begin_subpass)
4590            subpass_id--;
4591
4592         for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
4593            if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
4594               return &state->subpass_sample_locs[i].sample_location;
4595         }
4596      }
4597   }
4598
4599   return NULL;
4600}
4601
4602static void
4603radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
4604                                     struct radv_subpass_attachment att, bool begin_subpass)
4605{
4606   unsigned idx = att.attachment;
4607   struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
4608   struct radv_sample_locations_state *sample_locs;
4609   VkImageSubresourceRange range;
4610   range.aspectMask = view->vk.aspects;
4611   range.baseMipLevel = view->vk.base_mip_level;
4612   range.levelCount = 1;
4613   range.baseArrayLayer = view->vk.base_array_layer;
4614   range.layerCount = cmd_buffer->state.framebuffer->layers;
4615
4616   if (cmd_buffer->state.subpass->view_mask) {
4617      /* If the current subpass uses multiview, the driver might have
4618       * performed a fast color/depth clear to the whole image
4619       * (including all layers). To make sure the driver will
4620       * decompress the image correctly (if needed), we have to
4621       * account for the "real" number of layers. If the view mask is
4622       * sparse, this will decompress more layers than needed.
4623       */
4624      range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
4625   }
4626
4627   /* Get the subpass sample locations for the given attachment, if NULL
4628    * is returned the driver will use the default HW locations.
4629    */
4630   sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass);
4631
4632   /* Determine if the subpass uses separate depth/stencil layouts. */
4633   bool uses_separate_depth_stencil_layouts = false;
4634   if ((cmd_buffer->state.attachments[idx].current_layout !=
4635        cmd_buffer->state.attachments[idx].current_stencil_layout) ||
4636       (att.layout != att.stencil_layout)) {
4637      uses_separate_depth_stencil_layouts = true;
4638   }
4639
4640   /* For separate layouts, perform depth and stencil transitions
4641    * separately.
4642    */
4643   if (uses_separate_depth_stencil_layouts &&
4644       (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
4645      /* Depth-only transitions. */
4646      range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
4647      radv_handle_image_transition(cmd_buffer, view->image,
4648                                   cmd_buffer->state.attachments[idx].current_layout,
4649                                   cmd_buffer->state.attachments[idx].current_in_render_loop,
4650                                   att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
4651
4652      /* Stencil-only transitions. */
4653      range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
4654      radv_handle_image_transition(
4655         cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout,
4656         cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout,
4657         att.in_render_loop, 0, 0, &range, sample_locs);
4658   } else {
4659      radv_handle_image_transition(cmd_buffer, view->image,
4660                                   cmd_buffer->state.attachments[idx].current_layout,
4661                                   cmd_buffer->state.attachments[idx].current_in_render_loop,
4662                                   att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
4663   }
4664
4665   cmd_buffer->state.attachments[idx].current_layout = att.layout;
4666   cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
4667   cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
4668}
4669
4670void
4671radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass)
4672{
4673   cmd_buffer->state.subpass = subpass;
4674
4675   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
4676}
4677
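/* Copy the custom sample locations provided through
 * VkRenderPassSampleLocationsBeginInfoEXT into the command buffer state so they
 * can be used during the subpass layout transitions.
 */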
4678static VkResult
4679radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
4680                                      struct radv_render_pass *pass,
4681                                      const VkRenderPassBeginInfo *info)
4682{
4683   const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
4684      vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
4685   struct radv_cmd_state *state = &cmd_buffer->state;
4686
4687   if (!sample_locs) {
4688      state->subpass_sample_locs = NULL;
4689      return VK_SUCCESS;
4690   }
4691
4692   for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
4693      const VkAttachmentSampleLocationsEXT *att_sample_locs =
4694         &sample_locs->pAttachmentInitialSampleLocations[i];
4695      uint32_t att_idx = att_sample_locs->attachmentIndex;
4696      struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
4697
4698      assert(vk_format_is_depth_or_stencil(image->vk.format));
4699
4700      /* From the Vulkan spec 1.1.108:
4701       *
4702       * "If the image referenced by the framebuffer attachment at
4703       *  index attachmentIndex was not created with
4704       *  VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
4705       *  then the values specified in sampleLocationsInfo are
4706       *  ignored."
4707       */
4708      if (!(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
4709         continue;
4710
4711      const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo;
4712
4713      state->attachments[att_idx].sample_location.per_pixel =
4714         sample_locs_info->sampleLocationsPerPixel;
4715      state->attachments[att_idx].sample_location.grid_size =
4716         sample_locs_info->sampleLocationGridSize;
4717      state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount;
4718      typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
4719                   sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4720   }
4721
4722   state->subpass_sample_locs =
4723      vk_alloc(&cmd_buffer->pool->vk.alloc,
4724               sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]),
4725               8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4726   if (state->subpass_sample_locs == NULL) {
4727      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4728      return cmd_buffer->record_result;
4729   }
4730
4731   state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
4732
4733   for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
4734      const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
4735         &sample_locs->pPostSubpassSampleLocations[i];
4736      const VkSampleLocationsInfoEXT *sample_locs_info =
4737         &subpass_sample_locs_info->sampleLocationsInfo;
4738
4739      state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex;
4740      state->subpass_sample_locs[i].sample_location.per_pixel =
4741         sample_locs_info->sampleLocationsPerPixel;
4742      state->subpass_sample_locs[i].sample_location.grid_size =
4743         sample_locs_info->sampleLocationGridSize;
4744      state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount;
4745      typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
4746                   sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4747   }
4748
4749   return VK_SUCCESS;
4750}
4751
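/* Initialize the per-attachment command buffer state for a render pass
 * instance: pending clear aspects/values, current layouts, image views and the
 * CB/DS surface state.
 */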
4752static VkResult
4753radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass,
4754                                 const VkRenderPassBeginInfo *info)
4755{
4756   struct radv_cmd_state *state = &cmd_buffer->state;
4757   const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
4758
4759   if (info) {
4760      attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
4761   }
4762
4763   if (pass->attachment_count == 0) {
4764      state->attachments = NULL;
4765      return VK_SUCCESS;
4766   }
4767
4768   state->attachments =
4769      vk_alloc(&cmd_buffer->pool->vk.alloc, pass->attachment_count * sizeof(state->attachments[0]),
4770               8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4771   if (state->attachments == NULL) {
4772      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4773      return cmd_buffer->record_result;
4774   }
4775
4776   for (uint32_t i = 0; i < pass->attachment_count; ++i) {
4777      struct radv_render_pass_attachment *att = &pass->attachments[i];
4778      VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
4779      VkImageAspectFlags clear_aspects = 0;
4780
4781      if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
4782         /* color attachment */
4783         if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4784            clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
4785         }
4786      } else {
4787         /* depthstencil attachment */
4788         if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
4789             att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4790            clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
4791            if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4792                att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
4793               clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4794         }
4795         if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4796             att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4797            clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4798         }
4799      }
4800
4801      state->attachments[i].pending_clear_aspects = clear_aspects;
4802      state->attachments[i].cleared_views = 0;
4803      if (clear_aspects && info) {
4804         assert(info->clearValueCount > i);
4805         state->attachments[i].clear_value = info->pClearValues[i];
4806      }
4807
4808      state->attachments[i].current_layout = att->initial_layout;
4809      state->attachments[i].current_in_render_loop = false;
4810      state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
4811      state->attachments[i].sample_location.count = 0;
4812
4813      struct radv_image_view *iview;
4814      if (attachment_info && attachment_info->attachmentCount > i) {
4815         iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
4816      } else {
4817         iview = radv_image_view_from_handle(state->framebuffer->attachments[i]);
4818      }
4819
4820      state->attachments[i].iview = iview;
4821      if (iview->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4822         radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
4823      } else {
4824         radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
4825      }
4826   }
4827
4828   return VK_SUCCESS;
4829}
4830
4831VKAPI_ATTR VkResult VKAPI_CALL
4832radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo,
4833                            VkCommandBuffer *pCommandBuffers)
4834{
4835   RADV_FROM_HANDLE(radv_device, device, _device);
4836   RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
4837
4838   VkResult result = VK_SUCCESS;
4839   uint32_t i;
4840
4841   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
4842
4843      if (!list_is_empty(&pool->free_cmd_buffers)) {
4844         struct radv_cmd_buffer *cmd_buffer =
4845            list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
4846
4847         list_del(&cmd_buffer->pool_link);
4848         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
4849
4850         result = radv_reset_cmd_buffer(cmd_buffer);
4851         vk_command_buffer_finish(&cmd_buffer->vk);
4852         VkResult init_result =
4853            vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, pAllocateInfo->level);
4854         if (init_result != VK_SUCCESS)
4855            result = init_result;
4856
4857         pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
4858      } else {
4859         result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]);
4860      }
4861      if (result != VK_SUCCESS)
4862         break;
4863   }
4864
4865   if (result != VK_SUCCESS) {
4866      radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers);
4867
4868      /* From the Vulkan 1.0.66 spec:
4869       *
4870       * "vkAllocateCommandBuffers can be used to create multiple
4871       *  command buffers. If the creation of any of those command
4872       *  buffers fails, the implementation must destroy all
4873       *  successfully created command buffer objects from this
4874       *  command, set all entries of the pCommandBuffers array to
4875       *  NULL and return the error."
4876       */
4877      memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
4878   }
4879
4880   return result;
4881}
4882
4883VKAPI_ATTR void VKAPI_CALL
4884radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
4885                        const VkCommandBuffer *pCommandBuffers)
4886{
4887   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4888
4889   for (uint32_t i = 0; i < commandBufferCount; i++) {
4890      RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
4891
4892      if (!cmd_buffer)
4893         continue;
4894      assert(cmd_buffer->pool == pool);
4895
4896      list_del(&cmd_buffer->pool_link);
4897      list_addtail(&cmd_buffer->pool_link, &pool->free_cmd_buffers);
4898   }
4899}
4900
4901VKAPI_ATTR VkResult VKAPI_CALL
4902radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
4903{
4904   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4905   return radv_reset_cmd_buffer(cmd_buffer);
4906}
4907
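/* Secondary command buffers that inherit dynamic rendering state don't get a
 * render pass object from the application, so build a temporary single-subpass
 * render pass from the inheritance info and mark it as owned by the command
 * buffer.
 */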
4908static void
4909radv_inherit_dynamic_rendering(struct radv_cmd_buffer *cmd_buffer,
4910                               const VkCommandBufferInheritanceInfo *inherit_info,
4911                               const VkCommandBufferInheritanceRenderingInfo *dyn_info)
4912{
4913   const VkAttachmentSampleCountInfoAMD *sample_info =
4914      vk_find_struct_const(inherit_info->pNext, ATTACHMENT_SAMPLE_COUNT_INFO_AMD);
4915   VkResult result;
   /* (normal + resolve) for each color attachment, plus depth/stencil and a VRS attachment */
4917   VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3];
4918   VkAttachmentReference2 color_refs[MAX_RTS], ds_ref;
4919   unsigned att_count = 0;
4920
4921   VkSubpassDescription2 subpass = {
4922      .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
4923      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
4924      .viewMask = dyn_info->viewMask,
4925      .colorAttachmentCount = dyn_info->colorAttachmentCount,
4926      .pColorAttachments = color_refs,
4927   };
4928
4929   for (unsigned i = 0; i < dyn_info->colorAttachmentCount; ++i) {
4930      if (dyn_info->pColorAttachmentFormats[i] == VK_FORMAT_UNDEFINED) {
4931         color_refs[i] = (VkAttachmentReference2){
4932            .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4933            .attachment = VK_ATTACHMENT_UNUSED,
4934         };
4935         continue;
4936      }
4937
4938      color_refs[i] = (VkAttachmentReference2){
4939         .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4940         .attachment = att_count,
4941         .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
4942         .aspectMask = 0,                   /* Shouldn't be used */
4943      };
4944
4945      VkAttachmentDescription2 *att = att_desc + att_count++;
4946      memset(att, 0, sizeof(*att));
4947      att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
4948      att->format = dyn_info->pColorAttachmentFormats[i];
4949      att->samples =
4950         sample_info ? sample_info->pColorAttachmentSamples[i] : dyn_info->rasterizationSamples;
4951      att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4952      att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
4953      att->initialLayout = VK_IMAGE_LAYOUT_GENERAL;
4954      att->finalLayout = VK_IMAGE_LAYOUT_GENERAL;
4955   }
4956
4957   if (dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ||
4958       dyn_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) {
4959      VkFormat fmt = dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED
4960                        ? dyn_info->depthAttachmentFormat
4961                        : dyn_info->stencilAttachmentFormat;
4962
4963      ds_ref = (VkAttachmentReference2){
4964         .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4965         .attachment = att_count,
4966         .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
4967         .aspectMask = 0,                   /* Shouldn't be used */
4968      };
4969      subpass.pDepthStencilAttachment = &ds_ref;
4970
4971      VkAttachmentDescription2 *att = att_desc + att_count++;
4972
4973      memset(att, 0, sizeof(*att));
4974      att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
4975      att->format = fmt;
4976      att->samples =
4977         sample_info ? sample_info->depthStencilAttachmentSamples : dyn_info->rasterizationSamples;
4978      att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4979      att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
4980      att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4981      att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
4982   }
4983
4984   VkRenderPassCreateInfo2 rp_create_info = {
4985      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
4986      .attachmentCount = att_count,
4987      .pAttachments = att_desc,
4988      .subpassCount = 1,
4989      .pSubpasses = &subpass,
4990   };
4991
4992   VkRenderPass rp;
4993   result =
4994      radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
4995   if (result != VK_SUCCESS) {
4996      cmd_buffer->record_result = result;
4997      return;
4998   }
4999
5000   cmd_buffer->state.pass = radv_render_pass_from_handle(rp);
5001   cmd_buffer->state.own_render_pass = true;
5002}
5003
5004VKAPI_ATTR VkResult VKAPI_CALL
5005radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
5006{
5007   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5008   VkResult result = VK_SUCCESS;
5009
5010   if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
      /* If the command buffer has already been reset with
       * vkResetCommandBuffer, there is no need to do it again.
       */
5014      result = radv_reset_cmd_buffer(cmd_buffer);
5015      if (result != VK_SUCCESS)
5016         return result;
5017   }
5018
5019   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
5020   cmd_buffer->state.last_primitive_reset_en = -1;
5021   cmd_buffer->state.last_index_type = -1;
5022   cmd_buffer->state.last_num_instances = -1;
5023   cmd_buffer->state.last_vertex_offset = -1;
5024   cmd_buffer->state.last_first_instance = -1;
5025   cmd_buffer->state.last_drawid = -1;
5026   cmd_buffer->state.last_subpass_color_count = MAX_RTS;
5027   cmd_buffer->state.predication_type = -1;
5028   cmd_buffer->state.last_sx_ps_downconvert = -1;
5029   cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
5030   cmd_buffer->state.last_sx_blend_opt_control = -1;
5031   cmd_buffer->state.last_nggc_settings = -1;
5032   cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
5033   cmd_buffer->state.mesh_shading = false;
5034   cmd_buffer->state.last_vrs_rates = -1;
5035   cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
5036   cmd_buffer->usage_flags = pBeginInfo->flags;
5037
5038   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
5039       (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
5040      struct radv_subpass *subpass = NULL;
5041
5042      assert(pBeginInfo->pInheritanceInfo);
5043
5044      cmd_buffer->state.framebuffer =
5045         vk_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
5046
5047      if (pBeginInfo->pInheritanceInfo->renderPass) {
5048         cmd_buffer->state.pass =
5049            radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
5050         assert(pBeginInfo->pInheritanceInfo->subpass < cmd_buffer->state.pass->subpass_count);
5051         subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
5052      } else {
5053         const VkCommandBufferInheritanceRenderingInfo *dyn_info =
5054            vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
5055                                 COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
5056         if (dyn_info) {
5057            radv_inherit_dynamic_rendering(cmd_buffer, pBeginInfo->pInheritanceInfo, dyn_info);
5058            subpass = &cmd_buffer->state.pass->subpasses[0];
5059         }
5060      }
5061
5062      if (cmd_buffer->state.framebuffer) {
5063         result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
5064         if (result != VK_SUCCESS)
5065            return result;
5066      }
5067
5068      cmd_buffer->state.inherited_pipeline_statistics =
5069         pBeginInfo->pInheritanceInfo->pipelineStatistics;
5070
5071      if (cmd_buffer->state.pass) {
5072         cmd_buffer->state.subpass = subpass;
5073         if (cmd_buffer->state.framebuffer)
5074            cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
5075      }
5076   }
5077
5078   if (unlikely(cmd_buffer->device->trace_bo))
5079      radv_cmd_buffer_trace_emit(cmd_buffer);
5080
5081   radv_describe_begin_cmd_buffer(cmd_buffer);
5082
5083   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
5084
5085   return result;
5086}
5087
5088VKAPI_ATTR void VKAPI_CALL
5089radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
5090                           uint32_t bindingCount, const VkBuffer *pBuffers,
5091                           const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
5092                           const VkDeviceSize *pStrides)
5093{
5094   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5095   struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
5096   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
5097
   /* We have to defer setting up the vertex buffer descriptors because we need
    * the buffer stride from the pipeline. */
5100
5101   assert(firstBinding + bindingCount <= MAX_VBS);
5102   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
5103
5104   if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings)
5105      cmd_buffer->used_vertex_bindings = firstBinding + bindingCount;
5106
5107   uint32_t misaligned_mask_invalid = 0;
5108
5109   for (uint32_t i = 0; i < bindingCount; i++) {
5110      RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
5111      uint32_t idx = firstBinding + i;
5112      VkDeviceSize size = pSizes ? pSizes[i] : 0;
      /* If pStrides is NULL, don't overwrite the strides specified by CmdSetVertexInputEXT. */
5114      VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride;
5115
5116      if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer ||
5117          (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) ||
5118                      (vb[idx].stride & 0x3) != (stride & 0x3)))) {
5119         misaligned_mask_invalid |= state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff;
5120      }
5121
5122      cmd_buffer->vertex_binding_buffers[idx] = buffer;
5123      vb[idx].offset = pOffsets[i];
5124      vb[idx].size = size;
5125      vb[idx].stride = stride;
5126
5127      uint32_t bit = BITFIELD_BIT(idx);
5128      if (buffer) {
5129         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo);
5130         cmd_buffer->state.vbo_bound_mask |= bit;
5131      } else {
5132         cmd_buffer->state.vbo_bound_mask &= ~bit;
5133      }
5134   }
5135
5136   if ((chip == GFX6 || chip >= GFX10) && misaligned_mask_invalid) {
5137      cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid;
5138      cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid;
5139   }
5140
5141   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
5142                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
5143}
5144
5145static uint32_t
5146vk_to_index_type(VkIndexType type)
5147{
5148   switch (type) {
5149   case VK_INDEX_TYPE_UINT8_EXT:
5150      return V_028A7C_VGT_INDEX_8;
5151   case VK_INDEX_TYPE_UINT16:
5152      return V_028A7C_VGT_INDEX_16;
5153   case VK_INDEX_TYPE_UINT32:
5154      return V_028A7C_VGT_INDEX_32;
5155   default:
5156      unreachable("invalid index type");
5157   }
5158}
5159
5160uint32_t
5161radv_get_vgt_index_size(uint32_t type)
5162{
5163   uint32_t index_type = G_028A7C_INDEX_TYPE(type);
5164   switch (index_type) {
5165   case V_028A7C_VGT_INDEX_8:
5166      return 1;
5167   case V_028A7C_VGT_INDEX_16:
5168      return 2;
5169   case V_028A7C_VGT_INDEX_32:
5170      return 4;
5171   default:
5172      unreachable("invalid index type");
5173   }
5174}
5175
5176VKAPI_ATTR void VKAPI_CALL
5177radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
5178                        VkIndexType indexType)
5179{
5180   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5181   RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
5182
5183   cmd_buffer->state.index_buffer = index_buffer;
5184   cmd_buffer->state.index_offset = offset;
5185   cmd_buffer->state.index_type = vk_to_index_type(indexType);
5186   cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
5187   cmd_buffer->state.index_va += index_buffer->offset + offset;
5188
5189   int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
5190   cmd_buffer->state.max_index_count =
5191      (vk_buffer_range(&index_buffer->vk, offset, VK_WHOLE_SIZE)) / index_size;
5192   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
5193   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
5194}
5195
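/* Bind a descriptor set and register its backing BOs with the command stream
 * so they stay resident (the per-descriptor BOs are skipped when the global BO
 * list is used).
 */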
5196static void
5197radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
5198                         struct radv_descriptor_set *set, unsigned idx)
5199{
5200   struct radeon_winsys *ws = cmd_buffer->device->ws;
5201
5202   radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
5203
5204   assert(set);
5205   assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
5206
5207   if (!cmd_buffer->device->use_global_bo_list) {
5208      for (unsigned j = 0; j < set->header.buffer_count; ++j)
5209         if (set->descriptors[j])
5210            radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
5211   }
5212
5213   if (set->header.bo)
5214      radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
5215}
5216
5217VKAPI_ATTR void VKAPI_CALL
5218radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5219                           VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
5220                           const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
5221                           const uint32_t *pDynamicOffsets)
5222{
5223   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5224   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5225   unsigned dyn_idx = 0;
5226
5227   const bool no_dynamic_bounds =
5228      cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
5229   struct radv_descriptor_state *descriptors_state =
5230      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
5231
5232   for (unsigned i = 0; i < descriptorSetCount; ++i) {
5233      unsigned set_idx = i + firstSet;
5234      RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
5235
5236      if (!set) {
5237         /* From the Vulkan spec 1.3.211:
5238          *
5239          * "VUID-vkCmdBindDescriptorSets-layout-06564
5240          *  If layout was not created with VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT, each
5241          *  element of pDescriptorSets must be a valid VkDescriptorSet"
5242          */
5243         assert(layout->independent_sets);
5244         continue;
5245      }
5246
5247      /* If the set is already bound, we only need to update the
5248       * (potentially changed) dynamic offsets. */
5249      if (descriptors_state->sets[set_idx] != set ||
5250          !(descriptors_state->valid & (1u << set_idx))) {
5251         radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
5252      }
5253
5254      for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
5255         unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
5256         uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
5257         assert(dyn_idx < dynamicOffsetCount);
5258
5259         struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
5260
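         /* Write a 4-dword buffer descriptor for the dynamic buffer with the new offset applied,
          * or a null descriptor if no buffer is bound.
          */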
5261         if (!range->va) {
5262            memset(dst, 0, 4 * 4);
5263         } else {
5264            uint64_t va = range->va + pDynamicOffsets[dyn_idx];
5265            dst[0] = va;
5266            dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
5267            dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
5268            dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5269                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5270
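            /* The data format encoding in the last descriptor dword differs between GFX
             * generations.
             */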
5271            if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
5272               dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
5273                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
5274            } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
5275               dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5276                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5277            } else {
5278               dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5279                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5280            }
5281         }
5282
5283         cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
5284      }
5285   }
5286}
5287
5288static bool
5289radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
5290                              struct radv_descriptor_set_layout *layout,
5291                              VkPipelineBindPoint bind_point)
5292{
5293   struct radv_descriptor_state *descriptors_state =
5294      radv_get_descriptors_state(cmd_buffer, bind_point);
5295   set->header.size = layout->size;
5296
5297   if (set->header.layout != layout) {
5298      if (set->header.layout)
5299         vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->header.layout->vk);
5300      vk_descriptor_set_layout_ref(&layout->vk);
5301      set->header.layout = layout;
5302   }
5303
5304   if (descriptors_state->push_set.capacity < set->header.size) {
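      /* Grow geometrically with a 1 KiB floor; 96 * MAX_PUSH_DESCRIPTORS is the worst-case size
       * of a push descriptor set.
       */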
5305      size_t new_size = MAX2(set->header.size, 1024);
5306      new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
5307      new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
5308
5309      free(set->header.mapped_ptr);
5310      set->header.mapped_ptr = malloc(new_size);
5311
5312      if (!set->header.mapped_ptr) {
5313         descriptors_state->push_set.capacity = 0;
5314         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
5315         return false;
5316      }
5317
5318      descriptors_state->push_set.capacity = new_size;
5319   }
5320
5321   return true;
5322}
5323
5324void
5325radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
5326                              VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
5327                              uint32_t set, uint32_t descriptorWriteCount,
5328                              const VkWriteDescriptorSet *pDescriptorWrites)
5329{
5330   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5331   struct radv_descriptor_set *push_set =
5332      (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
5333   unsigned bo_offset;
5334
5335   assert(set == 0);
5336   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5337
5338   push_set->header.size = layout->set[set].layout->size;
5339   push_set->header.layout = layout->set[set].layout;
5340
5341   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
5342                                     (void **)&push_set->header.mapped_ptr))
5343      return;
5344
5345   push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5346   push_set->header.va += bo_offset;
5347
5348   radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
5349                                   radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
5350                                   pDescriptorWrites, 0, NULL);
5351
5352   radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
5353}
5354
5355VKAPI_ATTR void VKAPI_CALL
5356radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5357                             VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
5358                             const VkWriteDescriptorSet *pDescriptorWrites)
5359{
5360   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5361   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5362   struct radv_descriptor_state *descriptors_state =
5363      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
5364   struct radv_descriptor_set *push_set =
5365      (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5366
5367   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5368
5369   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
5370                                      pipelineBindPoint))
5371      return;
5372
5373   /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
5374    * because it is invalid according to the Vulkan spec.
5375    */
5376   for (int i = 0; i < descriptorWriteCount; i++) {
5377      ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
5378      assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
5379   }
5380
5381   radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
5382                                   radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
5383                                   pDescriptorWrites, 0, NULL);
5384
5385   radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
5386   descriptors_state->push_dirty = true;
5387}
5388
5389VKAPI_ATTR void VKAPI_CALL
5390radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
5391                                         VkDescriptorUpdateTemplate descriptorUpdateTemplate,
5392                                         VkPipelineLayout _layout, uint32_t set, const void *pData)
5393{
5394   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5395   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5396   RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
5397   struct radv_descriptor_state *descriptors_state =
5398      radv_get_descriptors_state(cmd_buffer, templ->bind_point);
5399   struct radv_descriptor_set *push_set =
5400      (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5401
5402   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5403
5404   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
5405                                      templ->bind_point))
5406      return;
5407
5408   radv_cmd_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
5409                                                descriptorUpdateTemplate, pData);
5410
5411   radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
5412   descriptors_state->push_dirty = true;
5413}
5414
5415VKAPI_ATTR void VKAPI_CALL
5416radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
5417                      VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
5418                      const void *pValues)
5419{
5420   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
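   /* Just stash the values; they are uploaded to the shaders when the next draw or dispatch
    * flushes the push constants for the affected stages.
    */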
5421   memcpy(cmd_buffer->push_constants + offset, pValues, size);
5422   cmd_buffer->push_constant_stages |= stageFlags;
5423}
5424
5425VKAPI_ATTR VkResult VKAPI_CALL
5426radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
5427{
5428   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5429
5430   radv_emit_mip_change_flush_default(cmd_buffer);
5431
5432   if (cmd_buffer->qf != RADV_QUEUE_TRANSFER) {
5433      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6)
5434         cmd_buffer->state.flush_bits |=
5435            RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
5436
5437      /* Make sure to sync all pending active queries at the end of
5438       * the command buffer.
5439       */
5440      cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
5441
5442      /* Flush noncoherent images on GFX9+ so we can assume they're clean at the start of a
5443       * command buffer.
5444       */
5445      if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device))
5446         cmd_buffer->state.flush_bits |= radv_src_access_flush(
5447            cmd_buffer,
5448            VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
5449            VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
5450            NULL);
5451
5452      /* Since NGG streamout uses GDS, we need to make GDS idle when
5453       * we leave the IB; otherwise another process might overwrite
5454       * it while our shaders are busy.
5455       */
5456      if (cmd_buffer->gds_needed)
5457         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
5458
5459      /* Finalize the internal compute command stream, if it exists. */
5460      if (cmd_buffer->ace_internal.cs) {
5461         VkResult result = radv_ace_internal_finalize(cmd_buffer);
5462         if (result != VK_SUCCESS)
5463            return vk_error(cmd_buffer, result);
5464      }
5465
5466      si_emit_cache_flush(cmd_buffer);
5467   }
5468
5469   /* Make sure CP DMA is idle at the end of IBs because the kernel
5470    * doesn't wait for it.
5471    */
5472   si_cp_dma_wait_for_idle(cmd_buffer);
5473
5474   radv_describe_end_cmd_buffer(cmd_buffer);
5475
5476   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
5477   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs);
5478
5479   VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
5480   if (result != VK_SUCCESS)
5481      return vk_error(cmd_buffer, result);
5482
5483   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
5484
5485   return cmd_buffer->record_result;
5486}
5487
5488static void
5489radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer,
5490                           struct radv_compute_pipeline *pipeline)
5491{
5492   if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
5493      return;
5494
5495   assert(!pipeline->base.ctx_cs.cdw);
5496
5497   cmd_buffer->state.emitted_compute_pipeline = pipeline;
5498
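   /* The pipeline's register state was pre-built at pipeline creation time, so it can be copied
    * into the command stream verbatim.
    */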
5499   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.cs.cdw);
5500   radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
5501
5502   cmd_buffer->compute_scratch_size_per_wave_needed =
5503      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
5504   cmd_buffer->compute_scratch_waves_wanted =
5505      MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->base.max_waves);
5506
5507   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo);
5508
5509   if (unlikely(cmd_buffer->device->trace_bo))
5510      radv_save_pipeline(cmd_buffer, &pipeline->base);
5511}
5512
5513static void
5514radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
5515{
5516   struct radv_descriptor_state *descriptors_state =
5517      radv_get_descriptors_state(cmd_buffer, bind_point);
5518
5519   descriptors_state->dirty |= descriptors_state->valid;
5520}
5521
5522VKAPI_ATTR void VKAPI_CALL
5523radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5524                     VkPipeline _pipeline)
5525{
5526   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5527   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
5528
5529   switch (pipelineBindPoint) {
5530   case VK_PIPELINE_BIND_POINT_COMPUTE: {
5531      struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
5532
5533      if (cmd_buffer->state.compute_pipeline == compute_pipeline)
5534         return;
5535      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5536
5537      cmd_buffer->state.compute_pipeline = compute_pipeline;
5538      cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
5539      cmd_buffer->task_rings_needed |=
5540         pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.uses_task_rings;
5541      break;
5542   }
5543   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
5544      struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
5545
5546      if (cmd_buffer->state.rt_pipeline == compute_pipeline)
5547         return;
5548      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5549
5550      cmd_buffer->state.rt_pipeline = compute_pipeline;
5551      cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
5552      if (compute_pipeline->dynamic_stack_size)
5553         radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size);
5554      break;
5555   }
5556   case VK_PIPELINE_BIND_POINT_GRAPHICS: {
5557      struct radv_graphics_pipeline *graphics_pipeline =
5558         pipeline ? radv_pipeline_to_graphics(pipeline) : NULL;
5559
5560      if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
5561         return;
5562      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5563
5564      bool vtx_emit_count_changed =
5565         !pipeline || !cmd_buffer->state.graphics_pipeline ||
5566         cmd_buffer->state.graphics_pipeline->vtx_emit_num != graphics_pipeline->vtx_emit_num ||
5567         cmd_buffer->state.graphics_pipeline->vtx_base_sgpr != graphics_pipeline->vtx_base_sgpr;
5568      cmd_buffer->state.graphics_pipeline = graphics_pipeline;
5569      if (!pipeline)
5570         break;
5571
5572      bool mesh_shading = radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH);
5573      if (mesh_shading != cmd_buffer->state.mesh_shading) {
5574         /* Re-emit VRS state because the combiner is different (vertex vs primitive).
5575          * Re-emit primitive topology because the mesh shading pipeline clobbered it.
5576          */
5577         cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE |
5578                                    RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
5579      }
5580
5581      cmd_buffer->state.mesh_shading = mesh_shading;
5582      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
5583      cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
5584
5585      /* The new vertex shader might not use the same user data registers. */
5586      if (vtx_emit_count_changed) {
5587         cmd_buffer->state.last_first_instance = -1;
5588         cmd_buffer->state.last_vertex_offset = -1;
5589         cmd_buffer->state.last_drawid = -1;
5590      }
5591
5592      /* Prefetch all pipeline shaders at first draw time. */
5593      cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
5594
5595      if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
5596          cmd_buffer->state.emitted_graphics_pipeline &&
5597          cmd_buffer->state.emitted_graphics_pipeline->is_ngg &&
5598          !cmd_buffer->state.graphics_pipeline->is_ngg) {
5599         /* Transitioning from NGG to legacy GS requires
5600          * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH
5601          * is also emitted at the beginning of IBs when legacy
5602          * GS ring pointers are set.
5603          */
5604         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
5605      }
5606
5607      radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);
5608
5609      if (graphics_pipeline->esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
5610         cmd_buffer->esgs_ring_size_needed = graphics_pipeline->esgs_ring_size;
5611      if (graphics_pipeline->gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
5612         cmd_buffer->gsvs_ring_size_needed = graphics_pipeline->gsvs_ring_size;
5613
5614      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
5615         cmd_buffer->tess_rings_needed = true;
5616      if (mesh_shading)
5617         cmd_buffer->mesh_scratch_ring_needed |=
5618            pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;
5619
5620      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) {
5621         if (!cmd_buffer->ace_internal.cs) {
5622            cmd_buffer->ace_internal.cs = radv_ace_internal_create(cmd_buffer);
5623            if (!cmd_buffer->ace_internal.cs)
5624               return;
5625         }
5626
5627         cmd_buffer->task_rings_needed = true;
5628      }
5629      break;
5630   }
5631   default:
5632      assert(!"invalid bind point");
5633      break;
5634   }
5635}
5636
5637VKAPI_ATTR void VKAPI_CALL
5638radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
5639                    const VkViewport *pViewports)
5640{
5641   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5642   struct radv_cmd_state *state = &cmd_buffer->state;
5643   ASSERTED const uint32_t total_count = firstViewport + viewportCount;
5644
5645   assert(firstViewport < MAX_VIEWPORTS);
5646   assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
5647
5648   if (state->dynamic.viewport.count < total_count)
5649      state->dynamic.viewport.count = total_count;
5650
5651   memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
5652          viewportCount * sizeof(*pViewports));
5653   for (unsigned i = 0; i < viewportCount; i++) {
5654      radv_get_viewport_xform(&pViewports[i],
5655                              state->dynamic.viewport.xform[i + firstViewport].scale,
5656                              state->dynamic.viewport.xform[i + firstViewport].translate);
5657   }
5658
5659   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
5660}
5661
5662VKAPI_ATTR void VKAPI_CALL
5663radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
5664                   const VkRect2D *pScissors)
5665{
5666   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5667   struct radv_cmd_state *state = &cmd_buffer->state;
5668   ASSERTED const uint32_t total_count = firstScissor + scissorCount;
5669
5670   assert(firstScissor < MAX_SCISSORS);
5671   assert(total_count >= 1 && total_count <= MAX_SCISSORS);
5672
5673   if (state->dynamic.scissor.count < total_count)
5674      state->dynamic.scissor.count = total_count;
5675
5676   memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
5677          scissorCount * sizeof(*pScissors));
5678
5679   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
5680}
5681
5682VKAPI_ATTR void VKAPI_CALL
5683radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
5684{
5685   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5686
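   /* The guardband computation depends on the line width and is emitted together with the
    * scissor state, so the scissors must be re-emitted when the line width changes.
    */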
5687   if (cmd_buffer->state.dynamic.line_width != lineWidth)
5688      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
5689
5690   cmd_buffer->state.dynamic.line_width = lineWidth;
5691   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
5692}
5693
5694VKAPI_ATTR void VKAPI_CALL
5695radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
5696                     float depthBiasClamp, float depthBiasSlopeFactor)
5697{
5698   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5699   struct radv_cmd_state *state = &cmd_buffer->state;
5700
5701   state->dynamic.depth_bias.bias = depthBiasConstantFactor;
5702   state->dynamic.depth_bias.clamp = depthBiasClamp;
5703   state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
5704
5705   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
5706}
5707
5708VKAPI_ATTR void VKAPI_CALL
5709radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
5710{
5711   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5712   struct radv_cmd_state *state = &cmd_buffer->state;
5713
5714   memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
5715
5716   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
5717}
5718
5719VKAPI_ATTR void VKAPI_CALL
5720radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
5721{
5722   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5723   struct radv_cmd_state *state = &cmd_buffer->state;
5724
5725   state->dynamic.depth_bounds.min = minDepthBounds;
5726   state->dynamic.depth_bounds.max = maxDepthBounds;
5727
5728   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
5729}
5730
5731VKAPI_ATTR void VKAPI_CALL
5732radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5733                              uint32_t compareMask)
5734{
5735   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5736   struct radv_cmd_state *state = &cmd_buffer->state;
5737
5738   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5739      state->dynamic.stencil_compare_mask.front = compareMask;
5740   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5741      state->dynamic.stencil_compare_mask.back = compareMask;
5742
5743   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
5744}
5745
5746VKAPI_ATTR void VKAPI_CALL
5747radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5748                            uint32_t writeMask)
5749{
5750   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5751   struct radv_cmd_state *state = &cmd_buffer->state;
5752
5753   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5754      state->dynamic.stencil_write_mask.front = writeMask;
5755   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5756      state->dynamic.stencil_write_mask.back = writeMask;
5757
5758   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
5759}
5760
5761VKAPI_ATTR void VKAPI_CALL
5762radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5763                            uint32_t reference)
5764{
5765   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5766
5767   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5768      cmd_buffer->state.dynamic.stencil_reference.front = reference;
5769   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5770      cmd_buffer->state.dynamic.stencil_reference.back = reference;
5771
5772   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
5773}
5774
5775VKAPI_ATTR void VKAPI_CALL
5776radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
5777                               uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
5778{
5779   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5780   struct radv_cmd_state *state = &cmd_buffer->state;
5781   ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
5782
5783   assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
5784   assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
5785
5786   typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
5787                pDiscardRectangles, discardRectangleCount);
5788
5789   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
5790}
5791
5792VKAPI_ATTR void VKAPI_CALL
5793radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
5794                              const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
5795{
5796   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5797   struct radv_cmd_state *state = &cmd_buffer->state;
5798
5799   assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
5800
5801   state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
5802   state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
5803   state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
5804   typed_memcpy(&state->dynamic.sample_location.locations[0],
5805                pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);
5806
5807   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
5808}
5809
5810VKAPI_ATTR void VKAPI_CALL
5811radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
5812                          uint16_t lineStipplePattern)
5813{
5814   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5815   struct radv_cmd_state *state = &cmd_buffer->state;
5816
5817   state->dynamic.line_stipple.factor = lineStippleFactor;
5818   state->dynamic.line_stipple.pattern = lineStipplePattern;
5819
5820   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
5821}
5822
5823VKAPI_ATTR void VKAPI_CALL
5824radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
5825{
5826   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5827   struct radv_cmd_state *state = &cmd_buffer->state;
5828
5829   state->dynamic.cull_mode = cullMode;
5830
5831   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
5832}
5833
5834VKAPI_ATTR void VKAPI_CALL
5835radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
5836{
5837   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5838   struct radv_cmd_state *state = &cmd_buffer->state;
5839
5840   state->dynamic.front_face = frontFace;
5841
5842   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
5843}
5844
5845VKAPI_ATTR void VKAPI_CALL
5846radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology)
5847{
5848   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5849   struct radv_cmd_state *state = &cmd_buffer->state;
5850   unsigned primitive_topology = si_translate_prim(primitiveTopology);
5851
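   /* The line stipple auto-reset behavior differs between line strips and other line topologies,
    * so re-emit it when switching to or from a line strip.
    */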
5852   if ((state->dynamic.primitive_topology == V_008958_DI_PT_LINESTRIP) !=
5853       (primitive_topology == V_008958_DI_PT_LINESTRIP))
5854      state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
5855
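   /* Switching between points/lines and triangles changes the guardband computation, which is
    * emitted with the scissor state.
    */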
5856   if (radv_prim_is_points_or_lines(state->dynamic.primitive_topology) !=
5857       radv_prim_is_points_or_lines(primitive_topology))
5858      state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
5859
5860   state->dynamic.primitive_topology = primitive_topology;
5861
5862   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
5863}
5864
5865VKAPI_ATTR void VKAPI_CALL
5866radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount,
5867                             const VkViewport *pViewports)
5868{
5869   radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
5870}
5871
5872VKAPI_ATTR void VKAPI_CALL
5873radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount,
5874                            const VkRect2D *pScissors)
5875{
5876   radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
5877}
5878
5879VKAPI_ATTR void VKAPI_CALL
5880radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
5882{
5883   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5884   struct radv_cmd_state *state = &cmd_buffer->state;
5885
5886   state->dynamic.depth_test_enable = depthTestEnable;
5887
5888   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
5889}
5890
5891VKAPI_ATTR void VKAPI_CALL
5892radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
5893{
5894   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5895   struct radv_cmd_state *state = &cmd_buffer->state;
5896
5897   state->dynamic.depth_write_enable = depthWriteEnable;
5898
5899   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
5900}
5901
5902VKAPI_ATTR void VKAPI_CALL
5903radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
5904{
5905   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5906   struct radv_cmd_state *state = &cmd_buffer->state;
5907
5908   state->dynamic.depth_compare_op = depthCompareOp;
5909
5910   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
5911}
5912
5913VKAPI_ATTR void VKAPI_CALL
5914radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
5915{
5916   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5917   struct radv_cmd_state *state = &cmd_buffer->state;
5918
5919   state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
5920
5921   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
5922}
5923
5924VKAPI_ATTR void VKAPI_CALL
5925radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
5926{
5927   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5928   struct radv_cmd_state *state = &cmd_buffer->state;
5929
5930   state->dynamic.stencil_test_enable = stencilTestEnable;
5931
5932   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
5933}
5934
5935VKAPI_ATTR void VKAPI_CALL
5936radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5937                     VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
5938                     VkCompareOp compareOp)
5939{
5940   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5941   struct radv_cmd_state *state = &cmd_buffer->state;
5942
5943   if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
5944      state->dynamic.stencil_op.front.fail_op = failOp;
5945      state->dynamic.stencil_op.front.pass_op = passOp;
5946      state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
5947      state->dynamic.stencil_op.front.compare_op = compareOp;
5948   }
5949
5950   if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
5951      state->dynamic.stencil_op.back.fail_op = failOp;
5952      state->dynamic.stencil_op.back.pass_op = passOp;
5953      state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
5954      state->dynamic.stencil_op.back.compare_op = compareOp;
5955   }
5956
5957   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
5958}
5959
5960VKAPI_ATTR void VKAPI_CALL
5961radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
5962                                  const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
5963{
5964   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5965   struct radv_cmd_state *state = &cmd_buffer->state;
5966
5967   state->dynamic.fragment_shading_rate.size = *pFragmentSize;
5968   for (unsigned i = 0; i < 2; i++)
5969      state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];
5970
5971   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
5972}
5973
5974VKAPI_ATTR void VKAPI_CALL
5975radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
5976{
5977   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5978   struct radv_cmd_state *state = &cmd_buffer->state;
5979
5980   state->dynamic.depth_bias_enable = depthBiasEnable;
5981
5982   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
5983}
5984
5985VKAPI_ATTR void VKAPI_CALL
5986radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
5987{
5988   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5989   struct radv_cmd_state *state = &cmd_buffer->state;
5990
5991   state->dynamic.primitive_restart_enable = primitiveRestartEnable;
5992
5993   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
5994}
5995
5996VKAPI_ATTR void VKAPI_CALL
5997radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable)
5998{
5999   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6000   struct radv_cmd_state *state = &cmd_buffer->state;
6001
6002   state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;
6003
6004   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
6005}
6006
6007VKAPI_ATTR void VKAPI_CALL
6008radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
6009{
6010   /* not implemented */
6011}
6012
6013VKAPI_ATTR void VKAPI_CALL
6014radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
6015{
6016   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6017   struct radv_cmd_state *state = &cmd_buffer->state;
6018   unsigned logic_op = si_translate_blend_logic_op(logicOp);
6019
6020   state->dynamic.logic_op = logic_op;
6021
6022   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
6023}
6024
6025VKAPI_ATTR void VKAPI_CALL
6026radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
6027                               const VkBool32 *pColorWriteEnables)
6028{
6029   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6030   struct radv_cmd_state *state = &cmd_buffer->state;
6031   uint32_t color_write_enable = 0;
6032
6033   assert(attachmentCount <= MAX_RTS);
6034
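   /* Pack one 4-bit RGBA enable nibble per attachment, matching the CB_TARGET_MASK layout so it
    * can be combined with the pipeline's color write mask.
    */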
6035   for (uint32_t i = 0; i < attachmentCount; i++) {
6036      color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
6037   }
6038
6039   state->dynamic.color_write_enable = color_write_enable;
6040
6041   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
6042}
6043
6044VKAPI_ATTR void VKAPI_CALL
6045radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
6046                          const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
6047                          uint32_t vertexAttributeDescriptionCount,
6048                          const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
6049{
6050   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6051   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
6052
6053   const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
6054   for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
6055      bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];
6056
6057   cmd_buffer->state.vbo_misaligned_mask = 0;
6058   cmd_buffer->state.vbo_misaligned_mask_invalid = 0;
6059
6060   memset(state, 0, sizeof(*state));
6061   state->bindings_match_attrib = true;
6062
6063   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
6064   for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
6065      const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
6066      const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
6067      unsigned loc = attrib->location;
6068
6069      state->attribute_mask |= 1u << loc;
6070      state->bindings[loc] = attrib->binding;
6071      if (attrib->binding != loc)
6072         state->bindings_match_attrib = false;
6073      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
6074         state->instance_rate_inputs |= 1u << loc;
6075         state->divisors[loc] = binding->divisor;
6076         if (binding->divisor == 0) {
6077            state->zero_divisors |= 1u << loc;
6078         } else if (binding->divisor > 1) {
6079            state->nontrivial_divisors |= 1u << loc;
6080         }
6081      }
6082      cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
6083      state->offsets[loc] = attrib->offset;
6084
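      /* Look up the translated HW vertex format in the per-cmdbuffer cache so the same VkFormat
       * is not re-translated on every call.
       */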
6085      struct dynamic_vertex_format_cache *found = NULL;
6086      util_dynarray_foreach(&cmd_buffer->cached_vertex_formats,
6087                            struct dynamic_vertex_format_cache,
6088                            vf) {
6089         if (vf->format == attrib->format) {
6090            found = vf;
6091            break;
6092         }
6093      }
6094      if (!found) {
6095         unsigned nfmt, dfmt;
6096         bool post_shuffle;
6097         enum radv_vs_input_alpha_adjust alpha_adjust;
6098         const struct util_format_description *format_desc = vk_format_description(attrib->format);
6099
6100         found = util_dynarray_grow(&cmd_buffer->cached_vertex_formats,
6101                                    struct dynamic_vertex_format_cache, 1);
6102         radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
6103                                      &dfmt, &nfmt, &post_shuffle, &alpha_adjust);
6104         found->format = attrib->format;
6105         found->hw_fmt = dfmt | (nfmt << 4);
6106         const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
6107            (format_desc->block.bits / 8u - 1);
6108         found->fmt_align_req_minus_1 = format_align_req_minus_1;
6109         found->fmt_size = format_desc->block.bits / 8u;
6110         found->post_shuffle = post_shuffle;
6111         found->alpha_adjust_lo = alpha_adjust & 0x1;
6112         found->alpha_adjust_hi = (alpha_adjust >> 1) & 0x1;
6113      }
6114
6115      state->formats[loc] = found->hw_fmt;
6116      state->format_align_req_minus_1[loc] = found->fmt_align_req_minus_1;
6117      state->format_sizes[loc] = found->fmt_size;
6118      state->alpha_adjust_lo |= found->alpha_adjust_lo << loc;
6119      state->alpha_adjust_hi |= found->alpha_adjust_hi << loc;
6120      if (found->post_shuffle)
6121         state->post_shuffle |= 1u << loc;
6122
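      /* GFX6 and GFX10+ have vertex fetch alignment requirements; record misaligned attributes
       * so the vertex fetch code can fall back to a slower path.
       */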
6123      if ((chip == GFX6 || chip >= GFX10) &&
6124          cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
6125         if (binding->stride & found->fmt_align_req_minus_1) {
6126            cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
6127         } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + state->offsets[loc]) &
6128                    found->fmt_align_req_minus_1) {
6129            cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
6130         }
6131      }
6132   }
6133
6134   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
6135                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
6136}
6137
6138VKAPI_ATTR void VKAPI_CALL
6139radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
6140                        const VkCommandBuffer *pCmdBuffers)
6141{
6142   RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
6143
6144   assert(commandBufferCount > 0);
6145
6146   radv_emit_mip_change_flush_default(primary);
6147
6148   /* Emit pending flushes on primary prior to executing secondary */
6149   si_emit_cache_flush(primary);
6150
6151   /* Make sure CP DMA is idle on primary prior to executing secondary. */
6152   si_cp_dma_wait_for_idle(primary);
6153
6154   for (uint32_t i = 0; i < commandBufferCount; i++) {
6155      RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
6156      bool allow_ib2 = true;
6157
6158      if (secondary->device->physical_device->rad_info.gfx_level == GFX7 &&
6159          secondary->state.uses_draw_indirect_multi) {
6160         /* Do not launch an IB2 for secondary command buffers that contain
6161          * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hangs the GPU.
6162          */
6163         allow_ib2 = false;
6164      }
6165
6166      if (secondary->qf == RADV_QUEUE_COMPUTE) {
6167         /* IB2 packets are not supported on compute queues according to PAL. */
6168         allow_ib2 = false;
6169      }
6170
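      /* Propagate the secondary's scratch and ring requirements so that enough backing memory is
       * allocated for the primary at submit time.
       */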
6171      primary->scratch_size_per_wave_needed =
6172         MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
6173      primary->scratch_waves_wanted =
6174         MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
6175      primary->compute_scratch_size_per_wave_needed =
6176         MAX2(primary->compute_scratch_size_per_wave_needed,
6177              secondary->compute_scratch_size_per_wave_needed);
6178      primary->compute_scratch_waves_wanted =
6179         MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
6180
6181      if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
6182         primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
6183      if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
6184         primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
6185      if (secondary->tess_rings_needed)
6186         primary->tess_rings_needed = true;
6187      if (secondary->task_rings_needed)
6188         primary->task_rings_needed = true;
6189      if (secondary->mesh_scratch_ring_needed)
6190         primary->mesh_scratch_ring_needed = true;
6191      if (secondary->sample_positions_needed)
6192         primary->sample_positions_needed = true;
6193      if (secondary->gds_needed)
6194         primary->gds_needed = true;
6195
6196      if (!secondary->state.framebuffer && primary->state.pass && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
6197         /* Emit the framebuffer state from the primary if the secondary
6198          * has been recorded without a framebuffer; otherwise
6199          * fast color/depth clears can't work.
6200          */
6201         radv_emit_fb_mip_change_flush(primary);
6202         radv_emit_framebuffer_state(primary);
6203      }
6204
6205      if (secondary->ace_internal.cs) {
6206         if (!primary->ace_internal.cs) {
6207            primary->ace_internal.cs = radv_ace_internal_create(primary);
6208            if (!primary->ace_internal.cs)
6209               return;
6210         }
6211
6212         struct radeon_cmdbuf *ace_primary = primary->ace_internal.cs;
6213         struct radeon_cmdbuf *ace_secondary = secondary->ace_internal.cs;
6214
6215         /* Emit pending flushes on primary prior to executing secondary. */
6216         radv_ace_internal_cache_flush(primary);
6217
6218         /* Wait for primary GFX->ACE semaphore, if necessary. */
6219         if (radv_flush_gfx2ace_semaphore(primary))
6220            radv_wait_gfx2ace_semaphore(primary);
6221
6222         /* Execute the secondary compute cmdbuf.
6223          * Don't use IB2 packets because they are not supported on compute queues.
6224          */
6225         primary->device->ws->cs_execute_secondary(ace_primary, ace_secondary, false);
6226      }
6227
6228      /* Update pending ACE internal flush bits from the secondary cmdbuf */
6229      primary->ace_internal.flush_bits |= secondary->ace_internal.flush_bits;
6230
6231      /* Increment primary semaphore if secondary was dirty.
6232       * This happens when the secondary cmdbuf has a barrier which
6233       * isn't consumed by a draw call.
6234       */
6235      if (radv_ace_internal_sem_dirty(secondary))
6236         primary->ace_internal.sem.gfx2ace_value++;
6237
6238      primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
6239
6240      /* When the secondary command buffer is compute-only, we don't
6241       * need to re-emit the current graphics pipeline.
6242       */
6243      if (secondary->state.emitted_graphics_pipeline) {
6244         primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
6245      }
6246
6247      /* When the secondary command buffer is graphics-only, we don't
6248       * need to re-emit the current compute pipeline.
6249       */
6250      if (secondary->state.emitted_compute_pipeline) {
6251         primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
6252      }
6253
6254      /* Only re-emit the draw packets when needed. */
6255      if (secondary->state.last_primitive_reset_en != -1) {
6256         primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
6257      }
6258
6259      if (secondary->state.last_primitive_reset_index) {
6260         primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
6261      }
6262
6263      if (secondary->state.last_ia_multi_vgt_param) {
6264         primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
6265      }
6266
6267      primary->state.last_first_instance = secondary->state.last_first_instance;
6268      primary->state.last_num_instances = secondary->state.last_num_instances;
6269      primary->state.last_drawid = secondary->state.last_drawid;
6270      primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count;
6271      primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
6272      primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
6273      primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
6274      primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;
6275
6276      if (secondary->state.last_index_type != -1) {
6277         primary->state.last_index_type = secondary->state.last_index_type;
6278      }
6279
6280      primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
6281      primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
6282      primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
6283
6284      primary->state.last_vrs_rates = secondary->state.last_vrs_rates;
6285      primary->state.last_vrs_rates_sgpr_idx = secondary->state.last_vrs_rates_sgpr_idx;
6286   }
6287
6288   /* After executing commands from secondary buffers we have to mark
6289    * some states as dirty.
6290    */
6291   primary->state.dirty |=
6292      RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL;
6293   radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
6294   radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
6295}
6296
6297VKAPI_ATTR VkResult VKAPI_CALL
6298radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo,
6299                       const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool)
6300{
6301   RADV_FROM_HANDLE(radv_device, device, _device);
6302   struct radv_cmd_pool *pool;
6303
6304   pool =
6305      vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
6306   if (pool == NULL)
6307      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
6308
6309   VkResult result = vk_command_pool_init(&pool->vk, &device->vk, pCreateInfo, pAllocator);
6310   if (result != VK_SUCCESS) {
6311      vk_free2(&device->vk.alloc, pAllocator, pool);
6312      return result;
6313   }
6314
6315   list_inithead(&pool->cmd_buffers);
6316   list_inithead(&pool->free_cmd_buffers);
6317
6318   *pCmdPool = radv_cmd_pool_to_handle(pool);
6319
6320   return VK_SUCCESS;
6321}
6322
6323VKAPI_ATTR void VKAPI_CALL
6324radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool,
6325                        const VkAllocationCallbacks *pAllocator)
6326{
6327   RADV_FROM_HANDLE(radv_device, device, _device);
6328   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6329
6330   if (!pool)
6331      return;
6332
6333   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
6334   {
6335      radv_destroy_cmd_buffer(cmd_buffer);
6336   }
6337
6338   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
6339   {
6340      radv_destroy_cmd_buffer(cmd_buffer);
6341   }
6342
6343   vk_command_pool_finish(&pool->vk);
6344   vk_free2(&device->vk.alloc, pAllocator, pool);
6345}
6346
6347VKAPI_ATTR VkResult VKAPI_CALL
6348radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags)
6349{
6350   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6351   VkResult result;
6352
6353   list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
6354   {
6355      result = radv_reset_cmd_buffer(cmd_buffer);
6356      if (result != VK_SUCCESS)
6357         return result;
6358   }
6359
6360   return VK_SUCCESS;
6361}
6362
6363VKAPI_ATTR void VKAPI_CALL
6364radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
6365{
6366   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6367
6368   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
6369   {
6370      radv_destroy_cmd_buffer(cmd_buffer);
6371   }
6372}
6373
6374static void
6375radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
6376{
6377   struct radv_cmd_state *state = &cmd_buffer->state;
6378   struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
6379
6380   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);
6381
6382   radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier);
6383
6384   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
6385
6386   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
6387
6388   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
6389      const uint32_t a = subpass->attachments[i].attachment;
6390      if (a == VK_ATTACHMENT_UNUSED)
6391         continue;
6392
6393      radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
6394   }
6395
6396   radv_ace_internal_barrier(cmd_buffer, 0, 0);
6397   radv_describe_barrier_end(cmd_buffer);
6398
6399   radv_cmd_buffer_clear_subpass(cmd_buffer);
6400
6401   if (subpass->vrs_attachment) {
6402      int idx = subpass->vrs_attachment->attachment;
6403      struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;
6404
6405      if (subpass->depth_stencil_attachment) {
6406         /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
6407          * copy the VRS rates to the HTILE buffer of the attachment.
6408          */
6409         int ds_idx = subpass->depth_stencil_attachment->attachment;
6410         struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
6411         struct radv_image *ds_image = ds_iview->image;
6412         uint32_t level = ds_iview->vk.base_mip_level;
6413
6414         VkExtent2D extent = {
6415            .width = radv_minify(ds_image->info.width, level),
6416            .height = radv_minify(ds_image->info.height, level),
6417         };
6418
6419         /* HTILE buffer */
6420         uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset +
6421                                 ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset;
6422         uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size;
6423         struct radv_buffer htile_buffer;
6424
6425         radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bindings[0].bo, htile_size, htile_offset);
6426
6427         /* Copy the VRS rates to the HTILE buffer. */
6428         radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true);
6429
6430         radv_buffer_finish(&htile_buffer);
6431      } else {
6432         /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have
6433          * to copy the VRS rates to our internal HTILE buffer.
6434          */
6435         struct vk_framebuffer *fb = cmd_buffer->state.framebuffer;
6436         struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
6437
6438         if (ds_image) {
6439            /* HTILE buffer */
6440            struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
6441
6442            VkExtent2D extent = {
6443               .width = MIN2(fb->width, ds_image->info.width),
6444               .height = MIN2(fb->height, ds_image->info.height),
6445            };
6446
6447            /* Copy the VRS rates to the HTILE buffer. */
6448            radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false);
6449         }
6450      }
6451   }
6452
6453   assert(cmd_buffer->cs->cdw <= cdw_max);
6454}
6455
6456static void
6457radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
6458{
6459   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
6460
6461   /* Have to be conservative in cmdbuffers with inherited attachments. */
6462   if (!cmd_buffer->state.attachments) {
6463      cmd_buffer->state.rb_noncoherent_dirty = true;
6464      return;
6465   }
6466
6467   for (uint32_t i = 0; i < subpass->color_count; ++i) {
6468      const uint32_t a = subpass->color_attachments[i].attachment;
6469      if (a == VK_ATTACHMENT_UNUSED)
6470         continue;
6471      if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
6472         cmd_buffer->state.rb_noncoherent_dirty = true;
6473         return;
6474      }
6475   }
6476   if (subpass->depth_stencil_attachment &&
6477       !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
6478           .iview->image->l2_coherent)
6479      cmd_buffer->state.rb_noncoherent_dirty = true;
6480}
6481
6482void
6483radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
6484                                const struct radv_subpass *subpass)
6485{
6486   radv_mark_noncoherent_rb(cmd_buffer);
6487   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
6488}
6489
6490static void
6491radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
6492{
6493   struct radv_cmd_state *state = &cmd_buffer->state;
6494   const struct radv_subpass *subpass = state->subpass;
6495   uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
6496
6497   radv_cmd_buffer_resolve_subpass(cmd_buffer);
6498
6499   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
6500
6501   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
6502      const uint32_t a = subpass->attachments[i].attachment;
6503      if (a == VK_ATTACHMENT_UNUSED)
6504         continue;
6505
6506      if (state->pass->attachments[a].last_subpass_idx != subpass_id)
6507         continue;
6508
6509      VkImageLayout layout = state->pass->attachments[a].final_layout;
6510      VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
6511      struct radv_subpass_attachment att = {a, layout, stencil_layout};
6512      radv_handle_subpass_image_transition(cmd_buffer, att, false);
6513   }
6514
6515   radv_ace_internal_barrier(cmd_buffer, 0, 0);
6516   radv_describe_barrier_end(cmd_buffer);
6517}
6518
6519VKAPI_ATTR void VKAPI_CALL
6520radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
6521                         const VkRenderPassBeginInfo *pRenderPassBeginInfo,
6522                         const VkSubpassBeginInfo *pSubpassBeginInfo)
6523{
6524   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6525   RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBeginInfo->renderPass);
6526   RADV_FROM_HANDLE(vk_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
6527   VkResult result;
6528
6529   cmd_buffer->state.framebuffer = framebuffer;
6530   cmd_buffer->state.pass = pass;
6531   cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;
6532
6533   result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBeginInfo);
6534   if (result != VK_SUCCESS)
6535      return;
6536
6537   result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBeginInfo);
6538   if (result != VK_SUCCESS)
6539      return;
6540
6541   radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
6542}
6543
6544VKAPI_ATTR void VKAPI_CALL
6545radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
6546                     const VkSubpassEndInfo *pSubpassEndInfo)
6547{
6548   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6549
6550   radv_mark_noncoherent_rb(cmd_buffer);
6551
6552   uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
6553   radv_cmd_buffer_end_subpass(cmd_buffer);
6554   radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
6555}
6556
6557static void
6558radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, struct radv_graphics_pipeline *pipeline,
6559                               unsigned stage, unsigned index)
6560{
6561   struct radv_userdata_info *loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_VIEW_INDEX);
6562   if (loc->sgpr_idx == -1)
6563      return;
6564   uint32_t base_reg = pipeline->base.user_data_0[stage];
6565   radeon_set_sh_reg(cs, base_reg + loc->sgpr_idx * 4, index);
6566}
6567
6568static void
6569radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
6570{
6571   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
6572
6573   radv_foreach_stage(stage, pipeline->active_stages & ~VK_SHADER_STAGE_TASK_BIT_NV) {
6574      radv_emit_view_index_per_stage(cmd_buffer->cs, pipeline, stage, index);
6575   }
6576   if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
6577      struct radv_userdata_info *loc =
6578         &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
6579      if (loc->sgpr_idx != -1) {
6580         uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
6581         radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
6582      }
6583   }
6584   if (pipeline->active_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
6585      radv_emit_view_index_per_stage(cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
6586                                     index);
6587   }
6588}
6589
6590/**
6591 * Emulates predication for MEC using COND_EXEC.
6592 * When the current command buffer is predicating, emit a COND_EXEC packet
6593 * so that the MEC skips the next few dwords worth of packets.
6594 *
6595 * To make it work with inverted conditional rendering, we allocate
6596 * space in the upload BO and emit some packets to invert the condition.
6597 */
6598static void
6599radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs,
6600                                 uint64_t inv_va, bool *inv_emitted, unsigned dwords)
6601{
6602   if (!state->predicating)
6603      return;
6604
6605   uint64_t va = state->predication_va;
6606
6607   if (!state->predication_type) {
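      /* predication_type == 0 means inverted conditional rendering: draw when the
       * user-provided value is zero. COND_EXEC can only skip the following packets while
       * the value at the given VA is zero, so maintain an inverted copy of the condition
       * at inv_va and predicate on that instead.
       */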
6608      /* Invert the condition the first time it is needed. */
6609      if (!*inv_emitted) {
6610         *inv_emitted = true;
6611
6612         /* Write 1 to the inverted predication VA. */
6613         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6614         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
6615                            COPY_DATA_WR_CONFIRM);
6616         radeon_emit(cs, 1);
6617         radeon_emit(cs, 0);
6618         radeon_emit(cs, inv_va);
6619         radeon_emit(cs, inv_va >> 32);
6620
         /* If the value at the API predication VA is 0, skip the next packet. */
6622         radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
6623         radeon_emit(cs, va);
6624         radeon_emit(cs, va >> 32);
6625         radeon_emit(cs, 0);
6626         radeon_emit(cs, 6); /* 1x COPY_DATA size */
6627
6628         /* Write 0 to the new predication VA (when the API condition != 0) */
6629         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6630         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
6631                            COPY_DATA_WR_CONFIRM);
6632         radeon_emit(cs, 0);
6633         radeon_emit(cs, 0);
6634         radeon_emit(cs, inv_va);
6635         radeon_emit(cs, inv_va >> 32);
6636      }
6637
6638      va = inv_va;
6639   }
6640
6641   radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
6642   radeon_emit(cs, va);
6643   radeon_emit(cs, va >> 32);
6644   radeon_emit(cs, 0); /* Cache policy */
6645   radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
6646}
6647
6648static void
6649radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
6650                         uint32_t use_opaque)
6651{
6652   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
6653   radeon_emit(cmd_buffer->cs, vertex_count);
6654   radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
6655}
6656
6657/**
 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
6659 *
6660 * The starting address "index_va" may point anywhere within the index buffer. The number of
6661 * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
6662 * Hardware uses this information to return 0 for out-of-bounds reads.
6663 */
6664static void
6665radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
6666                                 uint32_t max_index_count, uint32_t index_count, bool not_eop)
6667{
6668   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
6669   radeon_emit(cmd_buffer->cs, max_index_count);
6670   radeon_emit(cmd_buffer->cs, index_va);
6671   radeon_emit(cmd_buffer->cs, index_va >> 32);
6672   radeon_emit(cmd_buffer->cs, index_count);
6673   /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
6674    * can be changed between draws and GS fast launch must be disabled.
6675    * NOT_EOP doesn't work on gfx9 and older.
6676    */
6677   radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
6678}
6679
6680/* MUST inline this function to avoid massive perf loss in drawoverhead */
6681ALWAYS_INLINE static void
6682radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
6683                                  uint32_t draw_count, uint64_t count_va, uint32_t stride)
6684{
6685   struct radeon_cmdbuf *cs = cmd_buffer->cs;
6686   const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
6687   bool draw_id_enable = cmd_buffer->state.graphics_pipeline->uses_drawid;
6688   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
6689   uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
6690   bool predicating = cmd_buffer->state.predicating;
6691   bool mesh = cmd_buffer->state.mesh_shading;
6692   assert(base_reg);
6693
6694   /* just reset draw state for vertex data */
6695   cmd_buffer->state.last_first_instance = -1;
6696   cmd_buffer->state.last_num_instances = -1;
6697   cmd_buffer->state.last_drawid = -1;
6698   cmd_buffer->state.last_vertex_offset = -1;
6699
6700   vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
6701   if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
6702      start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
6703   if (draw_id_enable)
6704      draw_id_reg = ((base_reg + mesh * 12 + 4) - SI_SH_REG_OFFSET) >> 2;
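   /* These register offsets match the user SGPR layout programmed by the userdata helpers
    * below: for vertex shaders it is vertex offset, then draw id (optional), then start
    * instance (optional), starting at vtx_base_sgpr; for mesh shaders it is first task,
    * then the XYZ dimensions, then the draw id, hence the extra mesh * 12 bytes.
    */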
6705
6706   if (draw_count == 1 && !count_va && !draw_id_enable) {
6707      radeon_emit(cs,
6708                  PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
6709      radeon_emit(cs, 0);
6710      radeon_emit(cs, vertex_offset_reg);
6711      radeon_emit(cs, start_instance_reg);
6712      radeon_emit(cs, di_src_sel);
6713   } else {
6714      radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
6715                           predicating));
6716      radeon_emit(cs, 0);
6717      radeon_emit(cs, vertex_offset_reg);
6718      radeon_emit(cs, start_instance_reg);
6719      radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
6720                         S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
6721      radeon_emit(cs, draw_count); /* count */
6722      radeon_emit(cs, count_va);   /* count_addr */
6723      radeon_emit(cs, count_va >> 32);
6724      radeon_emit(cs, stride); /* stride */
6725      radeon_emit(cs, di_src_sel);
6726
6727      cmd_buffer->state.uses_draw_indirect_multi = true;
6728   }
6729}
6730
6731ALWAYS_INLINE static void
6732radv_cs_emit_dispatch_taskmesh_direct_ace_packet(struct radv_cmd_buffer *cmd_buffer,
6733                                                 const uint32_t x, const uint32_t y,
6734                                                 const uint32_t z)
6735{
6736   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6737   struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
6738   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6739   const bool predicating = cmd_buffer->state.predicating;
6740   const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
6741                                       S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);
6742
6743   struct radv_userdata_info *ring_entry_loc =
6744      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
6745   assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
6746
6747   uint32_t ring_entry_reg =
6748      (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
6749
6750   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
6751   radeon_emit(cs, x);
6752   radeon_emit(cs, y);
6753   radeon_emit(cs, z);
6754   radeon_emit(cs, dispatch_initiator);
6755   radeon_emit(cs, ring_entry_reg & 0xFFFF);
6756}
6757
6758ALWAYS_INLINE static void
6759radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(struct radv_cmd_buffer *cmd_buffer,
6760                                                         uint64_t data_va, uint32_t draw_count,
6761                                                         uint64_t count_va, uint32_t stride)
6762{
6763   assert((data_va & 0x03) == 0);
6764   assert((count_va & 0x03) == 0);
6765
6766   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6767   struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
6768   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6769
6770   const uint32_t count_indirect_enable = !!count_va;
6771   const uint32_t xyz_dim_enable = compute_shader->info.cs.uses_grid_size;
6772   const uint32_t draw_id_enable = compute_shader->info.vs.needs_draw_id;
6773   const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
6774                                       S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);
6775
6776   const struct radv_userdata_info *ring_entry_loc =
6777      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
6778   const struct radv_userdata_info *xyz_dim_loc =
6779      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
6780   const struct radv_userdata_info *draw_id_loc =
6781      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);
6782
6783   assert(ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
6784   assert(!xyz_dim_enable || (xyz_dim_loc->sgpr_idx != -1 && xyz_dim_loc->num_sgprs == 3));
6785   assert(!draw_id_enable || (draw_id_loc->sgpr_idx != -1 && draw_id_loc->num_sgprs == 1));
6786
6787   const uint32_t ring_entry_reg =
6788      (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
6789   const uint32_t xyz_dim_reg =
6790      !xyz_dim_enable
6791         ? 0
6792         : (R_00B900_COMPUTE_USER_DATA_0 + xyz_dim_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
6793   const uint32_t draw_id_reg =
6794      !draw_id_enable
6795         ? 0
6796         : (R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
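   /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE payload, in order: data address, ring entry SGPR
    * offset, a dword packing the enable bits (count indirect, draw id, XYZ dimensions) with
    * the draw id SGPR offset in its upper half, the XYZ dimension SGPR offset, the draw
    * count, the count buffer address, the stride and the dispatch initiator.
    */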
6797
6798   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
6799   radeon_emit(cs, data_va);
6800   radeon_emit(cs, data_va >> 32);
6801   radeon_emit(cs, ring_entry_reg & 0xFFFF);
6802   radeon_emit(cs, (count_indirect_enable << 1) | (draw_id_enable << 2) | (xyz_dim_enable << 3) |
6803                      (draw_id_reg << 16));
6804   radeon_emit(cs, xyz_dim_reg & 0xFFFF);
6805   radeon_emit(cs, draw_count);
6806   radeon_emit(cs, count_va);
6807   radeon_emit(cs, count_va >> 32);
6808   radeon_emit(cs, stride);
6809   radeon_emit(cs, dispatch_initiator);
6810}
6811
6812ALWAYS_INLINE static void
6813radv_cs_emit_dispatch_taskmesh_gfx_packet(struct radv_cmd_buffer *cmd_buffer)
6814{
6815   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6816   struct radeon_cmdbuf *cs = cmd_buffer->cs;
6817   bool predicating = cmd_buffer->state.predicating;
6818
6819   struct radv_userdata_info *ring_entry_loc =
6820      radv_lookup_user_sgpr(pipeline, MESA_SHADER_MESH, AC_UD_TASK_RING_ENTRY);
6821
6822   assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1);
6823
6824   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
6825   uint32_t xyz_dim_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
6826   uint32_t ring_entry_reg = ((base_reg + ring_entry_loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2;
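   /* For mesh shaders, vtx_base_sgpr points at the firstTask SGPR, immediately followed by
    * the XYZ dimensions, so the XYZ SGPR offset starts at base + 4
    * (see radv_emit_userdata_mesh).
    */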
6827
6828   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating));
6829   radeon_emit(cs, (ring_entry_reg << 16) | (xyz_dim_reg & 0xFFFF));
6830   radeon_emit(cs, 0);
6831   radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
6832}
6833
6834static inline void
6835radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
6836                                   const struct radv_draw_info *info, const uint32_t vertex_offset)
6837{
6838   struct radv_cmd_state *state = &cmd_buffer->state;
6839   struct radeon_cmdbuf *cs = cmd_buffer->cs;
6840   const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance;
6841   const bool uses_drawid = state->graphics_pipeline->uses_drawid;
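   /* The SGPRs at vtx_base_sgpr are written in this order: vertex offset, then draw id
    * (if used), then first instance (if used); vtx_emit_num covers exactly these.
    */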
6842
6843   radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num);
6844
6845   radeon_emit(cs, vertex_offset);
6846   state->last_vertex_offset = vertex_offset;
6847   if (uses_drawid) {
6848      radeon_emit(cs, 0);
6849      state->last_drawid = 0;
6850   }
6851   if (uses_baseinstance) {
6852      radeon_emit(cs, info->first_instance);
6853      state->last_first_instance = info->first_instance;
6854   }
6855}
6856
6857ALWAYS_INLINE static void
6858radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
6859                          const uint32_t vertex_offset)
6860{
6861   const struct radv_cmd_state *state = &cmd_buffer->state;
6862   const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance;
6863   const bool uses_drawid = state->graphics_pipeline->uses_drawid;
6864
6865   /* this looks very dumb, but it allows the compiler to optimize better and yields
6866    * ~3-4% perf increase in drawoverhead
6867    */
6868   if (vertex_offset != state->last_vertex_offset) {
6869      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6870   } else if (uses_drawid && 0 != state->last_drawid) {
6871      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6872   } else if (uses_baseinstance && info->first_instance != state->last_first_instance) {
6873      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
6874   }
6875}
6876
6877ALWAYS_INLINE static void
6878radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
6879{
6880   struct radv_cmd_state *state = &cmd_buffer->state;
6881   struct radeon_cmdbuf *cs = cmd_buffer->cs;
6882   radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, 1 + !!drawid);
6883   radeon_emit(cs, vertex_offset);
6884   state->last_vertex_offset = vertex_offset;
6885   if (drawid)
      radeon_emit(cs, drawid);
}
6889
6890ALWAYS_INLINE static void
6891radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer,
6892                        const uint32_t x, const uint32_t y, const uint32_t z,
6893                        const uint32_t first_task)
6894{
6895   struct radv_cmd_state *state = &cmd_buffer->state;
6896   struct radeon_cmdbuf *cs = cmd_buffer->cs;
6897   const bool uses_drawid = state->graphics_pipeline->uses_drawid;
6898
6899   radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num);
6900   radeon_emit(cs, first_task);
6901   radeon_emit(cs, x);
6902   radeon_emit(cs, y);
6903   radeon_emit(cs, z);
6904
6905   if (uses_drawid) {
6906      radeon_emit(cs, 0);
6907      state->last_drawid = 0;
6908   }
6909}
6910
6911ALWAYS_INLINE static void
6912radv_emit_userdata_mesh_first_task_0_draw_id_0(struct radv_cmd_buffer *cmd_buffer)
6913{
6914   struct radv_cmd_state *state = &cmd_buffer->state;
6915   struct radeon_cmdbuf *cs = cmd_buffer->cs;
6916   struct radv_graphics_pipeline *pipeline = state->graphics_pipeline;
6917   const bool uses_drawid = pipeline->uses_drawid;
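   /* firstTask lives in the first user SGPR of the mesh userdata block and the draw id,
    * when used, is the last one (see radv_emit_userdata_mesh), so zero both without
    * touching the XYZ dimensions in between.
    */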
6918
6919   radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr, 1);
6920   radeon_emit(cs, 0);
6921
6922   if (uses_drawid) {
6923      radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr + (pipeline->vtx_emit_num - 1) * 4, 1);
6924      radeon_emit(cs, 0);
6925   }
6926}
6927
6928ALWAYS_INLINE static void
6929radv_emit_userdata_task_ib_only(struct radv_cmd_buffer *cmd_buffer, uint64_t ib_va,
6930                                uint32_t ib_stride)
6931{
6932   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6933   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6934
6935   struct radv_userdata_info *task_ib_loc =
6936      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_IB);
6937
6938   if (task_ib_loc->sgpr_idx != -1) {
6939      assert(task_ib_loc->num_sgprs == 3);
6940      unsigned task_ib_reg = R_00B900_COMPUTE_USER_DATA_0 + task_ib_loc->sgpr_idx * 4;
6941
6942      radeon_set_sh_reg_seq(cs, task_ib_reg, 3);
6943      radeon_emit(cs, ib_va);
6944      radeon_emit(cs, ib_va >> 32);
6945      radeon_emit(cs, ib_stride);
6946   }
6947}
6948
6949ALWAYS_INLINE static void
6950radv_emit_userdata_task(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z,
6951                        uint32_t draw_id, uint32_t first_task, uint64_t ib_va)
6952{
6953   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6954   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6955
6956   struct radv_userdata_info *xyz_loc =
6957      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
6958   struct radv_userdata_info *draw_id_loc =
6959      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);
6960
6961   if (xyz_loc->sgpr_idx != -1) {
6962      assert(xyz_loc->num_sgprs == 3);
6963      unsigned xyz_reg = R_00B900_COMPUTE_USER_DATA_0 + xyz_loc->sgpr_idx * 4;
6964
6965      radeon_set_sh_reg_seq(cs, xyz_reg, 3);
6966      radeon_emit(cs, x);
6967      radeon_emit(cs, y);
6968      radeon_emit(cs, z);
6969   }
6970
6971   if (draw_id_loc->sgpr_idx != -1) {
6972      assert(draw_id_loc->num_sgprs == 1);
6973      unsigned draw_id_reg = R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4;
6974
6975      radeon_set_sh_reg_seq(cs, draw_id_reg, 1);
6976      radeon_emit(cs, draw_id);
6977   }
6978
6979   radv_emit_userdata_task_ib_only(cmd_buffer, ib_va, first_task ? 8 : 0);
6980}
6981
6982ALWAYS_INLINE static void
6983radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
6984                               const struct radv_draw_info *info,
6985                               uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo,
6986                               uint32_t stride,
6987                               const int32_t *vertexOffset)
6988
6989{
6990   struct radv_cmd_state *state = &cmd_buffer->state;
6991   struct radeon_cmdbuf *cs = cmd_buffer->cs;
6992   const int index_size = radv_get_vgt_index_size(state->index_type);
6993   unsigned i = 0;
6994   const bool uses_drawid = state->graphics_pipeline->uses_drawid;
6995   const bool can_eop =
6996      !uses_drawid && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10;
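   /* Merging draws with NOT_EOP is only attempted on GFX10+ and when the draw id is unused,
    * since the draw id would have to change between the merged draws.
    */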
6997
6998   if (uses_drawid) {
6999      if (vertexOffset) {
7000         radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
7001         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
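            /* MAX2 clamps remaining_indexes to 0 when firstIndex points past the end of
             * the index buffer, avoiding unsigned underflow.
             */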
7002            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
7003
7004            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
7005            if (!remaining_indexes &&
7006                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
7007               continue;
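            /* Only the draw id changes between iterations; it lives in the SGPR right
             * after the vertex offset, hence vtx_base_sgpr + sizeof(uint32_t).
             */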
7008
7009            if (i > 0)
7010               radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i);
7011
7012            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
7013
7014            if (!state->subpass->view_mask) {
7015               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
7016            } else {
7017               u_foreach_bit(view, state->subpass->view_mask) {
7018                  radv_emit_view_index(cmd_buffer, view);
7019
7020                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
7021               }
7022            }
7023         }
7024      } else {
7025         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
7026            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
7027
7028            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
7029            if (!remaining_indexes &&
7030                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
7031               continue;
7032
7033            if (i > 0) {
7034               if (state->last_vertex_offset != draw->vertexOffset)
7035                  radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
7036               else
7037                  radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i);
7038            } else
7039               radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
7040
7041            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
7042
7043            if (!state->subpass->view_mask) {
7044               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
7045            } else {
7046               u_foreach_bit(view, state->subpass->view_mask) {
7047                  radv_emit_view_index(cmd_buffer, view);
7048
7049                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
7050               }
7051            }
7052         }
7053      }
7054      if (drawCount > 1) {
7055         state->last_drawid = drawCount - 1;
7056      }
7057   } else {
7058      if (vertexOffset) {
7059         if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10) {
            /* GFX10 has a bug where, in a chain of draws using NOT_EOP, the last draw
             * (the one without NOT_EOP) must not have count == 0, so drop trailing
             * empty draws.
             */
7063            while (drawCount > 1) {
7064               const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride);
7065               if (last->indexCount)
7066                  break;
7067               drawCount--;
7068            }
7069         }
7070
7071         radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
7072         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
7073            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
7074
7075            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
7076            if (!remaining_indexes &&
7077                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
7078               continue;
7079
7080            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
7081
7082            if (!state->subpass->view_mask) {
7083               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1);
7084            } else {
7085               u_foreach_bit(view, state->subpass->view_mask) {
7086                  radv_emit_view_index(cmd_buffer, view);
7087
7088                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
7089               }
7090            }
7091         }
7092      } else {
7093         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
7094            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
7095
7096            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
7097            if (!remaining_indexes &&
7098                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
7099               continue;
7100
7101            const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL);
7102            const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
7103            radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
7104
7105            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
7106
7107            if (!state->subpass->view_mask) {
7108               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1);
7109            } else {
7110               u_foreach_bit(view, state->subpass->view_mask) {
7111                  radv_emit_view_index(cmd_buffer, view);
7112
7113                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
7114               }
7115            }
7116         }
7117      }
7118      if (drawCount > 1) {
7119         state->last_drawid = drawCount - 1;
7120      }
7121   }
7122}
7123
7124ALWAYS_INLINE static void
7125radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
7126                              uint32_t drawCount, const VkMultiDrawInfoEXT *minfo,
7127                              uint32_t use_opaque, uint32_t stride)
7128{
7129   unsigned i = 0;
7130   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
7131   const bool uses_drawid = cmd_buffer->state.graphics_pipeline->uses_drawid;
7132   uint32_t last_start = 0;
7133
7134   vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) {
7135      if (!i)
7136         radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
7137      else
7138         radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
7139
7140      if (!view_mask) {
7141         radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
7142      } else {
7143         u_foreach_bit(view, view_mask) {
7144            radv_emit_view_index(cmd_buffer, view);
7145            radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
7146         }
7147      }
7148      last_start = draw->firstVertex;
7149   }
7150   if (drawCount > 1) {
7151       struct radv_cmd_state *state = &cmd_buffer->state;
7152       state->last_vertex_offset = last_start;
7153       if (uses_drawid)
7154           state->last_drawid = drawCount - 1;
7155   }
7156}
7157
7158ALWAYS_INLINE static void
7159radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer,
7160                                  uint32_t x, uint32_t y, uint32_t z,
7161                                  uint32_t first_task)
7162{
7163   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
7164   const uint32_t count = x * y * z;
7165
7166   radv_emit_userdata_mesh(cmd_buffer, x, y, z, first_task);
7167
7168   if (!view_mask) {
7169      radv_cs_emit_draw_packet(cmd_buffer, count, 0);
7170   } else {
7171      u_foreach_bit(view, view_mask) {
7172         radv_emit_view_index(cmd_buffer, view);
7173         radv_cs_emit_draw_packet(cmd_buffer, count, 0);
7174      }
7175   }
7176}
7177
7178ALWAYS_INLINE static void
7179radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y,
7180                                       uint32_t z, uint32_t first_task)
7181{
7182   uint64_t fake_ib_va = 0;
7183   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
7184   const unsigned num_views = MAX2(1, util_bitcount(view_mask));
7185   unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */
7186
7187   if (first_task) {
7188      /* Pass this as the IB to the shader for emulating firstTask in task shaders. */
7189      uint32_t fake_ib_dwords[2] = {x, first_task};
7190      unsigned fake_ib_offset;
7191      radv_cmd_buffer_upload_data(cmd_buffer, 8, fake_ib_dwords, &fake_ib_offset);
7192      fake_ib_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + fake_ib_offset;
7193   }
7194
7195   radv_emit_userdata_task(cmd_buffer, x, y, z, 0, first_task, fake_ib_va);
7196   radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
7197   radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
7198                                    cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
7199                                    ace_predication_size);
7200
7201   if (!view_mask) {
7202      radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
7203      radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7204   } else {
7205      u_foreach_bit (view, view_mask) {
7206         radv_emit_view_index(cmd_buffer, view);
7207         radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
7208         radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7209      }
7210   }
7211}
7212
7213static void
7214radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer,
7215                                         const struct radv_draw_info *info, uint64_t nv_ib_va,
7216                                         uint32_t nv_ib_stride)
7217{
7218   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
7219   struct radeon_winsys *ws = cmd_buffer->device->ws;
7220   const unsigned num_views = MAX2(1, util_bitcount(view_mask));
7221   unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
7222   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
7223
7224   const uint64_t va =
7225      radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
7226   const uint64_t count_va = !info->count_buffer
7227                                ? 0
7228                                : radv_buffer_get_va(info->count_buffer->bo) +
7229                                     info->count_buffer->offset + info->count_buffer_offset;
7230   uint64_t workaround_cond_va = 0;
7231
7232   if (count_va) {
7233      radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->count_buffer->bo);
7234
7235      /* MEC firmware bug workaround.
7236       * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs.
7237       * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE
       *   is only executed when the count buffer contains a non-zero value.
7239       * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet
7240       *   has a matching ACE packet.
7241       *
7242       * As a workaround:
7243       * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround
7244       * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch
7245       * - When workaround BO != 0 (count was 0), execute an empty direct dispatch
7246       */
7247
7248      uint32_t workaround_cond_init = 0;
7249      uint32_t workaround_cond_off;
7250      if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off))
7251         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
7252
7253      workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off;
7254
7255      radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
7256      radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7257                             COPY_DATA_WR_CONFIRM);
7258      radeon_emit(ace_cs, 1);
7259      radeon_emit(ace_cs, 0);
7260      radeon_emit(ace_cs, workaround_cond_va);
7261      radeon_emit(ace_cs, workaround_cond_va >> 32);
7262
7263      /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
7264      ace_predication_size += 2 * 5 + 6 + 6 * num_views;
7265   }
7266
7267   radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->indirect->bo);
7268   radv_emit_userdata_task_ib_only(cmd_buffer, nv_ib_va, nv_ib_stride);
7269   radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
7270   radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
7271                                    cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
7272                                    ace_predication_size);
7273
7274   if (workaround_cond_va) {
7275      radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
7276      radeon_emit(ace_cs, count_va);
7277      radeon_emit(ace_cs, count_va >> 32);
7278      radeon_emit(ace_cs, 0);
7279      radeon_emit(ace_cs,
7280                  6 + 11 * num_views); /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */
7281
7282      radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
7283      radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7284                             COPY_DATA_WR_CONFIRM);
7285      radeon_emit(ace_cs, 0);
7286      radeon_emit(ace_cs, 0);
7287      radeon_emit(ace_cs, workaround_cond_va);
7288      radeon_emit(ace_cs, workaround_cond_va >> 32);
7289   }
7290
7291   if (!view_mask) {
7292      radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
7293                                                               count_va, info->stride);
7294      radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7295   } else {
7296      u_foreach_bit (view, view_mask) {
7297         radv_emit_view_index(cmd_buffer, view);
7298         radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
7299                                                                  count_va, info->stride);
7300         radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7301      }
7302   }
7303
7304   if (workaround_cond_va) {
7305      radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
7306      radeon_emit(ace_cs, workaround_cond_va);
7307      radeon_emit(ace_cs, workaround_cond_va >> 32);
7308      radeon_emit(ace_cs, 0);
7309      radeon_emit(ace_cs, 6 * num_views); /* Nx DISPATCH_TASKMESH_DIRECT_ACE */
7310
7311      for (unsigned v = 0; v < num_views; ++v) {
7312         radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, 0, 0, 0);
7313      }
7314   }
7315}
7316
7317static void
7318radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
7319                                const struct radv_draw_info *info)
7320{
7321   const struct radv_cmd_state *state = &cmd_buffer->state;
7322   struct radeon_winsys *ws = cmd_buffer->device->ws;
7323   struct radeon_cmdbuf *cs = cmd_buffer->cs;
7324   const uint64_t va =
7325      radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
7326   const uint64_t count_va = info->count_buffer
7327                                ? radv_buffer_get_va(info->count_buffer->bo) +
7328                                     info->count_buffer->offset + info->count_buffer_offset
7329                                : 0;
7330
7331   radv_cs_add_buffer(ws, cs, info->indirect->bo);
7332
7333   radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
7334   radeon_emit(cs, 1);
7335   radeon_emit(cs, va);
7336   radeon_emit(cs, va >> 32);
7337
7338   if (info->count_buffer) {
7339      radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
7340   }
7341
7342   if (!state->subpass->view_mask) {
7343      radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
7344                                        info->stride);
7345   } else {
7346      u_foreach_bit(i, state->subpass->view_mask)
7347      {
7348         radv_emit_view_index(cmd_buffer, i);
7349
7350         radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
7351                                           info->stride);
7352      }
7353   }
7354}
7355
7356/*
 * Vega and Raven have a bug which triggers if there are multiple context
7358 * register contexts active at the same time with different scissor values.
7359 *
7360 * There are two possible workarounds:
7361 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
7362 *    there is only ever 1 active set of scissor values at the same time.
7363 *
7364 * 2) Whenever the hardware switches contexts we have to set the scissor
7365 *    registers again even if it is a noop. That way the new context gets
7366 *    the correct scissor values.
7367 *
7368 * This implements option 2. radv_need_late_scissor_emission needs to
7369 * return true on affected HW if radv_emit_all_graphics_states sets
7370 * any context registers.
7371 */
7372static bool
7373radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
7374                                const struct radv_draw_info *info)
7375{
7376   struct radv_cmd_state *state = &cmd_buffer->state;
7377
7378   if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
7379      return false;
7380
7381   if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
7382      return true;
7383
7384   uint64_t used_states =
7385      cmd_buffer->state.graphics_pipeline->needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
7386
7387   /* Index, vertex and streamout buffers don't change context regs, and
7388    * pipeline is already handled.
7389    */
7390   used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
7391                    RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER |
7392                    RADV_CMD_DIRTY_PIPELINE);
7393
7394   if (cmd_buffer->state.dirty & used_states)
7395      return true;
7396
7397   uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
7398
7399   if (info->indexed && state->dynamic.primitive_restart_enable &&
7400       primitive_reset_index != state->last_primitive_reset_index)
7401      return true;
7402
7403   return false;
7404}
7405
7406ALWAYS_INLINE static bool
7407radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
7408                      bool indirect)
7409{
7410   /* If we have to draw only a few vertices, we get better latency if
7411    * we disable NGG culling.
7412    *
7413    * When tessellation is used, what matters is the number of tessellated
7414    * vertices, so let's always assume it's not a small draw.
7415    */
7416   return !has_tess && !indirect && vtx_cnt < 128;
7417}
7418
7419ALWAYS_INLINE static uint32_t
7420radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
7421{
7422   const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
7423   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
7424
7425   /* Cull every triangle when rasterizer discard is enabled. */
7426   if (d->rasterizer_discard_enable ||
7427       G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl))
7428      return radv_nggc_front_face | radv_nggc_back_face;
7429
7430   uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
7431   uint32_t nggc_settings = radv_nggc_none;
7432
7433   /* The culling code needs to know whether face is CW or CCW. */
7434   bool ccw = (pipeline->needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE)
7435              ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
7436              : G_028814_FACE(pa_su_sc_mode_cntl) == 0;
7437
7438   /* Take inverted viewport into account. */
7439   ccw ^= vp_y_inverted;
7440
7441   if (ccw)
7442      nggc_settings |= radv_nggc_face_is_ccw;
7443
7444   /* Face culling settings. */
7445   if ((pipeline->needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
7446         ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
7447         : G_028814_CULL_FRONT(pa_su_sc_mode_cntl))
7448      nggc_settings |= radv_nggc_front_face;
7449   if ((pipeline->needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
7450         ? (d->cull_mode & VK_CULL_MODE_BACK_BIT)
7451         : G_028814_CULL_BACK(pa_su_sc_mode_cntl))
7452      nggc_settings |= radv_nggc_back_face;
7453
7454   /* Small primitive culling is only valid when conservative overestimation is not used. It's also
7455    * disabled for user sample locations because small primitive culling assumes a sample
7456    * position at (0.5, 0.5). */
7457   if (!pipeline->uses_conservative_overestimate && !pipeline->uses_user_sample_locations) {
7458      nggc_settings |= radv_nggc_small_primitives;
7459
7460      /* small_prim_precision = num_samples / 2^subpixel_bits
7461       * num_samples is also always a power of two, so the small prim precision can only be
7462       * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
7463       */
7464      unsigned subpixel_bits = 256;
7465      int32_t small_prim_precision_log2 = util_logbase2(pipeline->ms.num_samples) - util_logbase2(subpixel_bits);
7466      nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u);
7467   }
7468
7469   return nggc_settings;
7470}
7471
7472static void
7473radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
7474{
7475   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
7476   const unsigned stage = pipeline->last_vgt_api_stage;
7477   const bool nggc_supported = pipeline->has_ngg_culling;
7478
7479   if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
7480      /* Current shader doesn't support culling and culling was already disabled:
7481       * No further steps needed, just remember the SGPR's location is not set.
7482       */
7483      cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
7484      return;
7485   }
7486
7487   /* Check dirty flags:
7488    * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
7489    * - Dirty dynamic flags: culling settings may have changed.
7490    */
7491   const bool dirty =
7492      cmd_buffer->state.dirty &
7493      (RADV_CMD_DIRTY_PIPELINE |
7494       RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
7495       RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);
7496
7497   /* Check small draw status:
7498    * For small draw calls, we disable culling by setting the SGPR to 0.
7499    */
7500   const bool skip =
7501      radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);
7502
7503   /* See if anything changed. */
7504   if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
7505      return;
7506
7507   /* Remember small draw state. */
7508   cmd_buffer->state.last_nggc_skip = skip;
7509   const struct radv_shader *v = pipeline->base.shaders[stage];
7510   assert(v->info.has_ngg_culling == nggc_supported);
7511
7512   /* Find the user SGPR. */
7513   const uint32_t base_reg = pipeline->base.user_data_0[stage];
7514   const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
7515   assert(!nggc_supported || nggc_sgpr_idx != -1);
7516
7517   /* Get viewport transform. */
7518   float vp_scale[2], vp_translate[2];
7519   memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float));
7520   memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float));
7521   bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
7522
7523   /* Get current culling settings. */
7524   uint32_t nggc_settings = nggc_supported && !skip
7525                            ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
7526                            : radv_nggc_none;
7527
7528   bool emit_viewport = nggc_settings &&
7529                        (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
7530                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
7531                         !cmd_buffer->state.last_nggc_settings);
7532
7533   if (emit_viewport) {
7534      /* Correction for inverted Y */
7535      if (vp_y_inverted) {
7536         vp_scale[1] = -vp_scale[1];
7537         vp_translate[1] = -vp_translate[1];
7538      }
7539
7540      /* Correction for number of samples per pixel. */
7541      for (unsigned i = 0; i < 2; ++i) {
7542         vp_scale[i] *= (float) pipeline->ms.num_samples;
7543         vp_translate[i] *= (float) pipeline->ms.num_samples;
7544      }
7545
7546      uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
7547      const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
7548      assert(vp_sgpr_idx != -1);
7549      radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
7550      radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
7551   }
7552
7553   bool emit_settings = nggc_supported &&
7554                        (cmd_buffer->state.last_nggc_settings != nggc_settings ||
7555                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);
7556
7557   /* This needs to be emitted when culling is turned on
7558    * and when it's already on but some settings change.
7559    */
7560   if (emit_settings) {
7561      assert(nggc_sgpr_idx >= 0);
7562      radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
7563   }
7564
7565   /* These only need to be emitted when culling is turned on or off,
7566    * but not when it stays on and just some settings change.
7567    */
7568   if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
7569      uint32_t rsrc2 = v->config.rsrc2;
7570
7571      if (!nggc_settings) {
7572         /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
7573         if (stage != MESA_SHADER_GEOMETRY)
7574            rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
7575      }
7576
7577      /* When the pipeline is dirty and not yet emitted, don't write it here
7578       * because radv_emit_graphics_pipeline will overwrite this register.
7579       */
7580      if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
7581          cmd_buffer->state.emitted_graphics_pipeline == pipeline) {
7582         radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
7583      }
7584   }
7585
7586   cmd_buffer->state.last_nggc_settings = nggc_settings;
7587   cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
7588}
7589
7590static void
7591radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
7592                              bool pipeline_is_dirty)
7593{
7594   bool late_scissor_emission;
7595
7596   if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
7597       cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline)
7598      radv_emit_rbplus_state(cmd_buffer);
7599
7600   if (cmd_buffer->device->physical_device->use_ngg_culling &&
7601       cmd_buffer->state.graphics_pipeline->is_ngg)
7602      radv_emit_ngg_culling_state(cmd_buffer, info);
7603
7604   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
7605      radv_emit_graphics_pipeline(cmd_buffer);
7606
7607   /* This should be before the cmd_buffer->state.dirty is cleared
7608    * (excluding RADV_CMD_DIRTY_PIPELINE) and after
7609    * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
7610   late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info);
7611
7612   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
7613      radv_emit_framebuffer_state(cmd_buffer);
7614
7615   if (info->indexed) {
7616      if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
7617         radv_emit_index_buffer(cmd_buffer, info->indirect);
7618   } else {
7619      /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
7620       * so the state must be re-emitted before the next indexed
7621       * draw.
7622       */
7623      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
7624         cmd_buffer->state.last_index_type = -1;
7625         cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
7626      }
7627   }
7628
7629   if (cmd_buffer->device->force_vrs != RADV_FORCE_VRS_1x1) {
7630      struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
7631      uint64_t dynamic_states =
7632         cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state;
7633
7634      if ((dynamic_states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE) &&
7635          d->fragment_shading_rate.size.width == 1 &&
7636          d->fragment_shading_rate.size.height == 1 &&
7637          d->fragment_shading_rate.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
7638          d->fragment_shading_rate.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
7639         /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore
7640          * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic.
7641          */
7642         cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
7643      }
7644   }
7645
7646   radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);
7647
7648   radv_emit_draw_registers(cmd_buffer, info);
7649
7650   if (late_scissor_emission)
7651      radv_emit_scissor(cmd_buffer);
7652}
7653
7654/* MUST inline this function to avoid massive perf loss in drawoverhead */
7655ALWAYS_INLINE static bool
7656radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount)
7657{
7658   const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
7659   const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
7660                                  cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;
7661
7662   ASSERTED const unsigned cdw_max =
7663      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
7664
7665   if (likely(!info->indirect)) {
7666      /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
7667       * no workaround for indirect draws, but we can at least skip
7668       * direct draws.
7669       */
7670      if (unlikely(!info->instance_count))
7671         return false;
7672
7673      /* Handle count == 0. */
7674      if (unlikely(!info->count && !info->strmout_buffer))
7675         return false;
7676   }
7677
7678   /* Need to apply this workaround early as it can set flush flags. */
7679   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
7680      radv_emit_fb_mip_change_flush(cmd_buffer);
7681
7682   /* Use optimal packet order based on whether we need to sync the
7683    * pipeline.
7684    */
7685   if (cmd_buffer->state.flush_bits &
7686       (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
7687        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
7688      /* If we have to wait for idle, set all states first, so that
7689       * all SET packets are processed in parallel with previous draw
7690       * calls. Then upload descriptors, set shader pointers, and
7691       * draw, and prefetch at the end. This ensures that the time
7692       * the CUs are idle is very short. (there are only SET_SH
7693       * packets between the wait and the draw)
7694       */
7695      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
7696      si_emit_cache_flush(cmd_buffer);
7697      /* <-- CUs are idle here --> */
7698
7699      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
7700   } else {
7701      /* If we don't wait for idle, start prefetches first, then set
7702       * states, and draw at the end.
7703       */
7704      si_emit_cache_flush(cmd_buffer);
7705
7706      if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
7707         /* Only prefetch the vertex shader and VBO descriptors
7708          * in order to start the draw as soon as possible.
7709          */
7710         radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, true);
7711      }
7712
7713      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
7714
7715      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
7716   }
7717
7718   radv_describe_draw(cmd_buffer);
7719   if (likely(!info->indirect)) {
7720      struct radv_cmd_state *state = &cmd_buffer->state;
7721      struct radeon_cmdbuf *cs = cmd_buffer->cs;
7722      assert(state->graphics_pipeline->vtx_base_sgpr);
7723      if (state->last_num_instances != info->instance_count) {
7724         radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
7725         radeon_emit(cs, info->instance_count);
7726         state->last_num_instances = info->instance_count;
7727      }
7728   }
7729   assert(cmd_buffer->cs->cdw <= cdw_max);
7730
7731   return true;
7732}
7733
7734ALWAYS_INLINE static bool
7735radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
7736                          uint32_t drawCount)
7737{
7738   struct radv_descriptor_state *descriptors_state =
7739      radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
7740   const bool pipeline_is_dirty =
7741      cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE &&
7742      cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;
7743   const bool push_dirty = descriptors_state->push_dirty;
7744   const uint32_t desc_dirty = descriptors_state->dirty;
7745
7746   const bool gfx_result = radv_before_draw(cmd_buffer, info, drawCount);
7747   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
7748   struct radv_shader *task_shader = radv_get_shader(&pipeline->base, MESA_SHADER_TASK);
7749
7750   /* If there is no task shader, no need to do anything special. */
7751   if (!task_shader)
7752      return gfx_result;
7753
7754   /* Need to check the count even for indirect draws to work around
7755    * an issue with DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
7756    */
7757   if (!info->count || !gfx_result)
7758      return false;
7759
7760   const bool need_task_semaphore = radv_flush_gfx2ace_semaphore(cmd_buffer);
7761   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
7762   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
7763   struct radeon_winsys *ws = cmd_buffer->device->ws;
7764
7765   assert(ace_cs);
7766   ASSERTED const unsigned ace_cdw_max =
7767      radeon_check_space(ws, ace_cs, 4096 + 128 * (drawCount - 1));
7768
7769   if (need_task_semaphore)
7770      radv_wait_gfx2ace_semaphore(cmd_buffer);
7771
7772   if (pipeline_is_dirty) {
7773      radv_pipeline_emit_hw_cs(pdevice, ace_cs, task_shader);
7774      radv_pipeline_emit_compute_state(pdevice, ace_cs, task_shader);
7775   }
7776
7777   radv_ace_internal_cache_flush(cmd_buffer);
7778
7779   /* Restore the dirty state of the descriptors.
7780    * They were marked non-dirty in radv_before_draw,
7781    * but they need to be re-emitted now to the ACE cmdbuf.
7782    */
7783   descriptors_state->push_dirty = push_dirty;
7784   descriptors_state->dirty = desc_dirty;
7785
7786   /* Flush descriptors and push constants for task shaders. */
7787   radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base,
7788                          VK_PIPELINE_BIND_POINT_GRAPHICS);
7789   radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base,
7790                        VK_PIPELINE_BIND_POINT_GRAPHICS);
7791
7792   assert(ace_cs->cdw <= ace_cdw_max);
7793   return true;
7794}
7795
7796static void
7797radv_after_draw(struct radv_cmd_buffer *cmd_buffer)
7798{
7799   const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
7800   bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
7801   /* Start prefetches after the draw has been started. Both will
7802    * run in parallel, but starting the draw first is more
7803    * important.
7804    */
7805   if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
7806      radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, false);
7807   }
7808
7809   /* Workaround for a VGT hang when streamout is enabled.
7810    * It must be done after drawing.
7811    */
7812   if (radv_is_streamout_enabled(cmd_buffer) &&
7813       (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA ||
7814        rad_info->family == CHIP_FIJI)) {
7815      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
7816   }
7817
7818   radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
7819}
7820
7821static struct radv_buffer
7822radv_nv_mesh_indirect_bo(struct radv_cmd_buffer *cmd_buffer,
7823                         struct radv_buffer *buffer, VkDeviceSize offset,
7824                         uint32_t draw_count, uint32_t stride)
7825{
7826   /* Translates the indirect BO format used by the NV_mesh_shader API
7827    * to the BO format used by DRAW_INDIRECT / DRAW_INDIRECT_MULTI.
7828    */
7829
7830   struct radeon_cmdbuf *cs = cmd_buffer->cs;
7831   struct radeon_winsys *ws = cmd_buffer->device->ws;
7832
7833   const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
7834   const size_t dst_stride = sizeof(VkDrawIndirectCommand);
7835   const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
7836   const size_t src_off_first_task = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask);
7837   const size_t dst_off_vertex_count = offsetof(VkDrawIndirectCommand, vertexCount);
7838   const size_t dst_off_first_vertex = offsetof(VkDrawIndirectCommand, firstVertex);
7839
7840   /* Fill the buffer with all zeroes except instanceCount = 1.
7841    * This helps emit fewer copy packets below.
7842    */
7843   VkDrawIndirectCommand *fill_data = (VkDrawIndirectCommand *) alloca(dst_stride * draw_count);
7844   const VkDrawIndirectCommand filler = { .instanceCount = 1 };
7845   for (unsigned i = 0; i < draw_count; ++i)
7846      fill_data[i] = filler;
7847
7848   /* We'll have to copy data from the API BO. */
7849   uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
7850   radv_cs_add_buffer(ws, cs, buffer->bo);
7851
7852   /* Allocate some space in the upload BO. */
7853   unsigned out_offset;
7854   radv_cmd_buffer_upload_data(cmd_buffer, dst_stride * draw_count, fill_data, &out_offset);
7855   const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;
7856
7857   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 12 * draw_count + 2);
7858
7859   /* Copy data from the API BO so that the format is suitable for the
7860    * indirect draw packet:
7861    * - vertexCount = taskCount (copied here)
7862    * - instanceCount = 1 (filled by CPU above)
7863    * - firstVertex = firstTask (copied here)
7864    * - firstInstance = 0 (filled by CPU above)
7865    */
7866   for (unsigned i = 0; i < draw_count; ++i) {
7867      const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
7868      const uint64_t src_first_task = va + i * src_stride + src_off_first_task;
7869      const uint64_t dst_vertex_count = new_va + i * dst_stride + dst_off_vertex_count;
7870      const uint64_t dst_first_vertex = new_va + i * dst_stride + dst_off_first_vertex;
7871
7872      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
7873      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7874                      COPY_DATA_WR_CONFIRM);
7875      radeon_emit(cs, src_task_count);
7876      radeon_emit(cs, src_task_count >> 32);
7877      radeon_emit(cs, dst_vertex_count);
7878      radeon_emit(cs, dst_vertex_count >> 32);
7879
7880      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
7881      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7882                      COPY_DATA_WR_CONFIRM);
7883      radeon_emit(cs, src_first_task);
7884      radeon_emit(cs, src_first_task >> 32);
7885      radeon_emit(cs, dst_first_vertex);
7886      radeon_emit(cs, dst_first_vertex >> 32);
7887   }
7888
7889   /* Wait for the copies to finish */
7890   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
7891   radeon_emit(cs, 0);
7892
7893   /* The draw packet can now use this buffer: */
7894   struct radv_buffer buf = *buffer;
7895   buf.bo = cmd_buffer->upload.upload_bo;
7896   buf.offset = out_offset;
7897
7898   assert(cmd_buffer->cs->cdw <= cdw_max);
7899
7900   return buf;
7901}
7902
7903static struct radv_buffer
7904radv_nv_task_indirect_bo(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buffer,
7905                         VkDeviceSize offset, uint32_t draw_count, uint32_t stride)
7906{
7907   /* Translates the indirect BO format used by the NV_mesh_shader API
7908    * to the BO format used by DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
7909    */
7910
7911   assert(draw_count);
7912   static_assert(sizeof(VkDispatchIndirectCommand) == 12, "Incorrect size of taskmesh command.");
7913
7914   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
7915   struct radeon_winsys *ws = cmd_buffer->device->ws;
7916
7917   const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
7918   const size_t dst_stride = sizeof(VkDispatchIndirectCommand);
7919   const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
7920   const size_t dst_off_x = offsetof(VkDispatchIndirectCommand, x);
7921
7922   const unsigned new_disp_size = dst_stride * draw_count;
7923
7924   const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
7925   radv_cs_add_buffer(ws, cs, buffer->bo);
7926
7927   /* Fill the buffer with X=0, Y=1, Z=1. */
7928   VkDispatchIndirectCommand *fill_data = (VkDispatchIndirectCommand *)alloca(new_disp_size);
7929   for (unsigned i = 0; i < draw_count; ++i) {
7930      fill_data[i].x = 0;
7931      fill_data[i].y = 1;
7932      fill_data[i].z = 1;
7933   }
7934
7935   /* Allocate space in the upload BO. */
7936   unsigned out_offset;
7937   ASSERTED bool uploaded =
7938      radv_cmd_buffer_upload_data(cmd_buffer, new_disp_size, fill_data, &out_offset);
7939   const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;
7940   assert(uploaded);
7941
7942   /* Clamp draw count to fit the actual size of the buffer.
7943    * This is to avoid potential out-of-bounds copies (e.g. for draws with an indirect count buffer).
7944    * The remaining indirect draws will stay filled with X=0, Y=1, Z=1, which is harmless.
7945    */
7946   draw_count = MIN2(draw_count, (buffer->vk.size - buffer->offset - offset) / src_stride);
7947
7948   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 6 * draw_count + 2);
7949
7950   /* Copy taskCount from the NV API BO to the X dispatch size of the compatible BO. */
7951   for (unsigned i = 0; i < draw_count; ++i) {
7952      const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
7953      const uint64_t dst_x = new_va + i * dst_stride + dst_off_x;
7954
7955      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
7956      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
7957                         COPY_DATA_WR_CONFIRM);
7958      radeon_emit(cs, src_task_count);
7959      radeon_emit(cs, src_task_count >> 32);
7960      radeon_emit(cs, dst_x);
7961      radeon_emit(cs, dst_x >> 32);
7962   }
7963
7964   assert(cs->cdw <= cdw_max);
7965
7966   /* The draw packet can now use this buffer: */
7967   struct radv_buffer buf = *buffer;
7968   buf.bo = cmd_buffer->upload.upload_bo;
7969   buf.offset = out_offset;
7970
7971   return buf;
7972}
7973
7974VKAPI_ATTR void VKAPI_CALL
7975radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
7976             uint32_t firstVertex, uint32_t firstInstance)
7977{
7978   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7979   struct radv_draw_info info;
7980
7981   info.count = vertexCount;
7982   info.instance_count = instanceCount;
7983   info.first_instance = firstInstance;
7984   info.strmout_buffer = NULL;
7985   info.indirect = NULL;
7986   info.indexed = false;
7987
7988   if (!radv_before_draw(cmd_buffer, &info, 1))
7989      return;
7990   const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount };
7991   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
7992   radv_after_draw(cmd_buffer);
7993}
7994
7995VKAPI_ATTR void VKAPI_CALL
7996radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
7997                          uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
7998{
7999   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8000   struct radv_draw_info info;
8001
8002   if (!drawCount)
8003      return;
8004
8005   info.count = pVertexInfo->vertexCount;
8006   info.instance_count = instanceCount;
8007   info.first_instance = firstInstance;
8008   info.strmout_buffer = NULL;
8009   info.indirect = NULL;
8010   info.indexed = false;
8011
8012   if (!radv_before_draw(cmd_buffer, &info, drawCount))
8013      return;
8014   radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
8015   radv_after_draw(cmd_buffer);
8016}
8017
8018VKAPI_ATTR void VKAPI_CALL
8019radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
8020                    uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)
8021{
8022   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8023   struct radv_draw_info info;
8024
8025   info.indexed = true;
8026   info.count = indexCount;
8027   info.instance_count = instanceCount;
8028   info.first_instance = firstInstance;
8029   info.strmout_buffer = NULL;
8030   info.indirect = NULL;
8031
8032   if (!radv_before_draw(cmd_buffer, &info, 1))
8033      return;
8034   const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset };
8035   radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
8036   radv_after_draw(cmd_buffer);
8037}
8038
8039VKAPI_ATTR void VKAPI_CALL
8040radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo,
8041                            uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset)
8042{
8043   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8044   struct radv_draw_info info;
8045
8046   if (!drawCount)
8047      return;
8048
8049   const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
8050   info.indexed = true;
8051   info.count = minfo->indexCount;
8052   info.instance_count = instanceCount;
8053   info.first_instance = firstInstance;
8054   info.strmout_buffer = NULL;
8055   info.indirect = NULL;
8056
8057   if (!radv_before_draw(cmd_buffer, &info, drawCount))
8058      return;
8059   radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
8060   radv_after_draw(cmd_buffer);
8061}
8062
8063VKAPI_ATTR void VKAPI_CALL
8064radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
8065                     uint32_t drawCount, uint32_t stride)
8066{
8067   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8068   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8069   struct radv_draw_info info;
8070
8071   info.count = drawCount;
8072   info.indirect = buffer;
8073   info.indirect_offset = offset;
8074   info.stride = stride;
8075   info.strmout_buffer = NULL;
8076   info.count_buffer = NULL;
8077   info.indexed = false;
8078   info.instance_count = 0;
8079
8080   if (!radv_before_draw(cmd_buffer, &info, 1))
8081      return;
8082   radv_emit_indirect_draw_packets(cmd_buffer, &info);
8083   radv_after_draw(cmd_buffer);
8084}
8085
8086VKAPI_ATTR void VKAPI_CALL
8087radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
8088                            uint32_t drawCount, uint32_t stride)
8089{
8090   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8091   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8092   struct radv_draw_info info;
8093
8094   info.indexed = true;
8095   info.count = drawCount;
8096   info.indirect = buffer;
8097   info.indirect_offset = offset;
8098   info.stride = stride;
8099   info.count_buffer = NULL;
8100   info.strmout_buffer = NULL;
8101   info.instance_count = 0;
8102
8103   if (!radv_before_draw(cmd_buffer, &info, 1))
8104      return;
8105   radv_emit_indirect_draw_packets(cmd_buffer, &info);
8106   radv_after_draw(cmd_buffer);
8107}
8108
8109VKAPI_ATTR void VKAPI_CALL
8110radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
8111                          VkBuffer _countBuffer, VkDeviceSize countBufferOffset,
8112                          uint32_t maxDrawCount, uint32_t stride)
8113{
8114   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8115   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8116   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
8117   struct radv_draw_info info;
8118
8119   info.count = maxDrawCount;
8120   info.indirect = buffer;
8121   info.indirect_offset = offset;
8122   info.count_buffer = count_buffer;
8123   info.count_buffer_offset = countBufferOffset;
8124   info.stride = stride;
8125   info.strmout_buffer = NULL;
8126   info.indexed = false;
8127   info.instance_count = 0;
8128
8129   if (!radv_before_draw(cmd_buffer, &info, 1))
8130      return;
8131   radv_emit_indirect_draw_packets(cmd_buffer, &info);
8132   radv_after_draw(cmd_buffer);
8133}
8134
8135VKAPI_ATTR void VKAPI_CALL
8136radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
8137                                 VkDeviceSize offset, VkBuffer _countBuffer,
8138                                 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
8139                                 uint32_t stride)
8140{
8141   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8142   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8143   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
8144   struct radv_draw_info info;
8145
8146   info.indexed = true;
8147   info.count = maxDrawCount;
8148   info.indirect = buffer;
8149   info.indirect_offset = offset;
8150   info.count_buffer = count_buffer;
8151   info.count_buffer_offset = countBufferOffset;
8152   info.stride = stride;
8153   info.strmout_buffer = NULL;
8154   info.instance_count = 0;
8155
8156   if (!radv_before_draw(cmd_buffer, &info, 1))
8157      return;
8158   radv_emit_indirect_draw_packets(cmd_buffer, &info);
8159   radv_after_draw(cmd_buffer);
8160}
8161
8162VKAPI_ATTR void VKAPI_CALL
8163radv_CmdDrawMeshTasksNV(VkCommandBuffer commandBuffer, uint32_t taskCount, uint32_t firstTask)
8164{
8165   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8166   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
8167   struct radv_draw_info info;
8168
8169   info.count = taskCount;
8170   info.instance_count = 1;
8171   info.first_instance = 0;
8172   info.stride = 0;
8173   info.indexed = false;
8174   info.strmout_buffer = NULL;
8175   info.count_buffer = NULL;
8176   info.indirect = NULL;
8177
8178   if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1))
8179      return;
8180
8181   if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
8182      radv_emit_direct_taskmesh_draw_packets(cmd_buffer, taskCount, 1, 1, firstTask);
8183   } else {
8184      radv_emit_direct_mesh_draw_packet(cmd_buffer, taskCount, 1, 1, firstTask);
8185   }
8186
8187   radv_after_draw(cmd_buffer);
8188}
8189
8190VKAPI_ATTR void VKAPI_CALL
8191radv_CmdDrawMeshTasksIndirectNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
8192                                VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
8193{
8194   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8195   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8196
8197   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
8198   struct radv_draw_info info;
8199
8200   info.indirect = buffer;
8201   info.indirect_offset = offset;
8202   info.stride = stride;
8203   info.count = drawCount;
8204   info.strmout_buffer = NULL;
8205   info.count_buffer = NULL;
8206   info.indexed = false;
8207   info.instance_count = 0;
8208
8209   if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount))
8210      return;
8211
8212   /* Indirect draw with mesh shader only:
8213    * Use DRAW_INDIRECT / DRAW_INDIRECT_MULTI like normal indirect draws.
8214    * Needed because DISPATCH_MESH_INDIRECT_MULTI doesn't support firstTask.
8215    *
8216    * Indirect draw with task + mesh shaders:
8217    * Use DISPATCH_TASKMESH_INDIRECT_MULTI_ACE + DISPATCH_TASKMESH_GFX.
8218    * These packets don't support firstTask so we implement that by
8219    * reading the NV command's indirect buffer in the shader.
8220    *
8221    * The indirect BO layout from the NV_mesh_shader API is incompatible
8222    * with AMD HW. To make it work, we allocate some space
8223    * in the upload buffer and copy the data to it.
8224    */
8225
8226   if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
8227      uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
8228      uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
8229      struct radv_buffer buf =
8230         radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
8231      info.indirect = &buf;
8232      info.indirect_offset = 0;
8233      info.stride = sizeof(VkDispatchIndirectCommand);
8234
8235      radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
8236   } else {
8237      struct radv_buffer buf =
8238         radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
8239      info.indirect = &buf;
8240      info.indirect_offset = 0;
8241      info.stride = sizeof(VkDrawIndirectCommand);
8242
8243      radv_emit_indirect_draw_packets(cmd_buffer, &info);
8244   }
8245
8246   radv_after_draw(cmd_buffer);
8247}
8248
8249VKAPI_ATTR void VKAPI_CALL
8250radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
8251                                     VkDeviceSize offset, VkBuffer _countBuffer,
8252                                     VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
8253                                     uint32_t stride)
8254{
8255   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8256   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8257   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
8258
8259   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
8260   struct radv_draw_info info;
8261
8262   info.indirect = buffer;
8263   info.indirect_offset = offset;
8264   info.stride = stride;
8265   info.count = maxDrawCount;
8266   info.strmout_buffer = NULL;
8267   info.count_buffer = count_buffer;
8268   info.count_buffer_offset = countBufferOffset;
8269   info.indexed = false;
8270   info.instance_count = 0;
8271
8272   if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount))
8273      return;
8274
8275   if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
8276      uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
8277      uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
8278      struct radv_buffer buf =
8279         radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
8280      info.indirect = &buf;
8281      info.indirect_offset = 0;
8282      info.stride = sizeof(VkDispatchIndirectCommand);
8283
8284      radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
8285   } else {
8286      struct radv_buffer buf =
8287         radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
8288      info.indirect = &buf;
8289      info.indirect_offset = 0;
8290      info.stride = sizeof(VkDrawIndirectCommand);
8291
8292      radv_emit_indirect_draw_packets(cmd_buffer, &info);
8293   }
8294
8295   radv_after_draw(cmd_buffer);
8296}
8297
8298VKAPI_ATTR void VKAPI_CALL
8299radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
8300                                   const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
8301{
8302   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8303   VK_FROM_HANDLE(radv_indirect_command_layout, layout,
8304                  pGeneratedCommandsInfo->indirectCommandsLayout);
8305   VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
8306
8307   /* The only actions that can be done are draws, so skip on other queues. */
8308   if (cmd_buffer->qf != RADV_QUEUE_GENERAL)
8309      return;
8310
8311   /* Secondary command buffers are needed for the full extension but can't use
8312    * PKT3_INDIRECT_BUFFER_CIK.
8313    */
8314   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
8315
8316   radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
8317
8318   struct radv_draw_info info;
8319
8320   info.count = pGeneratedCommandsInfo->sequencesCount;
8321   info.indirect = prep_buffer; /* We're not really going to use it this way, but it's a good
8322                                   signal that this is not a direct draw. */
8323   info.indirect_offset = 0;
8324   info.stride = 0;
8325   info.strmout_buffer = NULL;
8326   info.count_buffer = NULL;
8327   info.indexed = layout->indexed;
8328   info.instance_count = 0;
8329
8330   if (!radv_before_draw(cmd_buffer, &info, 1))
8331      return;
8332
8333   uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo);
8334   uint64_t va = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
8335                 pGeneratedCommandsInfo->preprocessOffset;
8336   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
8337
8338   radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
8339   radeon_emit(cmd_buffer->cs, 0);
8340
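   /* Execute the prepared command stream by launching it as an indirect
    * buffer; PKT3_INDIRECT_BUFFER_CIK takes the size in dwords, hence the
    * >> 2. With a non-zero view mask, the buffer is executed once per view.
    */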
8341   if (!view_mask) {
8342      radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
8343      radeon_emit(cmd_buffer->cs, va);
8344      radeon_emit(cmd_buffer->cs, va >> 32);
8345      radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
8346   } else {
8347      u_foreach_bit (view, view_mask) {
8348         radv_emit_view_index(cmd_buffer, view);
8349
8350         radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
8351         radeon_emit(cmd_buffer->cs, va);
8352         radeon_emit(cmd_buffer->cs, va >> 32);
8353         radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
8354      }
8355   }
8356
8357   if (layout->binds_index_buffer) {
8358      cmd_buffer->state.last_index_type = -1;
8359      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
8360   }
8361
8362   if (layout->bind_vbo_mask)
8363      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
8364
8365   if (layout->binds_state)
8366      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
8367
8368   cmd_buffer->push_constant_stages |= ~0;
8369
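   /* The generated commands may have emitted their own draw state, so the
    * values tracked for redundant-state elimination can no longer be trusted.
    */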
8370   cmd_buffer->state.last_index_type = -1;
8371   cmd_buffer->state.last_num_instances = -1;
8372   cmd_buffer->state.last_vertex_offset = -1;
8373   cmd_buffer->state.last_first_instance = -1;
8374   cmd_buffer->state.last_drawid = -1;
8375
8376   radv_after_draw(cmd_buffer);
8377}
8378
8379struct radv_dispatch_info {
8380   /**
8381    * The layout of the grid (in block units) to be dispatched.
8382    */
8383   uint32_t blocks[3];
8384
8385   /**
8386    * A starting offset for the grid. Even if unaligned is set, the
8387    * offset must still be aligned to the block size.
8388    */
8389   uint32_t offsets[3];
8390   /**
8391    * Whether it's an unaligned compute dispatch.
8392    */
8393   bool unaligned;
8394
8395   /**
8396    * Indirect compute parameters resource.
8397    */
8398   struct radeon_winsys_bo *indirect;
8399   uint64_t va;
8400};
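/* A dispatch is either direct (blocks/offsets) or indirect (va, plus an
 * optional BO to add to the buffer list); radv_emit_dispatch_packets picks
 * the path based on whether va is set.
 */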
8401
8402static void
8403radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
8404                           struct radv_compute_pipeline *pipeline,
8405                           const struct radv_dispatch_info *info)
8406{
8407   struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
8408   unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
8409   struct radeon_winsys *ws = cmd_buffer->device->ws;
8410   bool predicating = cmd_buffer->state.predicating;
8411   struct radeon_cmdbuf *cs = cmd_buffer->cs;
8412   struct radv_userdata_info *loc;
8413
8414   radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]);
8415
8416   loc = radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
8417
8418   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30);
8419
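   /* GFX10+ can run compute shaders in wave32 mode; the dispatch initiator
    * has to be told about it via CS_W32_EN when the shader was compiled for
    * wave32.
    */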
8420   if (compute_shader->info.wave_size == 32) {
8421      assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
8422      dispatch_initiator |= S_00B800_CS_W32_EN(1);
8423   }
8424
8425   if (info->va) {
8426      if (info->indirect)
8427         radv_cs_add_buffer(ws, cs, info->indirect);
8428
8429      if (info->unaligned) {
8430         radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
8431         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
8432         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
8433         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
8434
8435         dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
8436      }
8437
8438      if (loc->sgpr_idx != -1) {
8439         unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
8440
8441         if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
8442            assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
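            /* Let the CP load the three grid-size dwords from info->va
             * directly into the grid-size user SGPRs.
             */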
8443            radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
8444            radeon_emit(cs, info->va);
8445            radeon_emit(cs, info->va >> 32);
8446            radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
8447            radeon_emit(cs, 3);
8448         } else {
8449            radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true);
8450         }
8451      }
8452
8453      if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
8454         radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
8455                                          &cmd_buffer->mec_inv_pred_emitted,
8456                                          4 /* DISPATCH_INDIRECT size */);
8457         radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
8458         radeon_emit(cs, info->va);
8459         radeon_emit(cs, info->va >> 32);
8460         radeon_emit(cs, dispatch_initiator);
8461      } else {
8462         radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
8463         radeon_emit(cs, 1);
8464         radeon_emit(cs, info->va);
8465         radeon_emit(cs, info->va >> 32);
8466
8467         radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
8468         radeon_emit(cs, 0);
8469         radeon_emit(cs, dispatch_initiator);
8470      }
8471   } else {
8472      unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
8473      unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
8474
8475      if (info->unaligned) {
8476         unsigned *cs_block_size = compute_shader->info.cs.block_size;
8477         unsigned remainder[3];
8478
8479         /* If the dispatch size is already aligned, the remainder
8480          * is a whole block size, not 0.
8481          */
8482         remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
8483         remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
8484         remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);
8485
8486         blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
8487         blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
8488         blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
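         /* Worked example: blocks[0] = 10 with cs_block_size[0] = 4 gives
          * remainder[0] = 10 + 4 - 12 = 2 and blocks[0] = 3, i.e. two full
          * workgroups plus one partial workgroup of 2 threads in X.
          */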
8489
8490         for (unsigned i = 0; i < 3; ++i) {
8491            assert(offsets[i] % cs_block_size[i] == 0);
8492            offsets[i] /= cs_block_size[i];
8493         }
8494
8495         radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
8496         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
8497                            S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
8498         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
8499                            S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
8500         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
8501                            S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
8502
8503         dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
8504      }
8505
8506      if (loc->sgpr_idx != -1) {
8507         if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
8508            assert(loc->num_sgprs == 3);
8509
8510            radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
8511            radeon_emit(cs, blocks[0]);
8512            radeon_emit(cs, blocks[1]);
8513            radeon_emit(cs, blocks[2]);
8514         } else {
8515            uint32_t offset;
8516            if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
8517               return;
8518
8519            uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
8520            radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8521                                     R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true);
8522         }
8523      }
8524
8525      if (offsets[0] || offsets[1] || offsets[2]) {
8526         radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
8527         radeon_emit(cs, offsets[0]);
8528         radeon_emit(cs, offsets[1]);
8529         radeon_emit(cs, offsets[2]);
8530
8531         /* The blocks in the packet are not counts but end values. */
8532         for (unsigned i = 0; i < 3; ++i)
8533            blocks[i] += offsets[i];
8534      } else {
8535         dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
8536      }
8537
8538      if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
8539         radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
8540                                          &cmd_buffer->mec_inv_pred_emitted,
8541                                          5 /* DISPATCH_DIRECT size */);
8542         predicating = false;
8543      }
8544
8545      radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
8546      radeon_emit(cs, blocks[0]);
8547      radeon_emit(cs, blocks[1]);
8548      radeon_emit(cs, blocks[2]);
8549      radeon_emit(cs, dispatch_initiator);
8550   }
8551
8552   assert(cmd_buffer->cs->cdw <= cdw_max);
8553}
8554
8555static void
8556radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer,
8557                                       struct radv_compute_pipeline *pipeline,
8558                                       VkPipelineBindPoint bind_point)
8559{
8560   radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, &pipeline->base, bind_point);
8561   radv_flush_constants(cmd_buffer,
8562                        bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
8563                           ? RADV_RT_STAGE_BITS
8564                           : VK_SHADER_STAGE_COMPUTE_BIT,
8565                        &pipeline->base, bind_point);
8566}
8567
8568static void
8569radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
8570              struct radv_compute_pipeline *pipeline, VkPipelineBindPoint bind_point)
8571{
8572   bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
8573   bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
8574
8575   if (pipeline->cs_regalloc_hang_bug)
8576      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
8577                                      RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
8578
8579   if (cmd_buffer->state.flush_bits &
8580       (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
8581        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
8582      /* If we have to wait for idle, set all states first, so that
8583       * all SET packets are processed in parallel with previous draw
8584       * calls. Then upload descriptors, set shader pointers, and
8585       * dispatch, and prefetch at the end. This ensures that the
8586    * time the CUs are idle is very short (there are only SET_SH
8587    * packets between the wait and the draw).
8588       */
8589      radv_emit_compute_pipeline(cmd_buffer, pipeline);
8590      si_emit_cache_flush(cmd_buffer);
8591      /* <-- CUs are idle here --> */
8592
8593      radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
8594
8595      radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
8596      /* <-- CUs are busy here --> */
8597
8598      /* Start prefetches after the dispatch has been started. Both
8599       * will run in parallel, but starting the dispatch first is
8600       * more important.
8601       */
8602      if (has_prefetch && pipeline_is_dirty) {
8603         radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
8604      }
8605   } else {
8606      /* If we don't wait for idle, start prefetches first, then set
8607       * states, and dispatch at the end.
8608       */
8609      si_emit_cache_flush(cmd_buffer);
8610
8611      if (has_prefetch && pipeline_is_dirty) {
8612         radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
8613      }
8614
8615      radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
8616
8617      radv_emit_compute_pipeline(cmd_buffer, pipeline);
8618      radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
8619   }
8620
8621   if (pipeline_is_dirty) {
8622      /* Raytracing uses compute shaders but has separate bind points and pipelines,
8623       * so if we set the compute user data & shader registers we also have to dirty
8624       * the raytracing ones, and vice versa.
8625       *
8626       * We only need to do this when the pipeline is dirty because when we switch between
8627       * the two we always need to switch pipelines.
8628       */
8629      radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
8630                                                     ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
8631                                                     : VK_PIPELINE_BIND_POINT_COMPUTE);
8632   }
8633
8634   if (pipeline->cs_regalloc_hang_bug)
8635      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
8636
8637   radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
8638}
8639
8640static void
8641radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
8642{
8643   radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline,
8644                 VK_PIPELINE_BIND_POINT_COMPUTE);
8645}
8646
8647VKAPI_ATTR void VKAPI_CALL
8648radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y,
8649                     uint32_t base_z, uint32_t x, uint32_t y, uint32_t z)
8650{
8651   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8652   struct radv_dispatch_info info = {0};
8653
8654   info.blocks[0] = x;
8655   info.blocks[1] = y;
8656   info.blocks[2] = z;
8657
8658   info.offsets[0] = base_x;
8659   info.offsets[1] = base_y;
8660   info.offsets[2] = base_z;
8661   radv_compute_dispatch(cmd_buffer, &info);
8662}
8663
8664VKAPI_ATTR void VKAPI_CALL
8665radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
8666{
8667   radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
8668}
8669
8670VKAPI_ATTR void VKAPI_CALL
8671radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
8672{
8673   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8674   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8675   struct radv_dispatch_info info = {0};
8676
8677   info.indirect = buffer->bo;
8678   info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
8679
8680   radv_compute_dispatch(cmd_buffer, &info);
8681}
8682
8683void
8684radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8685{
8686   struct radv_dispatch_info info = {0};
8687
8688   info.blocks[0] = x;
8689   info.blocks[1] = y;
8690   info.blocks[2] = z;
8691   info.unaligned = 1;
8692
8693   radv_compute_dispatch(cmd_buffer, &info);
8694}
8695
8696void
8697radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
8698{
8699   struct radv_dispatch_info info = {0};
8700
8701   info.indirect = bo;
8702   info.va = va;
8703
8704   radv_compute_dispatch(cmd_buffer, &info);
8705}
8706
8707enum radv_rt_mode {
8708   radv_rt_mode_direct,
8709   radv_rt_mode_indirect,
8710   radv_rt_mode_indirect2,
8711};
8712
8713static void
8714radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *tables,
8715                uint64_t indirect_va, enum radv_rt_mode mode)
8716{
8717   struct radv_compute_pipeline *pipeline = cmd_buffer->state.rt_pipeline;
8718   uint32_t base_reg = pipeline->base.user_data_0[MESA_SHADER_COMPUTE];
8719
8720   struct radv_dispatch_info info = {0};
8721   info.unaligned = true;
8722
8723   uint64_t launch_size_va;
8724   uint64_t sbt_va;
8725
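   /* For direct and classic indirect trace rays, the SBT regions (and for
    * direct mode also the launch size) are uploaded to the upload BO; with
    * indirect2 everything already lives at indirect_va.
    */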
8726   if (mode != radv_rt_mode_indirect2) {
8727      uint32_t upload_size = mode == radv_rt_mode_direct
8728                                ? sizeof(VkTraceRaysIndirectCommand2KHR)
8729                                : offsetof(VkTraceRaysIndirectCommand2KHR, width);
8730
8731      uint32_t offset;
8732      if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
8733         return;
8734
8735      uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
8736
8737      launch_size_va = (mode == radv_rt_mode_direct)
8738                          ? upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width)
8739                          : indirect_va;
8740      sbt_va = upload_va;
8741   } else {
8742      launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
8743      sbt_va = indirect_va;
8744   }
8745
8746   if (mode == radv_rt_mode_direct) {
8747      info.blocks[0] = tables->width;
8748      info.blocks[1] = tables->height;
8749      info.blocks[2] = tables->depth;
8750   } else
8751      info.va = launch_size_va;
8752
8753   struct radv_userdata_info *desc_loc =
8754      radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS);
8755   if (desc_loc->sgpr_idx != -1) {
8756      radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8757                               base_reg + desc_loc->sgpr_idx * 4, sbt_va, true);
8758   }
8759
8760   struct radv_userdata_info *size_loc =
8761      radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
8762   if (size_loc->sgpr_idx != -1) {
8763      radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8764                               base_reg + size_loc->sgpr_idx * 4, launch_size_va, true);
8765   }
8766
8767   radv_dispatch(cmd_buffer, &info, pipeline, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
8768}
8769
8770VKAPI_ATTR void VKAPI_CALL
8771radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,
8772                     const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
8773                     const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
8774                     const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
8775                     const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
8776                     uint32_t width, uint32_t height, uint32_t depth)
8777{
8778   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8779
8780   VkTraceRaysIndirectCommand2KHR tables = {
8781      .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
8782      .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
8783      .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
8784      .missShaderBindingTableSize = pMissShaderBindingTable->size,
8785      .missShaderBindingTableStride = pMissShaderBindingTable->stride,
8786      .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
8787      .hitShaderBindingTableSize = pHitShaderBindingTable->size,
8788      .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
8789      .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
8790      .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
8791      .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
8792      .width = width,
8793      .height = height,
8794      .depth = depth,
8795   };
8796
8797   radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct);
8798}
8799
8800VKAPI_ATTR void VKAPI_CALL
8801radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
8802                             const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
8803                             const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
8804                             const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
8805                             const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
8806                             VkDeviceAddress indirectDeviceAddress)
8807{
8808   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8809
8810   assert(cmd_buffer->device->use_global_bo_list);
8811
8812   VkTraceRaysIndirectCommand2KHR tables = {
8813      .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
8814      .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
8815      .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
8816      .missShaderBindingTableSize = pMissShaderBindingTable->size,
8817      .missShaderBindingTableStride = pMissShaderBindingTable->stride,
8818      .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
8819      .hitShaderBindingTableSize = pHitShaderBindingTable->size,
8820      .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
8821      .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
8822      .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
8823      .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
8824   };
8825
8826   radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect);
8827}
8828
8829VKAPI_ATTR void VKAPI_CALL
8830radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress)
8831{
8832   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8833
8834   assert(cmd_buffer->device->use_global_bo_list);
8835
8836   radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2);
8837}
8838
8839static void
8840radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size)
8841{
8842   unsigned wave_size = 0;
8843   unsigned scratch_bytes_per_wave = 0;
8844
8845   if (cmd_buffer->state.rt_pipeline) {
8846      scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->base.scratch_bytes_per_wave;
8847      wave_size = cmd_buffer->state.rt_pipeline->base.shaders[MESA_SHADER_COMPUTE]->info.wave_size;
8848   }
8849
8850   /* The hardware register is specified as a multiple of 256 DWORDS. */
8851   scratch_bytes_per_wave += align(size * wave_size, 1024);
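   /* E.g. a 2 KiB stack with wave64 adds align(2048 * 64, 1024) = 128 KiB of
    * scratch per wave.
    */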
8852
8853   cmd_buffer->compute_scratch_size_per_wave_needed =
8854      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
8855}
8856
8857VKAPI_ATTR void VKAPI_CALL
8858radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
8859{
8860   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8861
8862   radv_set_rt_stack_size(cmd_buffer, size);
8863   cmd_buffer->state.rt_stack_size = size;
8864}
8865
8866VKAPI_ATTR void VKAPI_CALL
8867radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
8868{
8869   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8870
8871   radv_mark_noncoherent_rb(cmd_buffer);
8872
8873   radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
8874
8875   radv_cmd_buffer_end_subpass(cmd_buffer);
8876
8877   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
8878   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs);
8879
8880   cmd_buffer->state.pass = NULL;
8881   cmd_buffer->state.subpass = NULL;
8882   cmd_buffer->state.attachments = NULL;
8883   cmd_buffer->state.framebuffer = NULL;
8884   cmd_buffer->state.subpass_sample_locs = NULL;
8885}
8886
8887VKAPI_ATTR void VKAPI_CALL
8888radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)
8889{
8890   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8891   const VkRenderingFragmentShadingRateAttachmentInfoKHR *vrs_info = vk_find_struct_const(
8892      pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
8893   VkResult result;
8894   /* (normal + resolve) for each color attachment and for depth/stencil, plus a VRS attachment */
8895   VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3];
8896   VkAttachmentDescriptionStencilLayout ds_stencil_att, ds_stencil_resolve_att;
8897   VkImageView iviews[MAX_RTS * 2 + 3];
8898   VkAttachmentReference2 color_refs[MAX_RTS], color_resolve_refs[MAX_RTS];
8899   VkAttachmentReference2 ds_ref, ds_resolve_ref, vrs_ref;
8900   VkAttachmentReferenceStencilLayout ds_stencil_ref, ds_stencil_resolve_ref;
8901   VkSubpassDescriptionDepthStencilResolve ds_resolve_info;
8902   VkFragmentShadingRateAttachmentInfoKHR vrs_subpass_info;
8903   VkClearValue clear_values[MAX_RTS * 2 + 3];
8904   unsigned att_count = 0;
8905
8906   VkSubpassDescription2 subpass = {
8907      .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
8908      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
8909      .viewMask = pRenderingInfo->viewMask,
8910      .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
8911      .pColorAttachments = color_refs,
8912      .pResolveAttachments = color_resolve_refs,
8913   };
8914
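   /* Translate the VkRenderingInfo attachments into temporary
    * VkAttachmentDescription2/VkAttachmentReference2 records so the dynamic
    * rendering path can be funneled through the driver's render-pass handling.
    */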
8915   for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; ++i) {
8916      color_refs[i] = (VkAttachmentReference2){
8917         .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8918         .attachment = VK_ATTACHMENT_UNUSED,
8919      };
8920      color_resolve_refs[i] = (VkAttachmentReference2){
8921         .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8922         .attachment = VK_ATTACHMENT_UNUSED,
8923      };
8924
8925      if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
8926         continue;
8927
8928      const VkRenderingAttachmentInfo *info = &pRenderingInfo->pColorAttachments[i];
8929      RADV_FROM_HANDLE(radv_image_view, iview, info->imageView);
8930      color_refs[i] = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8931                                               .attachment = att_count,
8932                                               .layout = info->imageLayout,
8933                                               .aspectMask = iview->vk.aspects};
8934
8935      iviews[att_count] = info->imageView;
8936      clear_values[att_count] = info->clearValue;
8937      VkAttachmentDescription2 *att = att_desc + att_count++;
8938
8939      memset(att, 0, sizeof(*att));
8940      att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8941      att->format = iview->vk.format;
8942      att->samples = iview->image->info.samples;
8943      att->loadOp = info->loadOp;
8944      att->storeOp = info->storeOp;
8945      att->initialLayout = info->imageLayout;
8946      att->finalLayout = info->imageLayout;
8947
8948      if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT)
8949         att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8950
8951      if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)
8952         att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8953
8954      if (info->resolveMode != VK_RESOLVE_MODE_NONE &&
8955          !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) {
8956         RADV_FROM_HANDLE(radv_image_view, resolve_iview, info->resolveImageView);
8957         color_resolve_refs[i] =
8958            (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8959                                     .attachment = att_count,
8960                                     .layout = info->resolveImageLayout,
8961                                     .aspectMask = resolve_iview->vk.aspects};
8962
8963         iviews[att_count] = info->resolveImageView;
8964         att = att_desc + att_count++;
8965
8966         memset(att, 0, sizeof(*att));
8967         att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8968         att->format = resolve_iview->vk.format;
8969         att->samples = resolve_iview->image->info.samples;
8970         att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
8971         att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8972         att->initialLayout = info->resolveImageLayout;
8973         att->finalLayout = info->resolveImageLayout;
8974      }
8975   }
8976
8977   if (pRenderingInfo->pDepthAttachment || pRenderingInfo->pStencilAttachment) {
8978      const VkRenderingAttachmentInfo *common_info = pRenderingInfo->pDepthAttachment
8979                                                           ? pRenderingInfo->pDepthAttachment
8980                                                           : pRenderingInfo->pStencilAttachment;
8981      RADV_FROM_HANDLE(radv_image_view, iview, common_info->imageView);
8982
8983      if (common_info->imageView != VK_NULL_HANDLE) {
8984         ds_ref = (VkAttachmentReference2){
8985            .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8986            .attachment = att_count,
8987            .layout = common_info->imageLayout,
8988            .aspectMask = (pRenderingInfo->pDepthAttachment ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
8989                          (pRenderingInfo->pStencilAttachment ? VK_IMAGE_ASPECT_STENCIL_BIT : 0)};
8990         subpass.pDepthStencilAttachment = &ds_ref;
8991
8992         iviews[att_count] = common_info->imageView;
8993         if (pRenderingInfo->pDepthAttachment)
8994            clear_values[att_count].depthStencil.depth =
8995               pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
8996         if (pRenderingInfo->pStencilAttachment)
8997            clear_values[att_count].depthStencil.stencil =
8998               pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
8999         VkAttachmentDescription2 *att = att_desc + att_count++;
9000
9001         memset(att, 0, sizeof(*att));
9002         att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9003         att->format = iview->vk.format;
9004         att->samples = iview->image->info.samples;
9005
9006         if (pRenderingInfo->pDepthAttachment) {
9007            att->loadOp = pRenderingInfo->pDepthAttachment->loadOp;
9008            att->storeOp = pRenderingInfo->pDepthAttachment->storeOp;
9009         } else {
9010            att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9011            att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9012         }
9013
9014         if (pRenderingInfo->pStencilAttachment) {
9015            att->stencilLoadOp = pRenderingInfo->pStencilAttachment->loadOp;
9016            att->stencilStoreOp = pRenderingInfo->pStencilAttachment->storeOp;
9017         } else {
9018            att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9019            att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
9020         }
9021
9022         if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT) {
9023            att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9024            att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9025         }
9026
9027         if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT) {
9028            att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9029            att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
9030         }
9031
9032         att->initialLayout = common_info->imageLayout;
9033         att->finalLayout = common_info->imageLayout;
9034
9035         if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment) {
9036            ds_ref.pNext = &ds_stencil_ref;
9037            ds_stencil_ref = (VkAttachmentReferenceStencilLayout){
9038               .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT,
9039               .stencilLayout = pRenderingInfo->pStencilAttachment->imageLayout};
9040
9041            att->pNext = &ds_stencil_att;
9042            ds_stencil_att = (VkAttachmentDescriptionStencilLayout){
9043               .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT,
9044               .stencilInitialLayout = pRenderingInfo->pStencilAttachment->imageLayout,
9045               .stencilFinalLayout = pRenderingInfo->pStencilAttachment->imageLayout,
9046            };
9047         }
9048
9049         if (((pRenderingInfo->pDepthAttachment &&
9050              pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE) ||
9051             (pRenderingInfo->pStencilAttachment &&
9052              pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)) &&
9053             !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) {
9054            RADV_FROM_HANDLE(radv_image_view, resolve_iview, common_info->resolveImageView);
9055            ds_resolve_ref =
9056               (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
9057                                        .attachment = att_count,
9058                                        .layout = common_info->resolveImageLayout,
9059                                        .aspectMask = resolve_iview->vk.aspects};
9060
9061            iviews[att_count] = common_info->resolveImageView;
9062            att = att_desc + att_count++;
9063
9064            memset(att, 0, sizeof(*att));
9065            att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9066            att->format = resolve_iview->vk.format;
9067            att->samples = resolve_iview->image->info.samples;
9068            att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
9069            att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9070            att->initialLayout = common_info->resolveImageLayout;
9071            att->finalLayout = common_info->resolveImageLayout;
9072
9073            ds_resolve_info = (VkSubpassDescriptionDepthStencilResolve){
9074               .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE,
9075               .pNext = subpass.pNext,
9076               .depthResolveMode =
9077                  (pRenderingInfo->pDepthAttachment &&
9078                   pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE)
9079                     ? pRenderingInfo->pDepthAttachment->resolveMode
9080                     : VK_RESOLVE_MODE_NONE,
9081               .stencilResolveMode =
9082                  (pRenderingInfo->pStencilAttachment &&
9083                   pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)
9084                     ? pRenderingInfo->pStencilAttachment->resolveMode
9085                     : VK_RESOLVE_MODE_NONE,
9086               .pDepthStencilResolveAttachment = &ds_resolve_ref};
9087            subpass.pNext = &ds_resolve_info;
9088
9089            if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment &&
9090                pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE &&
9091                pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE) {
9092               ds_resolve_ref.pNext = &ds_stencil_resolve_ref;
9093               ds_stencil_resolve_ref = (VkAttachmentReferenceStencilLayout){
9094                  .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT,
9095                  .stencilLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout};
9096
9097               att->pNext = &ds_stencil_resolve_att;
9098               ds_stencil_resolve_att = (VkAttachmentDescriptionStencilLayout){
9099                  .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT,
9100                  .stencilInitialLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout,
9101                  .stencilFinalLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout,
9102               };
9103            }
9104         }
9105      }
9106   }
9107
9108   if (vrs_info && vrs_info->imageView) {
9109      RADV_FROM_HANDLE(radv_image_view, iview, vrs_info->imageView);
9110      vrs_ref = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
9111                                         .attachment = att_count,
9112                                         .layout = vrs_info->imageLayout,
9113                                         .aspectMask = iview->vk.aspects};
9114
9115      iviews[att_count] = vrs_info->imageView;
9116      VkAttachmentDescription2 *att = att_desc + att_count++;
9117
9118      memset(att, 0, sizeof(*att));
9119      att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9120      att->format = iview->vk.format;
9121      att->samples = iview->image->info.samples;
9122      att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9123      att->storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
9124      att->initialLayout = vrs_info->imageLayout;
9125      att->finalLayout = vrs_info->imageLayout;
9126
9127      vrs_subpass_info = (VkFragmentShadingRateAttachmentInfoKHR){
9128         .sType = VK_STRUCTURE_TYPE_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR,
9129         .pNext = subpass.pNext,
9130         .pFragmentShadingRateAttachment = &vrs_ref,
9131         .shadingRateAttachmentTexelSize = vrs_info->shadingRateAttachmentTexelSize,
9132      };
9133      subpass.pNext = &vrs_subpass_info;
9134   }
9135
9136   VkRenderPassCreateInfo2 rp_create_info = {
9137      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
9138      .attachmentCount = att_count,
9139      .pAttachments = att_desc,
9140      .subpassCount = 1,
9141      .pSubpasses = &subpass,
9142   };
9143
9144   VkRenderPass rp;
9145   result =
9146      radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
9147   if (result != VK_SUCCESS) {
9148      cmd_buffer->record_result = result;
9149      return;
9150   }
9151
9152   unsigned w = pRenderingInfo->renderArea.offset.x + pRenderingInfo->renderArea.extent.width;
9153   unsigned h = pRenderingInfo->renderArea.offset.y + pRenderingInfo->renderArea.extent.height;
9154   for (unsigned i = 0; i < att_count; ++i) {
9155      RADV_FROM_HANDLE(radv_image_view, iview, iviews[i]);
9156
9157      if (vrs_info && vrs_info->imageView == iviews[i])
9158         continue;
9159
9160      w = MIN2(w, iview->extent.width);
9161      h = MIN2(h, iview->extent.height);
9162   }
9163   VkFramebufferCreateInfo fb_create_info = {
9164      .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
9165      .renderPass = rp,
9166      .attachmentCount = att_count,
9167      .pAttachments = iviews,
9168      .width = w,
9169      .height = h,
9170      .layers = pRenderingInfo->layerCount,
9171   };
9172
9173   VkFramebuffer fb;
9174   result =
9175      vk_common_CreateFramebuffer(radv_device_to_handle(cmd_buffer->device), &fb_create_info, NULL, &fb);
9176   if (result != VK_SUCCESS) {
9177      radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device), rp, NULL);
9178      cmd_buffer->record_result = result;
9179      return;
9180   }
9181
9182   VkRenderPassBeginInfo begin_info = {.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
9183                                       .renderPass = rp,
9184                                       .framebuffer = fb,
9185                                       .renderArea = pRenderingInfo->renderArea,
9186                                       .clearValueCount = att_count,
9187                                       .pClearValues = clear_values};
9188
9189   const VkSubpassBeginInfo pass_begin_info = {
9190      .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
9191      .contents = (pRenderingInfo->flags & VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT)
9192                     ? VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS
9193                     : VK_SUBPASS_CONTENTS_INLINE,
9194   };
9195
9196   radv_CmdBeginRenderPass2(commandBuffer, &begin_info, &pass_begin_info);
9197}
9198
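/* Dynamic rendering is layered on top of the legacy render pass code: end the
 * temporary render pass started by vkCmdBeginRendering() and destroy the
 * render pass and framebuffer objects that were created for it.
 */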
9199VKAPI_ATTR void VKAPI_CALL
9200radv_CmdEndRendering(VkCommandBuffer commandBuffer)
9201{
9202   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9203   struct radv_render_pass *pass = cmd_buffer->state.pass;
9204   struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
9205
9206   radv_CmdEndRenderPass2(commandBuffer, NULL);
9207
9208   vk_common_DestroyFramebuffer(radv_device_to_handle(cmd_buffer->device),
9209                                vk_framebuffer_to_handle(framebuffer), NULL);
9210   radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
9211                          radv_render_pass_to_handle(pass), NULL);
9212}
9213
9214/*
9215 * For HTILE we have the following interesting clear words:
9216 *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
9217 *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
9218 *   0xfffffff0: Clear depth to 1.0
9219 *   0x00000000: Clear depth to 0.0
9220 */
9221static void
9222radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9223                      const VkImageSubresourceRange *range)
9224{
9225   struct radv_cmd_state *state = &cmd_buffer->state;
9226   uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
9227   VkClearDepthStencilValue value = {0};
9228   struct radv_barrier_data barrier = {0};
9229
9230   barrier.layout_transitions.init_mask_ram = 1;
9231   radv_describe_layout_transition(cmd_buffer, &barrier);
9232
9233   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
9234    * consistent in considering previous rendering work for WAW hazards. */
9235   state->flush_bits |=
9236      radv_src_access_flush(cmd_buffer, VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
9237
9238   if (image->planes[0].surface.has_stencil &&
9239       !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
9240      /* Flush caches before performing a separate aspect initialization because it's a
9241       * read-modify-write operation.
9242       */
9243      state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT, image);
9244   }
9245
9246   state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
9247
9248   radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
9249
9250   if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
9251      /* Initialize the TC-compat metadata value to 0 because by
9252       * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
9253       * have to conditionally update its value when performing
9254       * a fast depth clear.
9255       */
9256      radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
9257   }
9258}
9259
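/* Handle HTILE across a depth/stencil layout transition: (re)initialize HTILE
 * when coming from UNDEFINED or when entering an HTILE-compressed layout, and
 * expand (decompress) the depth/stencil surface when leaving a compressed
 * layout for one that cannot be accessed compressed.
 */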
9260static void
9261radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9262                                   VkImageLayout src_layout, bool src_render_loop,
9263                                   VkImageLayout dst_layout, bool dst_render_loop,
9264                                   unsigned src_queue_mask, unsigned dst_queue_mask,
9265                                   const VkImageSubresourceRange *range,
9266                                   struct radv_sample_locations_state *sample_locs)
9267{
9268   struct radv_device *device = cmd_buffer->device;
9269
9270   if (!radv_htile_enabled(image, range->baseMipLevel))
9271      return;
9272
9273   if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
9274      radv_initialize_htile(cmd_buffer, image, range);
9275   } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
9276                                               src_queue_mask) &&
9277              radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
9278                                              dst_queue_mask)) {
9279      radv_initialize_htile(cmd_buffer, image, range);
9280   } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
9281                                              src_queue_mask) &&
9282              !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
9283                                               dst_queue_mask)) {
9284      cmd_buffer->state.flush_bits |=
9285         RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
9286
9287      radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
9288
9289      cmd_buffer->state.flush_bits |=
9290         RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
9291   }
9292}
9293
9294static uint32_t
9295radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9296                const VkImageSubresourceRange *range, uint32_t value)
9297{
9298   struct radv_barrier_data barrier = {0};
9299
9300   barrier.layout_transitions.init_mask_ram = 1;
9301   radv_describe_layout_transition(cmd_buffer, &barrier);
9302
9303   return radv_clear_cmask(cmd_buffer, image, range, value);
9304}
9305
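/* Initialize FMASK to the "expanded" clear value for the image's sample count
 * (e.g. 0x76543210 for 8 samples, i.e. each sample slot references its own
 * color fragment).
 */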
9306uint32_t
9307radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9308                const VkImageSubresourceRange *range)
9309{
9310   static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
9311   uint32_t log2_samples = util_logbase2(image->info.samples);
9312   uint32_t value = fmask_clear_values[log2_samples];
9313   struct radv_barrier_data barrier = {0};
9314
9315   barrier.layout_transitions.init_mask_ram = 1;
9316   radv_describe_layout_transition(cmd_buffer, &barrier);
9317
9318   return radv_clear_fmask(cmd_buffer, image, range, value);
9319}
9320
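/* Initialize DCC metadata with the given clear value. On GFX8, mip levels that
 * do not support fast clears are additionally initialized to 0xffffffff
 * ("fully expanded").
 */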
9321uint32_t
9322radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9323              const VkImageSubresourceRange *range, uint32_t value)
9324{
9325   struct radv_barrier_data barrier = {0};
9326   uint32_t flush_bits = 0;
9327   unsigned size = 0;
9328
9329   barrier.layout_transitions.init_mask_ram = 1;
9330   radv_describe_layout_transition(cmd_buffer, &barrier);
9331
9332   flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
9333
9334   if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX8) {
9335      /* When DCC is enabled with mipmaps, some levels might not
9336       * support fast clears and we have to initialize them as "fully
9337       * expanded".
9338       */
9339      /* Compute the size of all fast clearable DCC levels. */
9340      for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
9341         struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
9342         unsigned dcc_fast_clear_size =
9343            dcc_level->dcc_slice_fast_clear_size * image->info.array_size;
9344
9345         if (!dcc_fast_clear_size)
9346            break;
9347
9348         size = dcc_level->dcc_offset + dcc_fast_clear_size;
9349      }
9350
9351      /* Initialize the mipmap levels without DCC. */
9352      if (size != image->planes[0].surface.meta_size) {
9353         flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo,
9354                                        radv_buffer_get_va(image->bindings[0].bo) +
9355                                           image->bindings[0].offset +
9356                                           image->planes[0].surface.meta_offset + size,
9357                                        image->planes[0].surface.meta_size - size, 0xffffffff);
9358      }
9359   }
9360
9361   return flush_bits;
9362}
9363
9364/**
9365 * Initialize DCC/FMASK/CMASK metadata for a color image.
9366 */
9367static void
9368radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9369                               VkImageLayout src_layout, bool src_render_loop,
9370                               VkImageLayout dst_layout, bool dst_render_loop,
9371                               unsigned src_queue_mask, unsigned dst_queue_mask,
9372                               const VkImageSubresourceRange *range)
9373{
9374   uint32_t flush_bits = 0;
9375
9376   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
9377    * consistent in considering previous rendering work for WAW hazards.
9378    */
9379   cmd_buffer->state.flush_bits |=
9380      radv_src_access_flush(cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image);
9381
9382   if (radv_image_has_cmask(image)) {
9383      uint32_t value;
9384
9385      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
9386         /* TODO: Fix clearing CMASK layers on GFX9. */
9387         if (radv_image_is_tc_compat_cmask(image) ||
9388             (radv_image_has_fmask(image) &&
9389              radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
9390                                         dst_render_loop, dst_queue_mask))) {
9391            value = 0xccccccccu;
9392         } else {
9393            value = 0xffffffffu;
9394         }
9395      } else {
9396         static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
9397         uint32_t log2_samples = util_logbase2(image->info.samples);
9398
9399         value = cmask_clear_values[log2_samples];
9400      }
9401
9402      flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
9403   }
9404
9405   if (radv_image_has_fmask(image)) {
9406      flush_bits |= radv_init_fmask(cmd_buffer, image, range);
9407   }
9408
9409   if (radv_dcc_enabled(image, range->baseMipLevel)) {
9410      uint32_t value = 0xffffffffu; /* Fully expanded mode. */
9411
9412      if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9413                                     dst_layout, dst_render_loop, dst_queue_mask)) {
9414         value = 0u;
9415      }
9416
9417      flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
9418   }
9419
9420   if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
9421      radv_update_fce_metadata(cmd_buffer, image, range, false);
9422
9423      uint32_t color_values[2] = {0};
9424      radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
9425   }
9426
9427   cmd_buffer->state.flush_bits |= flush_bits;
9428}
9429
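/* Keep the displayable DCC surface up to date: retile DCC when a writable
 * image transitions to PRESENT_SRC or becomes visible to a foreign queue.
 */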
9430static void
9431radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9432                       VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask)
9433{
9434   /* If the image is read-only, we don't have to retile DCC because it can't change. */
9435   if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS))
9436      return;
9437
9438   if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
9439       (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR ||
9440        (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
9441      radv_retile_dcc(cmd_buffer, image);
9442}
9443
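/* DCC retiling is only required when the image has a separate displayable DCC
 * surface, i.e. display_dcc_offset differs from the main DCC metadata offset.
 */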
9444static bool
9445radv_image_need_retile(const struct radv_image *image)
9446{
9447   return image->planes[0].surface.display_dcc_offset &&
9448          image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
9449}
9450
9451/**
9452 * Handle color image transitions for DCC/FMASK/CMASK.
9453 */
9454static void
9455radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9456                                   VkImageLayout src_layout, bool src_render_loop,
9457                                   VkImageLayout dst_layout, bool dst_render_loop,
9458                                   unsigned src_queue_mask, unsigned dst_queue_mask,
9459                                   const VkImageSubresourceRange *range)
9460{
9461   bool dcc_decompressed = false, fast_clear_flushed = false;
9462
9463   if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) &&
9464       !radv_dcc_enabled(image, range->baseMipLevel))
9465      return;
9466
9467   if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
9468      radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
9469                                     dst_render_loop, src_queue_mask, dst_queue_mask, range);
9470
9471      if (radv_image_need_retile(image))
9472         radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
9473      return;
9474   }
9475
9476   if (radv_dcc_enabled(image, range->baseMipLevel)) {
9477      if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
9478         cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
9479      } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9480                                            src_layout, src_render_loop, src_queue_mask) &&
9481                 !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9482                                             dst_layout, dst_render_loop, dst_queue_mask)) {
9483         radv_decompress_dcc(cmd_buffer, image, range);
9484         dcc_decompressed = true;
9485      } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9486                                            src_layout, src_render_loop, src_queue_mask) &&
9487                 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9488                                             dst_layout, dst_render_loop, dst_queue_mask)) {
9489         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9490         fast_clear_flushed = true;
9491      }
9492
9493      if (radv_image_need_retile(image))
9494         radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
9495   } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
9496      if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9497                                     src_layout, src_render_loop, src_queue_mask) &&
9498          !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9499                                      dst_layout, dst_render_loop, dst_queue_mask)) {
9500         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9501         fast_clear_flushed = true;
9502      }
9503   }
9504
9505   /* MSAA color decompress. */
9506   if (radv_image_has_fmask(image) &&
9507       (image->vk.usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) &&
9508       radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) &&
9509       !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) {
9510      if (radv_dcc_enabled(image, range->baseMipLevel) &&
9511          !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) {
9512         /* A DCC decompress is required before expanding FMASK
9513          * when DCC stores aren't supported to avoid being in
9514          * a state where DCC is compressed and the main
9515          * surface is uncompressed.
9516          */
9517         radv_decompress_dcc(cmd_buffer, image, range);
9518      } else if (!fast_clear_flushed) {
9519         /* An FMASK decompress is required before expanding
9520          * FMASK.
9521          */
9522         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9523      }
9524
9525      struct radv_barrier_data barrier = {0};
9526      barrier.layout_transitions.fmask_color_expand = 1;
9527      radv_describe_layout_transition(cmd_buffer, &barrier);
9528
9529      radv_expand_fmask_image_inplace(cmd_buffer, image, range);
9530   }
9531}
9532
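/* Entry point for image layout transitions: convert the queue family indices
 * to queue masks, skip transitions that the corresponding acquire/release on a
 * more flexible queue will perform, and dispatch to the depth/stencil or color
 * handling.
 */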
9533static void
9534radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9535                             VkImageLayout src_layout, bool src_render_loop,
9536                             VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family_index,
9537                             uint32_t dst_family_index, const VkImageSubresourceRange *range,
9538                             struct radv_sample_locations_state *sample_locs)
9539{
9540   enum radv_queue_family src_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, src_family_index);
9541   enum radv_queue_family dst_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, dst_family_index);
9542   if (image->exclusive && src_family_index != dst_family_index) {
9543      /* This is an acquire or a release operation and there will be
9544       * a corresponding release/acquire. Do the transition in the
9545       * most flexible queue. */
9546
9547      assert(src_qf == cmd_buffer->qf ||
9548             dst_qf == cmd_buffer->qf);
9549
9550      if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT)
9551         return;
9552
9553      if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
9554         return;
9555
9556      if (cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
9557          (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL))
9558         return;
9559   }
9560
9561   unsigned src_queue_mask =
9562      radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf);
9563   unsigned dst_queue_mask =
9564      radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf);
9565
9566   if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask)
9567      return;
9568
9569   if (vk_format_has_depth(image->vk.format)) {
9570      radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
9571                                         dst_render_loop, src_queue_mask, dst_queue_mask, range,
9572                                         sample_locs);
9573   } else {
9574      radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
9575                                         dst_render_loop, src_queue_mask, dst_queue_mask, range);
9576   }
9577}
9578
9579static void
9580radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask)
9581{
9582   /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a
9583    * buffer (or an MSAA image using FMASK). Note that updating a buffer is considered a clear
9584    * operation but it might also use a CP DMA copy in some rare situations. Other operations using
9585    * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC).
9586    */
9587   if (stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
9588                     VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
9589                     VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
9590      si_cp_dma_wait_for_idle(cmd_buffer);
9591}
9592
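/* Shared implementation of vkCmdPipelineBarrier2() and vkCmdWaitEvents2():
 * accumulate stage masks and cache flush bits from all memory/buffer/image
 * barriers, flush the source stages, perform the requested image layout
 * transitions and finally record the destination flushes.
 */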
9593static void
9594radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_info,
9595             enum rgp_barrier_reason reason)
9596{
9597   enum radv_cmd_flush_bits src_flush_bits = 0;
9598   enum radv_cmd_flush_bits dst_flush_bits = 0;
9599   VkPipelineStageFlags2 src_stage_mask = 0;
9600   VkPipelineStageFlags2 dst_stage_mask = 0;
9601
9602   if (cmd_buffer->state.subpass)
9603      radv_mark_noncoherent_rb(cmd_buffer);
9604
9605   radv_describe_barrier_start(cmd_buffer, reason);
9606
9607   for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
9608      src_stage_mask |= dep_info->pMemoryBarriers[i].srcStageMask;
9609      src_flush_bits |=
9610         radv_src_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].srcAccessMask, NULL);
9611      dst_stage_mask |= dep_info->pMemoryBarriers[i].dstStageMask;
9612      dst_flush_bits |=
9613         radv_dst_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].dstAccessMask, NULL);
9614   }
9615
9616   for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
9617      src_stage_mask |= dep_info->pBufferMemoryBarriers[i].srcStageMask;
9618      src_flush_bits |=
9619         radv_src_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].srcAccessMask, NULL);
9620      dst_stage_mask |= dep_info->pBufferMemoryBarriers[i].dstStageMask;
9621      dst_flush_bits |=
9622         radv_dst_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].dstAccessMask, NULL);
9623   }
9624
9625   for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
9626      RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
9627
9628      src_stage_mask |= dep_info->pImageMemoryBarriers[i].srcStageMask;
9629      src_flush_bits |=
9630         radv_src_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].srcAccessMask, image);
9631      dst_stage_mask |= dep_info->pImageMemoryBarriers[i].dstStageMask;
9632      dst_flush_bits |=
9633         radv_dst_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].dstAccessMask, image);
9634   }
9635
9636   /* The Vulkan spec 1.1.98 says:
9637    *
9638    * "An execution dependency with only
9639    *  VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask
9640    *  will only prevent that stage from executing in subsequently
9641    *  submitted commands. As this stage does not perform any actual
9642    *  execution, this is not observable - in effect, it does not delay
9643    *  processing of subsequent commands. Similarly an execution dependency
9644    *  with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask
9645    *  will effectively not wait for any prior commands to complete."
9646    */
9647   if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
9648      radv_stage_flush(cmd_buffer, src_stage_mask);
9649   cmd_buffer->state.flush_bits |= src_flush_bits;
9650
9651   radv_ace_internal_barrier(cmd_buffer, src_stage_mask, 0);
9652
9653   for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
9654      RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
9655
9656      const struct VkSampleLocationsInfoEXT *sample_locs_info =
9657         vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
9658      struct radv_sample_locations_state sample_locations;
9659
9660      if (sample_locs_info) {
9661         assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
9662         sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
9663         sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
9664         sample_locations.count = sample_locs_info->sampleLocationsCount;
9665         typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
9666                      sample_locs_info->sampleLocationsCount);
9667      }
9668
9669      radv_handle_image_transition(
9670         cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout,
9671         false, /* Outside of a renderpass we are never in a renderloop */
9672         dep_info->pImageMemoryBarriers[i].newLayout,
9673         false, /* Outside of a renderpass we are never in a renderloop */
9674         dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex,
9675         dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex,
9676         &dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
9677   }
9678
9679   radv_ace_internal_barrier(cmd_buffer, 0, dst_stage_mask);
9680   radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
9681
9682   cmd_buffer->state.flush_bits |= dst_flush_bits;
9683
9684   radv_describe_barrier_end(cmd_buffer);
9685}
9686
9687VKAPI_ATTR void VKAPI_CALL
9688radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
9689                         const VkDependencyInfo *pDependencyInfo)
9690{
9691   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9692
9693   radv_barrier(cmd_buffer, pDependencyInfo, RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER);
9694}
9695
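/* Write "value" to the event BO once the stages in stageMask have completed.
 * The cheapest mechanism is selected: a WRITE_DATA on PFP or ME for
 * top-of-pipe/index-fetch stages, otherwise an EOP event (PS_DONE, CS_DONE or
 * BOTTOM_OF_PIPE_TS) that waits for the relevant shader work.
 */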
9696static void
9697write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event,
9698            VkPipelineStageFlags2 stageMask, unsigned value)
9699{
9700   struct radeon_cmdbuf *cs = cmd_buffer->cs;
9701   uint64_t va = radv_buffer_get_va(event->bo);
9702
9703   si_emit_cache_flush(cmd_buffer);
9704
9705   radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
9706
9707   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
9708
9709   if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT |
9710                    VK_PIPELINE_STAGE_2_RESOLVE_BIT |
9711                    VK_PIPELINE_STAGE_2_BLIT_BIT |
9712                    VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
9713      /* Be conservative for now. */
9714      stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
9715   }
9716
9717   /* Flags that only require a top-of-pipe event. */
9718   VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
9719
9720   /* Flags that only require a post-index-fetch event. */
9721   VkPipelineStageFlags2 post_index_fetch_flags =
9722      top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT;
9723
9724   /* Flags that only require signaling post PS. */
9725   VkPipelineStageFlags2 post_ps_flags =
9726      post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
9727      VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
9728      VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
9729      VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
9730      VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
9731      VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
9732      VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
9733      VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;
9734
9735   /* Flags that only require signaling post CS. */
9736   VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
9737
9738   radv_cp_dma_wait_for_stages(cmd_buffer, stageMask);
9739
9740   if (!(stageMask & ~top_of_pipe_flags)) {
9741      /* Just need to sync the PFP engine. */
9742      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
9743      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
9744      radeon_emit(cs, va);
9745      radeon_emit(cs, va >> 32);
9746      radeon_emit(cs, value);
9747   } else if (!(stageMask & ~post_index_fetch_flags)) {
9748      /* Sync ME because PFP reads index and indirect buffers. */
9749      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
9750      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
9751      radeon_emit(cs, va);
9752      radeon_emit(cs, va >> 32);
9753      radeon_emit(cs, value);
9754   } else {
9755      unsigned event_type;
9756
9757      if (!(stageMask & ~post_ps_flags)) {
9758         /* Sync previous fragment shaders. */
9759         event_type = V_028A90_PS_DONE;
9760      } else if (!(stageMask & ~post_cs_flags)) {
9761         /* Sync previous compute shaders. */
9762         event_type = V_028A90_CS_DONE;
9763      } else {
9764         /* Otherwise, sync all prior GPU work. */
9765         event_type = V_028A90_BOTTOM_OF_PIPE_TS;
9766      }
9767
9768      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
9769                                 radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
9770                                 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
9771                                 cmd_buffer->gfx9_eop_bug_va);
9772   }
9773
9774   assert(cmd_buffer->cs->cdw <= cdw_max);
9775}
9776
9777VKAPI_ATTR void VKAPI_CALL
9778radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event,
9779                  const VkDependencyInfo* pDependencyInfo)
9780{
9781   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9782   RADV_FROM_HANDLE(radv_event, event, _event);
9783   VkPipelineStageFlags2 src_stage_mask = 0;
9784
9785   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
9786      src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
9787   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
9788      src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
9789   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
9790      src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
9791
9792   write_event(cmd_buffer, event, src_stage_mask, 1);
9793}
9794
9795VKAPI_ATTR void VKAPI_CALL
9796radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event,
9797                    VkPipelineStageFlags2 stageMask)
9798{
9799   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9800   RADV_FROM_HANDLE(radv_event, event, _event);
9801
9802   write_event(cmd_buffer, event, stageMask, 0);
9803}
9804
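/* Wait for each event by polling its BO until the value written by
 * radv_CmdSetEvent2() equals 1, then apply the dependency info like a regular
 * pipeline barrier.
 */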
9805VKAPI_ATTR void VKAPI_CALL
9806radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
9807                    const VkDependencyInfo* pDependencyInfos)
9808{
9809   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9810   struct radeon_cmdbuf *cs = cmd_buffer->cs;
9811
9812   for (unsigned i = 0; i < eventCount; ++i) {
9813      RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
9814      uint64_t va = radv_buffer_get_va(event->bo);
9815
9816      radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
9817
9818      ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
9819
9820      radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
9821      assert(cmd_buffer->cs->cdw <= cdw_max);
9822   }
9823
9824   radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
9825}
9826
9827VKAPI_ATTR void VKAPI_CALL
9828radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
9829{
9830   /* No-op */
9831}
9832
9833/* VK_EXT_conditional_rendering */
9834VKAPI_ATTR void VKAPI_CALL
9835radv_CmdBeginConditionalRenderingEXT(
9836   VkCommandBuffer commandBuffer,
9837   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
9838{
9839   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9840   RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
9841   struct radeon_cmdbuf *cs = cmd_buffer->cs;
9842   unsigned pred_op = PREDICATION_OP_BOOL32;
9843   bool draw_visible = true;
9844   uint64_t va;
9845
9846   va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
9847
9848   /* By default, if the 32-bit value at offset in buffer memory is zero,
9849    * then the rendering commands are discarded, otherwise they are
9850    * executed as normal. If the inverted flag is set, all commands are
9851    * discarded if the value is non-zero.
9852    */
9853   if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
9854      draw_visible = false;
9855   }
9856
9857   si_emit_cache_flush(cmd_buffer);
9858
9859   if (cmd_buffer->qf == RADV_QUEUE_GENERAL &&
9860       !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
9861      uint64_t pred_value = 0, pred_va;
9862      unsigned pred_offset;
9863
9864      /* From the Vulkan spec 1.1.107:
9865       *
9866       * "If the 32-bit value at offset in buffer memory is zero,
9867       *  then the rendering commands are discarded, otherwise they
9868       *  are executed as normal. If the value of the predicate in
9869       *  buffer memory changes while conditional rendering is
9870       *  active, the rendering commands may be discarded in an
9871       *  implementation-dependent way. Some implementations may
9872       *  latch the value of the predicate upon beginning conditional
9873       *  rendering while others may read it before every rendering
9874       *  command."
9875       *
9876       * But the AMD hardware treats the predicate as a 64-bit
9877       * value, which means we need a workaround in the driver.
9878       * Luckily, we aren't required to support the case where the
9879       * value changes while predication is active.
9880       *
9881       * The workaround is as follows:
9882       * 1) allocate a 64-bit value in the upload BO and initialize
9883       *    it to 0
9884       * 2) copy the 32-bit predicate value to the upload BO
9885       * 3) use the newly allocated VA address for predication
9886       *
9887       * Based on the conditionalrender demo, it's faster to do the
9888       * COPY_DATA in ME (+ sync PFP) instead of PFP.
9889       */
9890      radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
9891
9892      pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
9893
9894      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
9895      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
9896                         COPY_DATA_WR_CONFIRM);
9897      radeon_emit(cs, va);
9898      radeon_emit(cs, va >> 32);
9899      radeon_emit(cs, pred_va);
9900      radeon_emit(cs, pred_va >> 32);
9901
9902      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
9903      radeon_emit(cs, 0);
9904
9905      va = pred_va;
9906      pred_op = PREDICATION_OP_BOOL64;
9907   }
9908
9909   /* MEC doesn't support predication, so we emulate it elsewhere. */
9910   if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
9911      si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
9912   }
9913
9914   /* Store conditional rendering user info. */
9915   cmd_buffer->state.predicating = true;
9916   cmd_buffer->state.predication_type = draw_visible;
9917   cmd_buffer->state.predication_op = pred_op;
9918   cmd_buffer->state.predication_va = va;
9919   cmd_buffer->mec_inv_pred_emitted = false;
9920}
9921
9922VKAPI_ATTR void VKAPI_CALL
9923radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
9924{
9925   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9926
9927   /* MEC doesn't support predication, so there is no need to emit anything here. */
9928   if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
9929      si_emit_set_predication_state(cmd_buffer, false, 0, 0);
9930   }
9931
9932   /* Reset conditional rendering user info. */
9933   cmd_buffer->state.predicating = false;
9934   cmd_buffer->state.predication_type = -1;
9935   cmd_buffer->state.predication_op = 0;
9936   cmd_buffer->state.predication_va = 0;
9937   cmd_buffer->mec_inv_pred_emitted = false;
9938}
9939
9940/* VK_EXT_transform_feedback */
9941VKAPI_ATTR void VKAPI_CALL
9942radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
9943                                        uint32_t bindingCount, const VkBuffer *pBuffers,
9944                                        const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes)
9945{
9946   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9947   struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
9948   uint8_t enabled_mask = 0;
9949
9950   assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
9951   for (uint32_t i = 0; i < bindingCount; i++) {
9952      uint32_t idx = firstBinding + i;
9953
9954      sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
9955      sb[idx].offset = pOffsets[i];
9956
9957      if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
9958         sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset;
9959      } else {
9960         sb[idx].size = pSizes[i];
9961      }
9962
9963      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
9964
9965      enabled_mask |= 1 << idx;
9966   }
9967
9968   cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
9969
9970   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
9971}
9972
9973bool
9974radv_is_streamout_enabled(struct radv_cmd_buffer *cmd_buffer)
9975{
9976   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9977
9978   /* Streamout must be enabled for the PRIMITIVES_GENERATED query to work. */
9979   return (so->streamout_enabled || cmd_buffer->state.prims_gen_query_enabled) &&
9980          !cmd_buffer->state.suspend_streamout;
9981}
9982
9983void
9984radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
9985{
9986   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9987   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
9988   bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
9989   struct radeon_cmdbuf *cs = cmd_buffer->cs;
9990   uint32_t enabled_stream_buffers_mask = 0;
9991
9992   if (pipeline && pipeline->streamout_shader) {
9993      enabled_stream_buffers_mask = pipeline->streamout_shader->info.so.enabled_stream_buffers_mask;
9994   }
9995
9996   radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
9997   radeon_emit(cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) |
9998                      S_028B94_STREAMOUT_1_EN(streamout_enabled) |
9999                      S_028B94_STREAMOUT_2_EN(streamout_enabled) |
10000                      S_028B94_STREAMOUT_3_EN(streamout_enabled));
10001   radeon_emit(cs, so->hw_enabled_mask & enabled_stream_buffers_mask);
10002
10003   cmd_buffer->state.context_roll_without_scissor_emitted = true;
10004}
10005
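/* Track the streamout enable state. hw_enabled_mask replicates the per-buffer
 * enable mask once for each of the 4 vertex streams (e.g. enabled_mask 0x3
 * becomes 0x3333), which matches the layout expected by
 * radv_emit_streamout_enable(). On the legacy (non-NGG) path the registers are
 * only re-emitted when the effective state actually changes; with NGG
 * streamout, the GDS flags (gds_needed/gds_oa_needed) are set instead.
 */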
10006static void
10007radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
10008{
10009   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10010   bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
10011   uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
10012
10013   so->streamout_enabled = enable;
10014
10015   so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) |
10016                         (so->enabled_mask << 12);
10017
10018   if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
10019       ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) ||
10020        (old_hw_enabled_mask != so->hw_enabled_mask)))
10021      radv_emit_streamout_enable(cmd_buffer);
10022
10023   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10024      cmd_buffer->gds_needed = true;
10025      cmd_buffer->gds_oa_needed = true;
10026   }
10027}
10028
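/* Make pending streamout buffer-offset updates visible: reset CP_STRMOUT_CNTL,
 * emit an SO_VGTSTREAMOUT_FLUSH event and wait until the CP reports
 * OFFSET_UPDATE_DONE.
 */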
10029static void
10030radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
10031{
10032   struct radeon_cmdbuf *cs = cmd_buffer->cs;
10033   unsigned reg_strmout_cntl;
10034
10035   /* The register is at different places on different ASICs. */
10036   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
10037      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
10038      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
10039      radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
10040      radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2);
10041      radeon_emit(cs, 0);
10042      radeon_emit(cs, 0);
10043   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
10044      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
10045      radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
10046   } else {
10047      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
10048      radeon_set_config_reg(cs, reg_strmout_cntl, 0);
10049   }
10050
10051   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
10052   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
10053
10054   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
10055   radeon_emit(cs,
10056               WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
10057   radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
10058   radeon_emit(cs, 0);
10059   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
10060   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
10061   radeon_emit(cs, 4);                              /* poll interval */
10062}
10063
10064static void
10065radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10066                          uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10067                          const VkDeviceSize *pCounterBufferOffsets)
10068
10069{
10070   struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
10071   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10072   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
10073   struct radv_shader_info *info = &pipeline->streamout_shader->info;
10074   struct radeon_cmdbuf *cs = cmd_buffer->cs;
10075
10076   radv_flush_vgt_streamout(cmd_buffer);
10077
10078   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10079   u_foreach_bit(i, so->enabled_mask)
10080   {
10081      int32_t counter_buffer_idx = i - firstCounterBuffer;
10082      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10083         counter_buffer_idx = -1;
10084
10085      /* AMD GCN binds streamout buffers as shader resources.
10086       * VGT only counts primitives and tells the shader through
10087       * SGPRs what to do.
10088       */
10089      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
10090      radeon_emit(cs, sb[i].size >> 2);     /* BUFFER_SIZE (in DW) */
10091      radeon_emit(cs, info->so.strides[i]); /* VTX_STRIDE (in DW) */
10092
10093      cmd_buffer->state.context_roll_without_scissor_emitted = true;
10094
10095      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
10096         /* The array of counter buffers is optional. */
10097         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10098         uint64_t va = radv_buffer_get_va(buffer->bo);
10099         uint64_t counter_buffer_offset = 0;
10100
10101         if (pCounterBufferOffsets)
10102            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10103
10104         va += buffer->offset + counter_buffer_offset;
10105
10106         /* Append */
10107         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
10108         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
10109                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
10110         radeon_emit(cs, 0);                                                 /* unused */
10111         radeon_emit(cs, 0);                                                 /* unused */
10112         radeon_emit(cs, va);                                                /* src address lo */
10113         radeon_emit(cs, va >> 32);                                          /* src address hi */
10114
10115         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10116      } else {
10117         /* Start from the beginning. */
10118         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
10119         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
10120                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
10121         radeon_emit(cs, 0);                                                    /* unused */
10122         radeon_emit(cs, 0);                                                    /* unused */
10123         radeon_emit(cs, 0);                                                    /* unused */
10124         radeon_emit(cs, 0);                                                    /* unused */
10125      }
10126   }
10127
10128   radv_set_streamout_enable(cmd_buffer, true);
10129}
10130
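/* NGG streamout path (GFX10+): the buffer offsets live in GDS rather than in
 * VGT registers, so each enabled target is initialized with a DMA_DATA packet
 * that either copies the saved offset from the counter buffer or resets it to
 * zero.
 */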
10131static void
10132gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10133                           uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10134                           const VkDeviceSize *pCounterBufferOffsets)
10135{
10136   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10137   unsigned last_target = util_last_bit(so->enabled_mask) - 1;
10138   struct radeon_cmdbuf *cs = cmd_buffer->cs;
10139
10140   assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
10141   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10142
10143   /* Sync because the next streamout operation will overwrite GDS and we
10144    * have to make sure it's idle.
10145    * TODO: Improve by tracking if there is a streamout operation in
10146    * flight.
10147    */
10148   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
10149   si_emit_cache_flush(cmd_buffer);
10150
10151   u_foreach_bit(i, so->enabled_mask)
10152   {
10153      int32_t counter_buffer_idx = i - firstCounterBuffer;
10154      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10155         counter_buffer_idx = -1;
10156
10157      bool append =
10158         counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
10159      uint64_t va = 0;
10160
10161      if (append) {
10162         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10163         uint64_t counter_buffer_offset = 0;
10164
10165         if (pCounterBufferOffsets)
10166            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10167
10168         va += radv_buffer_get_va(buffer->bo);
10169         va += buffer->offset + counter_buffer_offset;
10170
10171         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10172      }
10173
10174      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
10175      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
10176                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
10177      radeon_emit(cs, va);
10178      radeon_emit(cs, va >> 32);
10179      radeon_emit(cs, 4 * i); /* destination in GDS */
10180      radeon_emit(cs, 0);
10181      radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
10182   }
10183
10184   radv_set_streamout_enable(cmd_buffer, true);
10185}
10186
10187VKAPI_ATTR void VKAPI_CALL
10188radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
10189                                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10190                                  const VkDeviceSize *pCounterBufferOffsets)
10191{
10192   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10193
10194   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10195      gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
10196                                 pCounterBuffers, pCounterBufferOffsets);
10197   } else {
10198      radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10199                                pCounterBufferOffsets);
10200   }
10201}
10202
10203static void
10204radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10205                        uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10206                        const VkDeviceSize *pCounterBufferOffsets)
10207{
10208   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10209   struct radeon_cmdbuf *cs = cmd_buffer->cs;
10210
10211   radv_flush_vgt_streamout(cmd_buffer);
10212
10213   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10214   u_foreach_bit(i, so->enabled_mask)
10215   {
10216      int32_t counter_buffer_idx = i - firstCounterBuffer;
10217      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10218         counter_buffer_idx = -1;
10219
10220      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
10221         /* The array of counter buffers is optional. */
10222         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10223         uint64_t va = radv_buffer_get_va(buffer->bo);
10224         uint64_t counter_buffer_offset = 0;
10225
10226         if (pCounterBufferOffsets)
10227            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10228
10229         va += buffer->offset + counter_buffer_offset;
10230
10231         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
10232         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
10233                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
10234                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
10235         radeon_emit(cs, va);                                  /* dst address lo */
10236         radeon_emit(cs, va >> 32);                            /* dst address hi */
10237         radeon_emit(cs, 0);                                   /* unused */
10238         radeon_emit(cs, 0);                                   /* unused */
10239
10240         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10241      }
10242
10243      /* Deactivate transform feedback by zeroing the buffer size.
10244       * The counters (primitives generated, primitives emitted) may
10245       * be enabled even if there is no buffer bound. This ensures
10246       * that the primitives-emitted query won't increment.
10247       */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);

      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

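/* NGG streamout end (GFX10+): the filled-size counters live in GDS, so each
 * enabled target's counter is copied back to the counter buffer with an
 * end-of-pipe event instead of a STRMOUT_BUFFER_UPDATE packet.
 */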
static void
gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                         const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

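         /* Copy the streamout counter for buffer i from GDS to the counter
          * buffer via an end-of-pipe event (PS_DONE), writing through TC L2.
          */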
         si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                    radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
                                    EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

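/* vkCmdEndTransformFeedbackEXT: stop transform feedback and, for each counter
 * buffer provided by the application, save the number of bytes written so the
 * capture can later be resumed or consumed by vkCmdDrawIndirectByteCountEXT.
 */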
VKAPI_ATTR void VKAPI_CALL
radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                               pCounterBufferOffsets);
   } else {
      radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                              pCounterBufferOffsets);
   }
}

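/* vkCmdDrawIndirectByteCountEXT: draw with a vertex count taken from a
 * transform feedback counter buffer instead of a CPU-provided count; per the
 * spec the effective count is (filled size - counterOffset) / vertexStride,
 * evaluated on the GPU.
 *
 * Typical (hypothetical) application usage, capturing into an XFB buffer and
 * then replaying it:
 *
 *    vkCmdBeginTransformFeedbackEXT(cmd, 0, 1, &counterBuffer, &counterOffset);
 *    vkCmdDraw(cmd, vertexCount, 1, 0, 0);
 *    vkCmdEndTransformFeedbackEXT(cmd, 0, 1, &counterBuffer, &counterOffset);
 *    ... barrier on the counter buffer ...
 *    vkCmdDrawIndirectByteCountEXT(cmd, 1, 0, counterBuffer, counterOffset, 0,
 *                                  xfbVertexStride);
 */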
VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
                                 uint32_t firstInstance, VkBuffer _counterBuffer,
                                 VkDeviceSize counterBufferOffset, uint32_t counterOffset,
                                 uint32_t vertexStride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
   struct radv_draw_info info;

   info.count = 0;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = counterBuffer;
   info.strmout_buffer_offset = counterBufferOffset;
   info.stride = vertexStride;
   info.indexed = false;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
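   /* info.count is 0 because the draw does not get its vertex count from the
    * CPU: with USE_OPAQUE set, the packet path derives it from the counter
    * buffer set up via info.strmout_buffer/info.stride.
    */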
   struct VkMultiDrawInfoEXT minfo = { 0, 0 };
   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
   radv_after_draw(cmd_buffer);
}

/* VK_AMD_buffer_marker */
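/* Write a 32-bit marker to dstBuffer at dstOffset. For TOP_OF_PIPE the marker
 * is written immediately with a COPY_DATA packet; for any later stage it is
 * written from a bottom-of-pipe timestamp event so it lands only after the
 * preceding work has completed.
 */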
VKAPI_ATTR void VKAPI_CALL
radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage,
                              VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;

   si_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);

   if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, marker);
      radeon_emit(cs, 0);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
   } else {
      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
                                 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

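/* VK_NV_device_generated_commands: not supported by radv; this entry point is
 * a stub that aborts if reached.
 */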
VKAPI_ATTR void VKAPI_CALL
radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer,
                                  VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline,
                                  uint32_t groupIndex)
{
   fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
   abort();
}