1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_format.h"
30 #include "vk_render_pass.h"
31 #include "vk_util.h"
32 #include "util/fast_idiv_by_const.h"
33 
34 #include "common/intel_aux_map.h"
35 #include "common/intel_l3_config.h"
36 #include "genxml/gen_macros.h"
37 #include "genxml/genX_pack.h"
38 #include "genxml/gen_rt_pack.h"
39 #include "common/intel_guardband.h"
40 #include "compiler/brw_prim.h"
41 
42 #include "nir/nir_xfb_info.h"
43 
44 #include "ds/intel_tracepoints.h"
45 
46 /* We reserve:
47  *    - GPR 14 for secondary command buffer returns
48  *    - GPR 15 for conditional rendering
49  */
50 #define MI_BUILDER_NUM_ALLOC_GPRS 14
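/* The __gen_* hooks below plug anv's batch emission and address helpers into
 * the shared mi_builder framework before it is included.
 */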
51 #define __gen_get_batch_dwords anv_batch_emit_dwords
52 #define __gen_address_offset anv_address_add
53 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
54 #include "common/mi_builder.h"
55 
56 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
57                                         uint32_t pipeline);
58 
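/* Translate the flush/invalidate/stall fields of an already-packed
 * PIPE_CONTROL back into ANV_PIPE_* bits; used only by the debug dump macro
 * below.
 */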
59 static enum anv_pipe_bits
60 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
61    enum anv_pipe_bits bits = 0;
62    bits |= (pc->DepthCacheFlushEnable) ?  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
63    bits |= (pc->DCFlushEnable) ?  ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
64 #if GFX_VERx10 >= 125
65    bits |= (pc->PSSStallSyncEnable) ?  ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
66 #endif
67 #if GFX_VER >= 12
68    bits |= (pc->TileCacheFlushEnable) ?  ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
69    bits |= (pc->HDCPipelineFlushEnable) ?  ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
70 #endif
71    bits |= (pc->RenderTargetCacheFlushEnable) ?  ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
72    bits |= (pc->VFCacheInvalidationEnable) ?  ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
73    bits |= (pc->StateCacheInvalidationEnable) ?  ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
74    bits |= (pc->ConstantCacheInvalidationEnable) ?  ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
75    bits |= (pc->TextureCacheInvalidationEnable) ?  ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
76    bits |= (pc->InstructionCacheInvalidateEnable) ?  ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
77    bits |= (pc->StallAtPixelScoreboard) ?  ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
78    bits |= (pc->DepthStallEnable) ?  ANV_PIPE_DEPTH_STALL_BIT : 0;
79    bits |= (pc->CommandStreamerStallEnable) ?  ANV_PIPE_CS_STALL_BIT : 0;
80    return bits;
81 }
82 
83 #define anv_debug_dump_pc(pc) \
84    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
85       fputs("pc: emit PC=( ", stderr); \
86       anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
87       fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
88    }
89 
90 static bool
91 is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
92 {
93    struct anv_queue_family *queue_family = cmd_buffer->queue_family;
94    return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
95 }
96 
97 void
98 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
99 {
100    struct anv_device *device = cmd_buffer->device;
101    uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
102 
103    /* If we are emitting a new state base address we probably need to re-emit
104     * binding tables.
105     */
106    cmd_buffer->state.descriptors_dirty |= ~0;
107 
108 #if GFX_VERx10 >= 125
109    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
110       pc.CommandStreamerStallEnable = true;
111       anv_debug_dump_pc(pc);
112    }
113    anv_batch_emit(
114       &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
115       btpa.BindingTablePoolBaseAddress =
116          anv_cmd_buffer_surface_base_address(cmd_buffer);
117       btpa.BindingTablePoolBufferSize = BINDING_TABLE_POOL_BLOCK_SIZE / 4096;
118       btpa.MOCS = mocs;
119    }
120 #else /* GFX_VERx10 < 125 */
121    /* Emit a render target cache flush.
122     *
123     * This isn't documented anywhere in the PRM.  However, it seems to be
124     * necessary prior to changing the surface state base address.  Without
125     * this, we get GPU hangs when using multi-level command buffers which
126     * clear depth, reset state base address, and then go render stuff.
127     */
128    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
129 #if GFX_VER >= 12
130       pc.HDCPipelineFlushEnable = true;
131 #else
132       pc.DCFlushEnable = true;
133 #endif
134       pc.RenderTargetCacheFlushEnable = true;
135       pc.CommandStreamerStallEnable = true;
136       anv_debug_dump_pc(pc);
137    }
138 
139 #if GFX_VERx10 == 120
140    /* Wa_1607854226:
141     *
142     *  Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
143     *  mode by putting the pipeline temporarily in 3D mode.
144     */
145    uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
146    genX(flush_pipeline_select_3d)(cmd_buffer);
147 #endif
148 
149    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
150       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
151       sba.GeneralStateMOCS = mocs;
152       sba.GeneralStateBaseAddressModifyEnable = true;
153 
154       sba.StatelessDataPortAccessMOCS = mocs;
155 
156       sba.SurfaceStateBaseAddress =
157          anv_cmd_buffer_surface_base_address(cmd_buffer);
158       sba.SurfaceStateMOCS = mocs;
159       sba.SurfaceStateBaseAddressModifyEnable = true;
160 
161       sba.DynamicStateBaseAddress =
162          (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
163       sba.DynamicStateMOCS = mocs;
164       sba.DynamicStateBaseAddressModifyEnable = true;
165 
166       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
167       sba.IndirectObjectMOCS = mocs;
168       sba.IndirectObjectBaseAddressModifyEnable = true;
169 
170       sba.InstructionBaseAddress =
171          (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
172       sba.InstructionMOCS = mocs;
173       sba.InstructionBaseAddressModifyEnable = true;
174 
175 #  if (GFX_VER >= 8)
176       /* Broadwell requires that we specify a buffer size for a bunch of
177        * these fields.  However, since we will be growing the BOs live, we
178        * just set them all to the maximum.
179        */
180       sba.GeneralStateBufferSize       = 0xfffff;
181       sba.IndirectObjectBufferSize     = 0xfffff;
182       if (anv_use_relocations(device->physical)) {
183          sba.DynamicStateBufferSize    = 0xfffff;
184          sba.InstructionBufferSize     = 0xfffff;
185       } else {
186          /* With softpin, we use fixed addresses so we actually know how big
187           * our base addresses are.
188           */
189          sba.DynamicStateBufferSize    = DYNAMIC_STATE_POOL_SIZE / 4096;
190          sba.InstructionBufferSize     = INSTRUCTION_STATE_POOL_SIZE / 4096;
191       }
192       sba.GeneralStateBufferSizeModifyEnable    = true;
193       sba.IndirectObjectBufferSizeModifyEnable  = true;
194       sba.DynamicStateBufferSizeModifyEnable    = true;
195       sba.InstructionBuffersizeModifyEnable     = true;
196 #  else
197       /* On gfx7, we have upper bounds instead.  According to the docs,
198        * setting an upper bound of zero means that no bounds checking is
199        * performed so, in theory, we should be able to leave them zero.
200        * However, border color is broken and the GPU bounds-checks anyway.
201        * To avoid this and other potential problems, we may as well set it
202        * for everything.
203        */
204       sba.GeneralStateAccessUpperBound =
205          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
206       sba.GeneralStateAccessUpperBoundModifyEnable = true;
207       sba.DynamicStateAccessUpperBound =
208          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
209       sba.DynamicStateAccessUpperBoundModifyEnable = true;
210       sba.InstructionAccessUpperBound =
211          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
212       sba.InstructionAccessUpperBoundModifyEnable = true;
213 #  endif
214 #  if (GFX_VER >= 9)
215       sba.BindlessSurfaceStateBaseAddress =
216          (struct anv_address) { device->surface_state_pool.block_pool.bo, 0 };
217       sba.BindlessSurfaceStateSize = (1 << 20) - 1;
218       sba.BindlessSurfaceStateMOCS = mocs;
219       sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
220 #  endif
221 #  if (GFX_VER >= 10)
222       sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
223       sba.BindlessSamplerStateMOCS = mocs;
224       sba.BindlessSamplerStateBaseAddressModifyEnable = true;
225       sba.BindlessSamplerStateBufferSize = 0;
226 #  endif
227    }
228 
229 #if GFX_VERx10 == 120
230    /* Wa_1607854226:
231     *
232     *  Put the pipeline back into its current mode.
233     */
234    if (gfx12_wa_pipeline != UINT32_MAX)
235       genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
236 #endif
237 
238 #endif /* GFX_VERx10 < 125 */
239 
240    /* After re-setting the surface state base address, we have to do some
241     * cache flushing so that the sampler engine will pick up the new
242     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
243     * Shared Function > 3D Sampler > State > State Caching (page 96):
244     *
245     *    Coherency with system memory in the state cache, like the texture
246     *    cache is handled partially by software. It is expected that the
247     *    command stream or shader will issue Cache Flush operation or
248     *    Cache_Flush sampler message to ensure that the L1 cache remains
249     *    coherent with system memory.
250     *
251     *    [...]
252     *
253     *    Whenever the value of the Dynamic_State_Base_Addr,
254     *    Surface_State_Base_Addr are altered, the L1 state cache must be
255     *    invalidated to ensure the new surface or sampler state is fetched
256     *    from system memory.
257     *
258     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
259     * which, according the PIPE_CONTROL instruction documentation in the
260     * Broadwell PRM:
261     *
262     *    Setting this bit is independent of any other bit in this packet.
263     *    This bit controls the invalidation of the L1 and L2 state caches
264     *    at the top of the pipe i.e. at the parsing time.
265     *
266     * Unfortunately, experimentation seems to indicate that state cache
267     * invalidation through a PIPE_CONTROL does nothing whatsoever in
268     * regards to surface state and binding tables.  Instead, it seems that
269     * invalidating the texture cache is what is actually needed.
270     *
271     * XXX:  As far as we have been able to determine through
272     * experimentation, flushing the texture cache appears to be
273     * sufficient.  The theory here is that all of the sampling/rendering
274     * units cache the binding table in the texture cache.  However, we have
275     * yet to be able to actually confirm this.
276     *
277     * Wa_14013910100:
278     *
279     *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
280     *   or program pipe control with Instruction cache invalidate post
281     *   STATE_BASE_ADDRESS command"
282     */
283    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
284       pc.TextureCacheInvalidationEnable = true;
285       pc.ConstantCacheInvalidationEnable = true;
286       pc.StateCacheInvalidationEnable = true;
287 #if GFX_VERx10 == 125
288       pc.InstructionCacheInvalidateEnable = true;
289 #endif
290 #if GFX_VER >= 9 && GFX_VER <= 11
291       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
292        *
293        *    "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
294        *     always set for GPGPU workloads when “Texture Cache Invalidation
295        *     Enable” bit is set".
296        *
297        * Workaround stopped appearing in TGL PRMs.
298        */
299       pc.CommandStreamerStallEnable =
300          cmd_buffer->state.current_pipeline == GPGPU;
301 #endif
302       anv_debug_dump_pc(pc);
303    }
304 }
305 
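/* Track a BO referenced by a surface state: with relocations we record a
 * reloc at the surface state's address field, with softpin we only need to
 * track the BO so it ends up in the execbuf.
 */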
306 static void
307 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
308                   struct anv_state state, struct anv_address addr)
309 {
310    VkResult result;
311 
312    if (anv_use_relocations(cmd_buffer->device->physical)) {
313       const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
314       result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
315                                   &cmd_buffer->vk.pool->alloc,
316                                   state.offset + isl_dev->ss.addr_offset,
317                                   addr.bo, addr.offset, NULL);
318    } else {
319       result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
320                                      &cmd_buffer->vk.pool->alloc,
321                                      addr.bo);
322    }
323 
324    if (unlikely(result != VK_SUCCESS))
325       anv_batch_set_error(&cmd_buffer->batch, result);
326 }
327 
328 static void
329 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
330                          struct anv_surface_state state)
331 {
332    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
333 
334    assert(!anv_address_is_null(state.address));
335    add_surface_reloc(cmd_buffer, state.state, state.address);
336 
337    if (!anv_address_is_null(state.aux_address)) {
338       VkResult result =
339          anv_reloc_list_add(&cmd_buffer->surface_relocs,
340                             &cmd_buffer->vk.pool->alloc,
341                             state.state.offset + isl_dev->ss.aux_addr_offset,
342                             state.aux_address.bo,
343                             state.aux_address.offset,
344                             NULL);
345       if (result != VK_SUCCESS)
346          anv_batch_set_error(&cmd_buffer->batch, result);
347    }
348 
349    if (!anv_address_is_null(state.clear_address)) {
350       VkResult result =
351          anv_reloc_list_add(&cmd_buffer->surface_relocs,
352                             &cmd_buffer->vk.pool->alloc,
353                             state.state.offset +
354                             isl_dev->ss.clear_color_state_offset,
355                             state.clear_address.bo,
356                             state.clear_address.offset,
357                             NULL);
358       if (result != VK_SUCCESS)
359          anv_batch_set_error(&cmd_buffer->batch, result);
360    }
361 }
362 
363 static bool
364 isl_color_value_requires_conversion(union isl_color_value color,
365                                     const struct isl_surf *surf,
366                                     const struct isl_view *view)
367 {
368    if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
369       return false;
370 
371    uint32_t surf_pack[4] = { 0, 0, 0, 0 };
372    isl_color_value_pack(&color, surf->format, surf_pack);
373 
374    uint32_t view_pack[4] = { 0, 0, 0, 0 };
375    union isl_color_value swiz_color =
376       isl_color_value_swizzle_inv(color, view->swizzle);
377    isl_color_value_pack(&swiz_color, view->format, view_pack);
378 
379    return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
380 }
381 
382 static bool
383 anv_can_fast_clear_color_view(struct anv_device * device,
384                               struct anv_image_view *iview,
385                               VkImageLayout layout,
386                               union isl_color_value clear_color,
387                               uint32_t num_layers,
388                               VkRect2D render_area)
389 {
390    if (iview->planes[0].isl.base_array_layer >=
391        anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
392                             iview->planes[0].isl.base_level))
393       return false;
394 
395    /* Start by getting the fast clear type.  We use the first subpass
396     * layout here because we don't want to fast-clear if the first subpass
397     * to use the attachment can't handle fast-clears.
398     */
399    enum anv_fast_clear_type fast_clear_type =
400       anv_layout_to_fast_clear_type(&device->info, iview->image,
401                                     VK_IMAGE_ASPECT_COLOR_BIT,
402                                     layout);
403    switch (fast_clear_type) {
404    case ANV_FAST_CLEAR_NONE:
405       return false;
406    case ANV_FAST_CLEAR_DEFAULT_VALUE:
407       if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
408          return false;
409       break;
410    case ANV_FAST_CLEAR_ANY:
411       break;
412    }
413 
414    /* Potentially, we could do partial fast-clears but doing so has crazy
415     * alignment restrictions.  It's easier to just restrict to full size
416     * fast clears for now.
417     */
418    if (render_area.offset.x != 0 ||
419        render_area.offset.y != 0 ||
420        render_area.extent.width != iview->vk.extent.width ||
421        render_area.extent.height != iview->vk.extent.height)
422       return false;
423 
424    /* On Broadwell and earlier, we can only handle 0/1 clear colors */
425    if (GFX_VER <= 8 &&
426        !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
427       return false;
428 
429    /* If the clear color is one that would require non-trivial format
430     * conversion on resolve, we don't bother with the fast clear.  This
431     * shouldn't be common as most clear colors are 0/1 and the most common
432     * format re-interpretation is for sRGB.
433     */
434    if (isl_color_value_requires_conversion(clear_color,
435                                            &iview->image->planes[0].primary_surface.isl,
436                                            &iview->planes[0].isl)) {
437       anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
438                     "Cannot fast-clear to colors which would require "
439                     "format conversion on resolve");
440       return false;
441    }
442 
443    /* We only allow fast clears to the first slice of an image (level 0,
444     * layer 0) and only for the entire slice.  This guarantees us that, at
445     * any given time, there is only one clear color on any given image.
446     * At the time of our testing (Jan 17, 2018), there
447     * were no known applications which would benefit from fast-clearing
448     * more than just the first slice.
449     */
450    if (iview->planes[0].isl.base_level > 0 ||
451        iview->planes[0].isl.base_array_layer > 0) {
452       anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
453                     "Rendering with multi-lod or multi-layer framebuffer "
454                     "with LOAD_OP_LOAD and baseMipLevel > 0 or "
455                     "baseArrayLayer > 0.  Not fast clearing.");
456       return false;
457    }
458 
459    if (num_layers > 1) {
460       anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
461                     "Rendering to a multi-layer framebuffer with "
462                     "LOAD_OP_CLEAR.  Only fast-clearing the first slice");
463    }
464 
465    return true;
466 }
467 
468 static bool
469 anv_can_hiz_clear_ds_view(struct anv_device *device,
470                           const struct anv_image_view *iview,
471                           VkImageLayout layout,
472                           VkImageAspectFlags clear_aspects,
473                           float depth_clear_value,
474                           VkRect2D render_area)
475 {
476    /* We don't do any HiZ or depth fast-clears on gfx7 yet */
477    if (GFX_VER == 7)
478       return false;
479 
480    /* If we're just clearing stencil, we can always HiZ clear */
481    if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
482       return true;
483 
484    /* We must have depth in order to have HiZ */
485    if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
486       return false;
487 
488    const enum isl_aux_usage clear_aux_usage =
489       anv_layout_to_aux_usage(&device->info, iview->image,
490                               VK_IMAGE_ASPECT_DEPTH_BIT,
491                               VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
492                               layout);
493    if (!blorp_can_hiz_clear_depth(&device->info,
494                                   &iview->image->planes[0].primary_surface.isl,
495                                   clear_aux_usage,
496                                   iview->planes[0].isl.base_level,
497                                   iview->planes[0].isl.base_array_layer,
498                                   render_area.offset.x,
499                                   render_area.offset.y,
500                                   render_area.offset.x +
501                                   render_area.extent.width,
502                                   render_area.offset.y +
503                                   render_area.extent.height))
504       return false;
505 
506    if (depth_clear_value != ANV_HZ_FC_VAL)
507       return false;
508 
509    /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
510     * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
511     * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
512     */
513    if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
514       return false;
515 
516    /* If we got here, then we can fast clear */
517    return true;
518 }
519 
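/* Volatile read of x, so the compiler cannot cache or elide the load; used
 * below when reading aux-map entries that are updated out-of-band.
 */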
520 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
521 
522 #if GFX_VER == 12
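/* Update the AUX-TT entries covering the given levels/layers of the image so
 * they carry the right format bits (and the valid bit when CCS is in use).
 * The entries are rewritten 64KB at a time from the command streamer.
 */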
523 static void
524 anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
525                       const struct anv_image *image,
526                       VkImageAspectFlagBits aspect,
527                       uint32_t base_level, uint32_t level_count,
528                       uint32_t base_layer, uint32_t layer_count)
529 {
530    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
531 
532    const struct anv_surface *surface = &image->planes[plane].primary_surface;
533    uint64_t base_address =
534       anv_address_physical(anv_image_address(image, &surface->memory_range));
535 
536    const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
537    uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);
538 
539    /* We're about to live-update the AUX-TT.  We really don't want anyone else
540     * trying to read it while we're doing this.  We could probably get away
541     * with not having this stall in some cases if we were really careful but
542     * it's better to play it safe.  Full stall the GPU.
543     */
544    anv_add_pending_pipe_bits(cmd_buffer,
545                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
546                              "before update AUX-TT");
547    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
548 
549    struct mi_builder b;
550    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
551 
552    for (uint32_t a = 0; a < layer_count; a++) {
553       const uint32_t layer = base_layer + a;
554 
555       uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
556       for (uint32_t l = 0; l < level_count; l++) {
557          const uint32_t level = base_level + l;
558 
559          uint32_t logical_array_layer, logical_z_offset_px;
560          if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
561             logical_array_layer = 0;
562 
563             /* If the given miplevel does not have this layer, then any higher
564              * miplevels won't either because miplevels only get smaller the
565              * higher the LOD.
566              */
567             assert(layer < image->vk.extent.depth);
568             if (layer >= anv_minify(image->vk.extent.depth, level))
569                break;
570             logical_z_offset_px = layer;
571          } else {
572             assert(layer < image->vk.array_layers);
573             logical_array_layer = layer;
574             logical_z_offset_px = 0;
575          }
576 
577          uint64_t slice_start_offset_B, slice_end_offset_B;
578          isl_surf_get_image_range_B_tile(isl_surf, level,
579                                          logical_array_layer,
580                                          logical_z_offset_px,
581                                          &slice_start_offset_B,
582                                          &slice_end_offset_B);
583 
584          start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
585          end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
586       }
587 
588       /* Aux operates 64K at a time */
589       start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
590       end_offset_B = align_u64(end_offset_B, 64 * 1024);
591 
592       for (uint64_t offset = start_offset_B;
593            offset < end_offset_B; offset += 64 * 1024) {
594          uint64_t address = base_address + offset;
595 
596          uint64_t aux_entry_addr64, *aux_entry_map;
597          aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
598                                                  address, &aux_entry_addr64);
599 
600          assert(!anv_use_relocations(cmd_buffer->device->physical));
601          struct anv_address aux_entry_address = {
602             .bo = NULL,
603             .offset = aux_entry_addr64,
604          };
605 
606          const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
607          uint64_t new_aux_entry =
608             (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;
609 
610          if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
611             new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;
612 
613          mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
614       }
615    }
616 
617    anv_add_pending_pipe_bits(cmd_buffer,
618                              ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
619                              "after update AUX-TT");
620 }
621 #endif /* GFX_VER == 12 */
622 
623 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
624  * the initial layout is undefined, the HiZ buffer and depth buffer will
625  * represent the same data at the end of this operation.
626  */
627 static void
628 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
629                         const struct anv_image *image,
630                         uint32_t base_layer, uint32_t layer_count,
631                         VkImageLayout initial_layout,
632                         VkImageLayout final_layout,
633                         bool will_full_fast_clear)
634 {
635    const uint32_t depth_plane =
636       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
637    if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
638       return;
639 
640 #if GFX_VER == 12
641    if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
642         initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
643        cmd_buffer->device->physical->has_implicit_ccs &&
644        cmd_buffer->device->info.has_aux_map) {
645       anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
646                             0, 1, base_layer, layer_count);
647    }
648 #endif
649 
650    /* If will_full_fast_clear is set, the caller promises to fast-clear the
651     * largest portion of the specified range as it can.  For depth images,
652     * that means the entire image because we don't support multi-LOD HiZ.
653     */
654    assert(image->planes[0].primary_surface.isl.levels == 1);
655    if (will_full_fast_clear)
656       return;
657 
658    const enum isl_aux_state initial_state =
659       anv_layout_to_aux_state(&cmd_buffer->device->info, image,
660                               VK_IMAGE_ASPECT_DEPTH_BIT,
661                               initial_layout);
662    const enum isl_aux_state final_state =
663       anv_layout_to_aux_state(&cmd_buffer->device->info, image,
664                               VK_IMAGE_ASPECT_DEPTH_BIT,
665                               final_layout);
666 
667    const bool initial_depth_valid =
668       isl_aux_state_has_valid_primary(initial_state);
669    const bool initial_hiz_valid =
670       isl_aux_state_has_valid_aux(initial_state);
671    const bool final_needs_depth =
672       isl_aux_state_has_valid_primary(final_state);
673    const bool final_needs_hiz =
674       isl_aux_state_has_valid_aux(final_state);
675 
676    /* Getting into the pass-through state for Depth is tricky and involves
677     * both a resolve and an ambiguate.  We don't handle that state right now
678     * as anv_layout_to_aux_state never returns it.
679     */
680    assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
681 
682    if (final_needs_depth && !initial_depth_valid) {
683       assert(initial_hiz_valid);
684       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
685                        0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
686    } else if (final_needs_hiz && !initial_hiz_valid) {
687       assert(initial_depth_valid);
688       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
689                        0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
690    }
691 }
692 
693 #if GFX_VER == 7
694 static inline bool
695 vk_image_layout_stencil_write_optimal(VkImageLayout layout)
696 {
697    return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
698           layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
699           layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL;
700 }
701 #endif
702 
703 /* Transitions a stencil buffer from one layout to another.  On gfx7 this may
704  * involve copying to the texturable shadow surface; on gfx12 it may involve
705  * initializing the stencil compression metadata for an undefined initial layout.
706  */
707 static void
708 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
709                           const struct anv_image *image,
710                           uint32_t base_level, uint32_t level_count,
711                           uint32_t base_layer, uint32_t layer_count,
712                           VkImageLayout initial_layout,
713                           VkImageLayout final_layout,
714                           bool will_full_fast_clear)
715 {
716 #if GFX_VER == 7
717    const uint32_t plane =
718       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
719 
720    /* On gfx7, we have to store a texturable version of the stencil buffer in
721     * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
722     * forth at strategic points. Stencil writes are only allowed in following
723     * layouts:
724     *
725     *  - VK_IMAGE_LAYOUT_GENERAL
726     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
727     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
728     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
729     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
730     *
731     * For general, we have no nice opportunity to transition so we do the copy
732     * to the shadow unconditionally at the end of the subpass. For transfer
733     * destinations, we can update it as part of the transfer op. For the other
734     * layouts, we delay the copy until a transition into some other layout.
735     */
736    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
737        vk_image_layout_stencil_write_optimal(initial_layout) &&
738        !vk_image_layout_stencil_write_optimal(final_layout)) {
739       anv_image_copy_to_shadow(cmd_buffer, image,
740                                VK_IMAGE_ASPECT_STENCIL_BIT,
741                                base_level, level_count,
742                                base_layer, layer_count);
743    }
744 #elif GFX_VER == 12
745    const uint32_t plane =
746       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
747    if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
748       return;
749 
750    if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
751         initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
752        cmd_buffer->device->physical->has_implicit_ccs &&
753        cmd_buffer->device->info.has_aux_map) {
754       anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
755                             base_level, level_count, base_layer, layer_count);
756 
757       /* If will_full_fast_clear is set, the caller promises to fast-clear the
758        * largest portion of the specified range as it can.
759        */
760       if (will_full_fast_clear)
761          return;
762 
763       for (uint32_t l = 0; l < level_count; l++) {
764          const uint32_t level = base_level + l;
765          const VkRect2D clear_rect = {
766             .offset.x = 0,
767             .offset.y = 0,
768             .extent.width = anv_minify(image->vk.extent.width, level),
769             .extent.height = anv_minify(image->vk.extent.height, level),
770          };
771 
772          uint32_t aux_layers =
773             anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
774          uint32_t level_layer_count =
775             MIN2(layer_count, aux_layers - base_layer);
776 
777          /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
778           * Enable:
779           *
780           *    "When enabled, Stencil Buffer needs to be initialized via
781           *    stencil clear (HZ_OP) before any renderpass."
782           */
783          anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
784                              level, base_layer, level_layer_count,
785                              clear_rect, 0 /* Stencil clear value */);
786       }
787    }
788 #endif
789 }
790 
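/* MMIO offsets of the command streamer's MI_PREDICATE source and result
 * registers, used to build the predicates for the conditional resolves below.
 */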
791 #define MI_PREDICATE_SRC0    0x2400
792 #define MI_PREDICATE_SRC1    0x2408
793 #define MI_PREDICATE_RESULT  0x2418
794 
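/* Update the per-(level, layer) compression-tracking dwords for an image.  A
 * non-zero value means the slice may contain compressed data and therefore
 * may need a resolve before certain uses.
 */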
795 static void
796 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
797                          const struct anv_image *image,
798                          VkImageAspectFlagBits aspect,
799                          uint32_t level,
800                          uint32_t base_layer, uint32_t layer_count,
801                          bool compressed)
802 {
803    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
804 
805    /* We only have compression tracking for CCS_E */
806    if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
807       return;
808 
809    for (uint32_t a = 0; a < layer_count; a++) {
810       uint32_t layer = base_layer + a;
811       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
812          sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
813                                                             image, aspect,
814                                                             level, layer);
815          sdi.ImmediateData = compressed ? UINT32_MAX : 0;
816       }
817    }
818 }
819 
820 static void
821 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
822                            const struct anv_image *image,
823                            VkImageAspectFlagBits aspect,
824                            enum anv_fast_clear_type fast_clear)
825 {
826    anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
827       sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
828                                                        image, aspect);
829       sdi.ImmediateData = fast_clear;
830    }
831 
832    /* Whenever we have fast-clear, we consider that slice to be compressed.
833     * This makes building predicates much easier.
834     */
835    if (fast_clear != ANV_FAST_CLEAR_NONE)
836       set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
837 }
838 
839 /* This is only really practical on haswell and above because it requires
840  * MI math in order to get it correct.
841  */
842 #if GFX_VERx10 >= 75
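/* Program MI_PREDICATE from the image's tracked fast-clear/compression state
 * so that the resolve emitted right after this helper only runs when it is
 * actually needed.
 */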
843 static void
844 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
845                                   const struct anv_image *image,
846                                   VkImageAspectFlagBits aspect,
847                                   uint32_t level, uint32_t array_layer,
848                                   enum isl_aux_op resolve_op,
849                                   enum anv_fast_clear_type fast_clear_supported)
850 {
851    struct mi_builder b;
852    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
853 
854    const struct mi_value fast_clear_type =
855       mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
856                                                   image, aspect));
857 
858    if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
859       /* In this case, we're doing a full resolve which means we want the
860        * resolve to happen if any compression (including fast-clears) is
861        * present.
862        *
863        * In order to simplify the logic a bit, we make the assumption that,
864        * if the first slice has been fast-cleared, it is also marked as
865        * compressed.  See also set_image_fast_clear_state.
866        */
867       const struct mi_value compression_state =
868          mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
869                                                        image, aspect,
870                                                        level, array_layer));
871       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
872       mi_store(&b, compression_state, mi_imm(0));
873 
874       if (level == 0 && array_layer == 0) {
875          /* If the predicate is true, we want to write 0 to the fast clear type
876           * and, if it's false, leave it alone.  We can do this by writing
877           *
878           * clear_type = clear_type & ~predicate;
879           */
880          struct mi_value new_fast_clear_type =
881             mi_iand(&b, fast_clear_type,
882                         mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
883          mi_store(&b, fast_clear_type, new_fast_clear_type);
884       }
885    } else if (level == 0 && array_layer == 0) {
886       /* In this case, we are doing a partial resolve to get rid of fast-clear
887        * colors.  We don't care about the compression state but we do care
888        * about how much fast clear is allowed by the final layout.
889        */
890       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
891       assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
892 
893       /* We need to compute (fast_clear_supported < image->fast_clear) */
894       struct mi_value pred =
895          mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
896       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
897 
898       /* If the predicate is true, we want to write 0 to the fast clear type
899        * and, if it's false, leave it alone.  We can do this by writing
900        *
901        * clear_type = clear_type & ~predicate;
902        */
903       struct mi_value new_fast_clear_type =
904          mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
905       mi_store(&b, fast_clear_type, new_fast_clear_type);
906    } else {
907       /* In this case, we're trying to do a partial resolve on a slice that
908        * doesn't have clear color.  There's nothing to do.
909        */
910       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
911       return;
912    }
913 
914    /* Set src1 to 0 and use a != condition */
915    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
916 
917    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
918       mip.LoadOperation    = LOAD_LOADINV;
919       mip.CombineOperation = COMBINE_SET;
920       mip.CompareOperation = COMPARE_SRCS_EQUAL;
921    }
922 }
923 #endif /* GFX_VERx10 >= 75 */
924 
925 #if GFX_VER <= 8
926 static void
927 anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
928                                  const struct anv_image *image,
929                                  VkImageAspectFlagBits aspect,
930                                  uint32_t level, uint32_t array_layer,
931                                  enum isl_aux_op resolve_op,
932                                  enum anv_fast_clear_type fast_clear_supported)
933 {
934    struct mi_builder b;
935    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
936 
937    struct mi_value fast_clear_type_mem =
938       mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
939                                                       image, aspect));
940 
941    /* This only works for partial resolves and only when the clear color is
942     * all or nothing.  On the upside, this emits less command streamer code
943     * and works on Ivybridge and Bay Trail.
944     */
945    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
946    assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
947 
948    /* We don't support fast clears on anything other than the first slice. */
949    if (level > 0 || array_layer > 0)
950       return;
951 
952    /* On gfx8, we don't have a concept of default clear colors because we
953     * can't sample from CCS surfaces.  It's enough to just load the fast clear
954     * state into the predicate register.
955     */
956    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
957    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
958    mi_store(&b, fast_clear_type_mem, mi_imm(0));
959 
960    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
961       mip.LoadOperation    = LOAD_LOADINV;
962       mip.CombineOperation = COMBINE_SET;
963       mip.CompareOperation = COMPARE_SRCS_EQUAL;
964    }
965 }
966 #endif /* GFX_VER <= 8 */
967 
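/* Emit a predicated CCS resolve of a single (level, layer) slice.  The
 * predicate, set up via the helpers above, makes the BLORP resolve a no-op
 * when the tracked state says there is nothing to resolve.
 */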
968 static void
969 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
970                                const struct anv_image *image,
971                                enum isl_format format,
972                                struct isl_swizzle swizzle,
973                                VkImageAspectFlagBits aspect,
974                                uint32_t level, uint32_t array_layer,
975                                enum isl_aux_op resolve_op,
976                                enum anv_fast_clear_type fast_clear_supported)
977 {
978    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
979 
980 #if GFX_VER >= 9
981    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
982                                      aspect, level, array_layer,
983                                      resolve_op, fast_clear_supported);
984 #else /* GFX_VER <= 8 */
985    anv_cmd_simple_resolve_predicate(cmd_buffer, image,
986                                     aspect, level, array_layer,
987                                     resolve_op, fast_clear_supported);
988 #endif
989 
990    /* CCS_D only supports full resolves and BLORP will assert on us if we try
991     * to do a partial resolve on a CCS_D surface.
992     */
993    if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
994        image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
995       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
996 
997    anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
998                     level, array_layer, 1, resolve_op, NULL, true);
999 }
1000 
1001 static void
1002 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
1003                                const struct anv_image *image,
1004                                enum isl_format format,
1005                                struct isl_swizzle swizzle,
1006                                VkImageAspectFlagBits aspect,
1007                                uint32_t array_layer,
1008                                enum isl_aux_op resolve_op,
1009                                enum anv_fast_clear_type fast_clear_supported)
1010 {
1011    assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
1012    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
1013 
1014 #if GFX_VERx10 >= 75
1015    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
1016                                      aspect, 0, array_layer,
1017                                      resolve_op, fast_clear_supported);
1018 
1019    anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
1020                     array_layer, 1, resolve_op, NULL, true);
1021 #else
1022    unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
1023 #endif
1024 }
1025 
1026 void
1027 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
1028                                     const struct anv_image *image,
1029                                     VkImageAspectFlagBits aspect,
1030                                     enum isl_aux_usage aux_usage,
1031                                     uint32_t level,
1032                                     uint32_t base_layer,
1033                                     uint32_t layer_count)
1034 {
1035    /* The aspect must be exactly one of the image aspects. */
1036    assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
1037 
1038    /* The only compression types with more than just fast-clears are MCS,
1039     * CCS_E, and HiZ.  With HiZ we just trust the layout and don't actually
1040     * track the current fast-clear and compression state.  This leaves us
1041     * with just MCS and CCS_E.
1042     */
1043    if (aux_usage != ISL_AUX_USAGE_CCS_E &&
1044        aux_usage != ISL_AUX_USAGE_MCS)
1045       return;
1046 
1047    set_image_compressed_bit(cmd_buffer, image, aspect,
1048                             level, base_layer, layer_count, true);
1049 }
1050 
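/* Reset the driver-side fast-clear tracking for an image to NONE and fill its
 * clear-color slot with values the hardware will accept.
 */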
1051 static void
1052 init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
1053                       const struct anv_image *image,
1054                       VkImageAspectFlagBits aspect)
1055 {
1056    assert(cmd_buffer && image);
1057    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1058 
1059    set_image_fast_clear_state(cmd_buffer, image, aspect,
1060                               ANV_FAST_CLEAR_NONE);
1061 
1062    /* Initialize the struct fields that are accessed for fast-clears so that
1063     * the HW restrictions on the field values are satisfied.
1064     */
1065    struct anv_address addr =
1066       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
1067 
1068    if (GFX_VER >= 9) {
1069       const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1070       const unsigned num_dwords = GFX_VER >= 10 ?
1071                                   isl_dev->ss.clear_color_state_size / 4 :
1072                                   isl_dev->ss.clear_value_size / 4;
1073       for (unsigned i = 0; i < num_dwords; i++) {
1074          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
1075             sdi.Address = addr;
1076             sdi.Address.offset += i * 4;
1077             sdi.ImmediateData = 0;
1078          }
1079       }
1080    } else {
1081       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
1082          sdi.Address = addr;
1083          if (GFX_VERx10 >= 75) {
1084             /* Pre-SKL, the dword containing the clear values also contains
1085              * other fields, so we need to initialize those fields to match the
1086              * values that would be in a color attachment.
1087              */
1088             sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
1089                                 ISL_CHANNEL_SELECT_GREEN << 22 |
1090                                 ISL_CHANNEL_SELECT_BLUE  << 19 |
1091                                 ISL_CHANNEL_SELECT_ALPHA << 16;
1092          } else if (GFX_VER == 7) {
1093             /* On IVB, the dword containing the clear values also contains
1094              * other fields that must be zero or can be zero.
1095              */
1096             sdi.ImmediateData = 0;
1097          }
1098       }
1099    }
1100 }
1101 
1102 /* Copy the fast-clear value dword(s) between a surface state object and an
1103  * image's fast clear state buffer.
1104  */
1105 static void
1106 genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
1107                              struct anv_state surface_state,
1108                              const struct anv_image *image,
1109                              VkImageAspectFlagBits aspect,
1110                              bool copy_from_surface_state)
1111 {
1112    assert(cmd_buffer && image);
1113    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1114 
1115    struct anv_address ss_clear_addr = {
1116       .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
1117       .offset = surface_state.offset +
1118                 cmd_buffer->device->isl_dev.ss.clear_value_offset,
1119    };
1120    const struct anv_address entry_addr =
1121       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
1122    unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
1123 
1124 #if GFX_VER == 7
1125    /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
1126     * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
1127     * in-flight when they are issued even if the memory touched is not
1128     * currently active for rendering.  The weird bit is that it is not the
1129     * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
1130     * rendering hangs such that the next stalling command after the
1131     * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
1132     *
1133     * It is unclear exactly why this hang occurs.  Both MI commands come with
1134     * warnings about the 3D pipeline but that doesn't seem to fully explain
1135     * it.  My (Jason's) best theory is that it has something to do with the
1136     * fact that we're using a GPU state register as our temporary and that
1137     * something with reading/writing it is causing problems.
1138     *
1139     * In order to work around this issue, we emit a PIPE_CONTROL with the
1140     * command streamer stall bit set.
1141     */
1142    anv_add_pending_pipe_bits(cmd_buffer,
1143                              ANV_PIPE_CS_STALL_BIT,
1144                              "after copy_fast_clear_dwords. Avoid potential hang");
1145    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1146 #endif
1147 
1148    struct mi_builder b;
1149    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1150 
1151    if (copy_from_surface_state) {
1152       mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
1153    } else {
1154       mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
1155 
1156       /* Updating a surface state object may require that the state cache be
1157        * invalidated. From the SKL PRM, Shared Functions -> State -> State
1158        * Caching:
1159        *
1160        *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
1161        *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
1162        *    modified [...], the L1 state cache must be invalidated to ensure
1163        *    the new surface or sampler state is fetched from system memory.
1164        *
1165        * In testing, SKL doesn't actually seem to need this, but HSW does.
1166        */
1167       anv_add_pending_pipe_bits(cmd_buffer,
1168                                 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
1169                                 "after copy_fast_clear_dwords surface state update");
1170    }
1171 }
1172 
1173 /**
1174  * @brief Transitions a color buffer from one layout to another.
1175  *
1176  * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
1177  * more information.
1178  *
1179  * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
1180  * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
1181  *                    this represents the maximum layers to transition at each
1182  *                    specified miplevel.
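 * @param will_full_fast_clear When set, the caller promises to fast-clear the
 *                    largest portion of the specified range it can (for color
 *                    images, the first LOD and array slice), so initialization
 *                    and resolves of those slices may be skipped here.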
1183  */
1184 static void
1185 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
1186                         const struct anv_image *image,
1187                         VkImageAspectFlagBits aspect,
1188                         const uint32_t base_level, uint32_t level_count,
1189                         uint32_t base_layer, uint32_t layer_count,
1190                         VkImageLayout initial_layout,
1191                         VkImageLayout final_layout,
1192                         uint64_t src_queue_family,
1193                         uint64_t dst_queue_family,
1194                         bool will_full_fast_clear)
1195 {
1196    struct anv_device *device = cmd_buffer->device;
1197    const struct intel_device_info *devinfo = &device->info;
1198    /* Validate the inputs. */
1199    assert(cmd_buffer);
1200    assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1201    /* These values aren't supported for simplicity's sake. */
1202    assert(level_count != VK_REMAINING_MIP_LEVELS &&
1203           layer_count != VK_REMAINING_ARRAY_LAYERS);
1204    /* Ensure the subresource range is valid. */
1205    UNUSED uint64_t last_level_num = base_level + level_count;
1206    const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level);
1207    UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
1208    assert((uint64_t)base_layer + layer_count <= image_layers);
1209    assert(last_level_num <= image->vk.mip_levels);
1210    /* If there is a layout transition, the final layout cannot be undefined or
1211     * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
1212     */
1213    assert(initial_layout == final_layout ||
1214           (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
1215            final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
1216    const struct isl_drm_modifier_info *isl_mod_info =
1217       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
1218       ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
1219       : NULL;
1220 
1221    const bool src_queue_external =
1222       src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1223       src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1224 
1225    const bool dst_queue_external =
1226       dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1227       dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1228 
1229    /* Simultaneous acquire and release on external queues is illegal. */
1230    assert(!src_queue_external || !dst_queue_external);
1231 
1232    /* Ownership transition on an external queue requires special action if the
1233     * image has a DRM format modifier because we store image data in
1234     * a driver-private bo which is inaccessible to the external queue.
1235     */
1236    const bool private_binding_acquire =
1237       src_queue_external &&
1238       anv_image_is_externally_shared(image) &&
1239       anv_image_has_private_binding(image);
1240 
1241    const bool private_binding_release =
1242       dst_queue_external &&
1243       anv_image_is_externally_shared(image) &&
1244       anv_image_has_private_binding(image);
1245 
1246    if (initial_layout == final_layout &&
1247        !private_binding_acquire && !private_binding_release) {
1248       /* No work is needed. */
1249       return;
1250    }
1251 
1252    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
1253 
1254    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
1255        final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
1256       /* This surface is a linear compressed image with a tiled shadow surface
1257        * for texturing.  The client is about to use it in READ_ONLY_OPTIMAL so
1258        * we need to ensure the shadow copy is up-to-date.
1259        */
1260       assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
1261       assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
1262       assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
1263       assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
1264       assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
1265       assert(plane == 0);
1266       anv_image_copy_to_shadow(cmd_buffer, image,
1267                                VK_IMAGE_ASPECT_COLOR_BIT,
1268                                base_level, level_count,
1269                                base_layer, layer_count);
1270    }
1271 
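   /* If the range starts beyond the layers actually covered by the aux
    * surface at this level, there is no aux state to manage.
    */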
1272    if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
1273       return;
1274 
1275    assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
1276 
1277    /* The following layouts are equivalent for non-linear images. */
1278    const bool initial_layout_undefined =
1279       initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
1280       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
1281 
1282    bool must_init_fast_clear_state = false;
1283    bool must_init_aux_surface = false;
1284 
1285    if (initial_layout_undefined) {
1286       /* The subresource may have been aliased and populated with arbitrary
1287        * data.
1288        */
1289       must_init_fast_clear_state = true;
1290       must_init_aux_surface = true;
1291    } else if (private_binding_acquire) {
1292       /* The fast clear state lives in a driver-private bo, and therefore the
1293        * external/foreign queue is unaware of it.
1294        *
1295        * If this is the first time we are accessing the image, then the fast
1296        * clear state is uninitialized.
1297        *
1298        * If this is NOT the first time we are accessing the image, then the fast
1299        * clear state may still be valid and correct due to the resolve during
1300        * our most recent ownership release.  However, we do not track the aux
1301        * state with MI stores, and therefore must assume the worst-case: that
1302        * this is the first time we are accessing the image.
1303        */
1304       assert(image->planes[plane].fast_clear_memory_range.binding ==
1305               ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1306       must_init_fast_clear_state = true;
1307 
1308       if (image->planes[plane].aux_surface.memory_range.binding ==
1309           ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1310          assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);
1311 
1312          /* The aux surface, like the fast clear state, lives in
1313           * a driver-private bo.  We must initialize the aux surface for the
1314           * same reasons we must initialize the fast clear state.
1315           */
1316          must_init_aux_surface = true;
1317       } else {
1318          assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);
1319 
1320          /* The aux surface, unlike the fast clear state, lives in
1321           * application-visible VkDeviceMemory and is shared with the
1322           * external/foreign queue. Therefore, when we acquire ownership of the
1323           * image with a defined VkImageLayout, the aux surface is valid and has
1324           * the aux state required by the modifier.
1325           */
1326          must_init_aux_surface = false;
1327       }
1328    }
1329 
1330 #if GFX_VER == 12
1331    if (initial_layout_undefined) {
1332       if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
1333          anv_image_init_aux_tt(cmd_buffer, image, aspect,
1334                                base_level, level_count,
1335                                base_layer, layer_count);
1336       }
1337    }
1338 #else
1339    assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
1340 #endif
1341 
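   /* The clear-color entry is stored once per plane and, as the
    * will_full_fast_clear comments below note, only the first LOD / array
    * slice is ever fast-cleared, so the entry only needs initialization when
    * the range covers level 0, layer 0.
    */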
1342    if (must_init_fast_clear_state) {
1343       if (base_level == 0 && base_layer == 0)
1344          init_fast_clear_color(cmd_buffer, image, aspect);
1345    }
1346 
1347    if (must_init_aux_surface) {
1348       assert(must_init_fast_clear_state);
1349 
1350       /* Initialize the aux buffers to enable correct rendering.  In order to
1351        * ensure that things such as storage images work correctly, aux buffers
1352        * need to be initialized to valid data.
1353        *
1354        * Having an aux buffer with invalid data is a problem for two reasons:
1355        *
1356        *  1) Having an invalid value in the buffer can confuse the hardware.
1357        *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1358        *     invalid and leads to the hardware doing strange things.  It
1359        *     doesn't hang as far as we can tell but rendering corruption can
1360        *     occur.
1361        *
1362        *  2) If this transition is into the GENERAL layout and we then use the
1363        *     image as a storage image, then we must have the aux buffer in the
1364        *     pass-through state so that, if we then go to texture from the
1365        *     image, we get the results of our storage image writes and not the
1366        *     fast clear color or other random data.
1367        *
1368        * For CCS both of the problems above are real demonstrable issues.  In
1369        * that case, the only thing we can do is to perform an ambiguate to
1370        * transition the aux surface into the pass-through state.
1371        *
1372        * For MCS, (2) is never an issue because we don't support multisampled
1373        * storage images.  In theory, issue (1) is a problem with MCS but we've
1374        * never seen it in the wild.  For 4x and 16x, all bit patterns could, in
1375        * theory, be interpreted as something but we don't know that all bit
1376        * patterns are actually valid.  For 2x and 8x, you could easily end up
1377        * with the MCS referring to an invalid plane because not all bits of
1378        * the MCS value are actually used.  Even though we've never seen issues
1379        * in the wild, it's best to play it safe and initialize the MCS.  We
1380        * can use a fast-clear for MCS because we only ever touch from render
1381        * and texture (no image load store).
1382        */
1383       if (image->vk.samples == 1) {
1384          for (uint32_t l = 0; l < level_count; l++) {
1385             const uint32_t level = base_level + l;
1386 
1387             uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1388             if (base_layer >= aux_layers)
1389                break; /* We will only get fewer layers as level increases */
1390             uint32_t level_layer_count =
1391                MIN2(layer_count, aux_layers - base_layer);
1392 
1393             /* If will_full_fast_clear is set, the caller promises to
1394              * fast-clear the largest portion of the specified range as it can.
1395              * For color images, that means only the first LOD and array slice.
1396              */
1397             if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1398                base_layer++;
1399                level_layer_count--;
1400                if (level_layer_count == 0)
1401                   continue;
1402             }
1403 
1404             anv_image_ccs_op(cmd_buffer, image,
1405                              image->planes[plane].primary_surface.isl.format,
1406                              ISL_SWIZZLE_IDENTITY,
1407                              aspect, level, base_layer, level_layer_count,
1408                              ISL_AUX_OP_AMBIGUATE, NULL, false);
1409 
1410             if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
1411                set_image_compressed_bit(cmd_buffer, image, aspect,
1412                                         level, base_layer, level_layer_count,
1413                                         false);
1414             }
1415          }
1416       } else {
1417          if (image->vk.samples == 4 || image->vk.samples == 16) {
1418             anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
1419                           "Doing a potentially unnecessary fast-clear to "
1420                           "define an MCS buffer.");
1421          }
1422 
1423          /* If will_full_fast_clear is set, the caller promises to fast-clear
1424           * the largest portion of the specified range as it can.
1425           */
1426          if (will_full_fast_clear)
1427             return;
1428 
1429          assert(base_level == 0 && level_count == 1);
1430          anv_image_mcs_op(cmd_buffer, image,
1431                           image->planes[plane].primary_surface.isl.format,
1432                           ISL_SWIZZLE_IDENTITY,
1433                           aspect, base_layer, layer_count,
1434                           ISL_AUX_OP_FAST_CLEAR, NULL, false);
1435       }
1436       return;
1437    }
1438 
1439    enum isl_aux_usage initial_aux_usage =
1440       anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
1441    enum isl_aux_usage final_aux_usage =
1442       anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
1443    enum anv_fast_clear_type initial_fast_clear =
1444       anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
1445    enum anv_fast_clear_type final_fast_clear =
1446       anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
1447 
1448    /* We must override the anv_layout_to_* functions because they are unaware of
1449     * acquire/release direction.
1450     */
1451    if (private_binding_acquire) {
1452       initial_aux_usage = isl_mod_info->aux_usage;
1453       initial_fast_clear = isl_mod_info->supports_clear_color ?
1454          initial_fast_clear : ANV_FAST_CLEAR_NONE;
1455    } else if (private_binding_release) {
1456       final_aux_usage = isl_mod_info->aux_usage;
1457       final_fast_clear = isl_mod_info->supports_clear_color ?
1458          final_fast_clear : ANV_FAST_CLEAR_NONE;
1459    }
1460 
1461    /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1462     * We can handle transitions between CCS_D/E to and from NONE.  What we
1463     * don't yet handle is switching between CCS_E and CCS_D within a given
1464     * image.  Doing so in a performant way requires more detailed aux state
1465     * tracking such as what is done in i965.  For now, just assume that we
1466     * only have one type of compression.
1467     */
1468    assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1469           final_aux_usage == ISL_AUX_USAGE_NONE ||
1470           initial_aux_usage == final_aux_usage);
1471 
1472    /* If initial aux usage is NONE, there is nothing to resolve */
1473    if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1474       return;
1475 
1476    enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1477 
1478    /* If the initial layout supports more fast clear than the final layout
1479     * then we need at least a partial resolve.
1480     */
1481    if (final_fast_clear < initial_fast_clear)
1482       resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1483 
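   /* Leaving CCS_E for an aux usage that cannot consume compressed data means
    * the main surface must contain the actual pixel values, which requires a
    * full resolve.
    */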
1484    if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
1485        final_aux_usage != ISL_AUX_USAGE_CCS_E)
1486       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
1487 
1488    if (resolve_op == ISL_AUX_OP_NONE)
1489       return;
1490 
1491    /* Perform a resolve to synchronize data between the main and aux buffer.
1492     * Before we begin, we must satisfy the cache flushing requirement specified
1493     * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1494     *
1495     *    Any transition from any value in {Clear, Render, Resolve} to a
1496     *    different value in {Clear, Render, Resolve} requires end of pipe
1497     *    synchronization.
1498     *
1499     * We perform a flush of the write cache before and after the clear and
1500     * resolve operations to meet this requirement.
1501     *
1502     * Unlike other drawing, fast clear operations are not properly
1503     * synchronized. The first PIPE_CONTROL here likely ensures that the
1504     * contents of the previous render or clear hit the render target before we
1505     * resolve and the second likely ensures that the resolve is complete before
1506     * we do any more rendering or clearing.
1507     */
1508    anv_add_pending_pipe_bits(cmd_buffer,
1509                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1510                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1511                              "after transition RT");
1512 
1513    for (uint32_t l = 0; l < level_count; l++) {
1514       uint32_t level = base_level + l;
1515 
1516       uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1517       if (base_layer >= aux_layers)
1518          break; /* We will only get fewer layers as level increases */
1519       uint32_t level_layer_count =
1520          MIN2(layer_count, aux_layers - base_layer);
1521 
1522       for (uint32_t a = 0; a < level_layer_count; a++) {
1523          uint32_t array_layer = base_layer + a;
1524 
1525          /* If will_full_fast_clear is set, the caller promises to fast-clear
1526           * the largest portion of the specified range as it can.  For color
1527           * images, that means only the first LOD and array slice.
1528           */
1529          if (level == 0 && array_layer == 0 && will_full_fast_clear)
1530             continue;
1531 
1532          if (image->vk.samples == 1) {
1533             anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1534                                            image->planes[plane].primary_surface.isl.format,
1535                                            ISL_SWIZZLE_IDENTITY,
1536                                            aspect, level, array_layer, resolve_op,
1537                                            final_fast_clear);
1538          } else {
1539             /* We only support fast-clear on the first layer so partial
1540              * resolves should not be used on other layers as they will use
1541              * the clear color stored in memory that is only valid for layer0.
1542              */
1543             if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1544                 array_layer != 0)
1545                continue;
1546 
1547             anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1548                                            image->planes[plane].primary_surface.isl.format,
1549                                            ISL_SWIZZLE_IDENTITY,
1550                                            aspect, array_layer, resolve_op,
1551                                            final_fast_clear);
1552          }
1553       }
1554    }
1555 
1556    anv_add_pending_pipe_bits(cmd_buffer,
1557                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1558                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1559                              "after transition RT");
1560 }
1561 
1562 static MUST_CHECK VkResult
1563 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1564                                 uint32_t color_att_count)
1565 {
1566    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1567 
1568    /* Reserve one for the NULL state. */
1569    unsigned num_states = 1 + color_att_count;
1570    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1571    const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
1572    gfx->att_states =
1573       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
1574                              num_states * ss_stride, isl_dev->ss.align);
1575    if (gfx->att_states.map == NULL) {
1576       return anv_batch_set_error(&cmd_buffer->batch,
1577                                  VK_ERROR_OUT_OF_DEVICE_MEMORY);
1578    }
1579 
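   /* Walk the allocation, handing out one aligned surface-state slot to the
    * NULL state first and then to each color attachment.
    */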
1580    struct anv_state next_state = gfx->att_states;
1581    next_state.alloc_size = isl_dev->ss.size;
1582 
1583    gfx->null_surface_state = next_state;
1584    next_state.offset += ss_stride;
1585    next_state.map += ss_stride;
1586 
1587    gfx->color_att_count = color_att_count;
1588    for (uint32_t i = 0; i < color_att_count; i++) {
1589       gfx->color_att[i] = (struct anv_attachment) {
1590          .surface_state.state = next_state,
1591       };
1592       next_state.offset += ss_stride;
1593       next_state.map += ss_stride;
1594    }
1595    gfx->depth_att = (struct anv_attachment) { };
1596    gfx->stencil_att = (struct anv_attachment) { };
1597 
1598    return VK_SUCCESS;
1599 }
1600 
1601 static void
1602 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1603 {
1604    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1605 
1606    gfx->render_area = (VkRect2D) { };
1607    gfx->layer_count = 0;
1608    gfx->samples = 0;
1609 
1610    gfx->color_att_count = 0;
1611    gfx->depth_att = (struct anv_attachment) { };
1612    gfx->stencil_att = (struct anv_attachment) { };
1613    gfx->null_surface_state = ANV_STATE_NULL;
1614 }
1615 
1616 VkResult
1617 genX(BeginCommandBuffer)(
1618     VkCommandBuffer                             commandBuffer,
1619     const VkCommandBufferBeginInfo*             pBeginInfo)
1620 {
1621    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1622    VkResult result;
1623 
1624    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1625     * command buffer's state. Otherwise, we must *reset* its state. In both
1626     * cases we reset it.
1627     *
1628     * From the Vulkan 1.0 spec:
1629     *
1630     *    If a command buffer is in the executable state and the command buffer
1631     *    was allocated from a command pool with the
1632     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1633     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
1634     *    as if vkResetCommandBuffer had been called with
1635     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1636     *    the command buffer in the recording state.
1637     */
1638    anv_cmd_buffer_reset(cmd_buffer);
1639    anv_cmd_buffer_reset_rendering(cmd_buffer);
1640 
1641    cmd_buffer->usage_flags = pBeginInfo->flags;
1642 
1643    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1644     * primary level command buffers.
1645     *
1646     * From the Vulkan 1.0 spec:
1647     *
1648     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1649     *    secondary command buffer is considered to be entirely inside a render
1650     *    pass. If this is a primary command buffer, then this bit is ignored.
1651     */
1652    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1653       cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1654 
1655    trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
1656 
1657    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1658 
1659    /* We sometimes store vertex data in the dynamic state buffer for blorp
1660     * operations and our dynamic state stream may re-use data from previous
1661     * command buffers.  In order to prevent stale cache data, we flush the VF
1662     * cache.  We could do this on every blorp call but that's not really
1663     * needed as all of the data will get written by the CPU prior to the GPU
1664     * executing anything.  The chances are fairly high that they will use
1665     * blorp at least once per primary command buffer so it shouldn't be
1666     * wasted.
1667     *
1668     * There is also a workaround on gfx8 which requires us to invalidate the
1669     * VF cache occasionally.  It's easier if we can assume we start with a
1670     * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1671     */
1672    anv_add_pending_pipe_bits(cmd_buffer,
1673                              ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1674                              "new cmd buffer");
1675 
1676    /* Re-emit the aux table register in every command buffer.  This way we
1677     * ensure that we have the table even if this command buffer doesn't
1678     * initialize any images.
1679     */
1680    if (cmd_buffer->device->info.has_aux_map) {
1681       anv_add_pending_pipe_bits(cmd_buffer,
1682                                 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
1683                                 "new cmd buffer with aux-tt");
1684    }
1685 
1686    /* We send an "Indirect State Pointers Disable" packet at
1687     * EndCommandBuffer, so all push constant packets are ignored during a
1688     * context restore. Documentation says after that command, we need to
1689     * emit push constants again before any rendering operation. So we
1690     * flag them dirty here to make sure they get emitted.
1691     */
1692    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1693 
1694    if (cmd_buffer->usage_flags &
1695        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1696       struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1697 
1698       char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
1699       const VkRenderingInfo *resume_info =
1700          vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
1701                                                                pBeginInfo,
1702                                                                gcbiar_data);
1703       if (resume_info != NULL) {
1704          genX(CmdBeginRendering)(commandBuffer, resume_info);
1705       } else {
1706          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
1707             vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
1708                                                              pBeginInfo);
1709          assert(inheritance_info);
1710 
1711          gfx->rendering_flags = inheritance_info->flags;
1712          gfx->render_area = (VkRect2D) { };
1713          gfx->layer_count = 0;
1714          gfx->samples = inheritance_info->rasterizationSamples;
1715          gfx->view_mask = inheritance_info->viewMask;
1716 
1717          uint32_t color_att_count = inheritance_info->colorAttachmentCount;
1718          result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
1719          if (result != VK_SUCCESS)
1720             return result;
1721 
1722          for (uint32_t i = 0; i < color_att_count; i++) {
1723             gfx->color_att[i].vk_format =
1724                inheritance_info->pColorAttachmentFormats[i];
1725          }
1726          gfx->depth_att.vk_format =
1727             inheritance_info->depthAttachmentFormat;
1728          gfx->stencil_att.vk_format =
1729             inheritance_info->stencilAttachmentFormat;
1730 
1731          cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1732       }
1733    }
1734 
1735 #if GFX_VER >= 8
1736    /* Emit the sample pattern at the beginning of the batch because the
1737     * default locations emitted at the device initialization might have been
1738     * changed by a previous command buffer.
1739     *
1740     * Do not change that when we're continuing a previous renderpass.
1741     */
1742    if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
1743        !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1744       genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
1745 #endif
1746 
1747 #if GFX_VERx10 >= 75
1748    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1749       const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1750          vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1751 
1752       /* If the secondary buffer supports conditional rendering, we should
1753        * emit commands as if conditional rendering is enabled.
1754        */
1755       cmd_buffer->state.conditional_render_enabled =
1756          conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1757    }
1758 #endif
1759 
1760    return VK_SUCCESS;
1761 }
1762 
1763 /* From the PRM, Volume 2a:
1764  *
1765  *    "Indirect State Pointers Disable
1766  *
1767  *    At the completion of the post-sync operation associated with this pipe
1768  *    control packet, the indirect state pointers in the hardware are
1769  *    considered invalid; the indirect pointers are not saved in the context.
1770  *    If any new indirect state commands are executed in the command stream
1771  *    while the pipe control is pending, the new indirect state commands are
1772  *    preserved.
1773  *
1774  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1775  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1776  *    commands are only considered as Indirect State Pointers. Once ISP is
1777  *    issued in a context, SW must initialize by programming push constant
1778  *    commands for all the shaders (at least to zero length) before attempting
1779  *    any rendering operation for the same context."
1780  *
1781  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1782  * even though they point to a BO that has been already unreferenced at
1783  * the end of the previous batch buffer. This has been fine so far since
1784  * we are protected by the scratch page (every address not covered by
1785  * a BO should be pointing to the scratch page). But on CNL, it is
1786  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1787  * instruction.
1788  *
1789  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1790  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1791  * context restore, so the mentioned hang doesn't happen. However,
1792  * software must program push constant commands for all stages prior to
1793  * rendering anything. So we flag them dirty in BeginCommandBuffer.
1794  *
1795  * Finally, we also make sure to stall at pixel scoreboard to make sure the
1796  * constants have been loaded into the EUs prior to disabling the push
1797  * constants, so that we don't hang a previous 3DPRIMITIVE.
1798  */
1799 static void
1800 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1801 {
1802    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1803          pc.StallAtPixelScoreboard = true;
1804          pc.CommandStreamerStallEnable = true;
1805          anv_debug_dump_pc(pc);
1806    }
1807    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1808          pc.IndirectStatePointersDisable = true;
1809          pc.CommandStreamerStallEnable = true;
1810          anv_debug_dump_pc(pc);
1811    }
1812 }
1813 
1814 VkResult
1815 genX(EndCommandBuffer)(
1816     VkCommandBuffer                             commandBuffer)
1817 {
1818    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1819 
1820    if (anv_batch_has_error(&cmd_buffer->batch))
1821       return cmd_buffer->batch.status;
1822 
1823    anv_measure_endcommandbuffer(cmd_buffer);
1824 
1825    /* We want every command buffer to start with the PMA fix in a known state,
1826     * so we disable it at the end of the command buffer.
1827     */
1828    genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1829 
1830    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1831 
1832    emit_isp_disable(cmd_buffer);
1833 
1834    trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
1835 
1836    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1837 
1838    return VK_SUCCESS;
1839 }
1840 
1841 void
1842 genX(CmdExecuteCommands)(
1843     VkCommandBuffer                             commandBuffer,
1844     uint32_t                                    commandBufferCount,
1845     const VkCommandBuffer*                      pCmdBuffers)
1846 {
1847    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1848 
1849    assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1850 
1851    if (anv_batch_has_error(&primary->batch))
1852       return;
1853 
1854    /* The secondary command buffers will assume that the PMA fix is disabled
1855     * when they begin executing.  Make sure this is true.
1856     */
1857    genX(cmd_buffer_enable_pma_fix)(primary, false);
1858 
1859    /* The secondary command buffer doesn't know which textures etc. have been
1860     * flushed prior to their execution.  Apply those flushes now.
1861     */
1862    genX(cmd_buffer_apply_pipe_flushes)(primary);
1863 
1864    for (uint32_t i = 0; i < commandBufferCount; i++) {
1865       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1866 
1867       assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1868       assert(!anv_batch_has_error(&secondary->batch));
1869 
1870 #if GFX_VERx10 >= 75
1871       if (secondary->state.conditional_render_enabled) {
1872          if (!primary->state.conditional_render_enabled) {
1873             /* The secondary buffer was constructed as if it would run with
1874              * conditional rendering, so we must satisfy this dependency
1875              * regardless of whether it is enabled in the primary.
1876              */
1877             struct mi_builder b;
1878             mi_builder_init(&b, &primary->device->info, &primary->batch);
1879             mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1880                          mi_imm(UINT64_MAX));
1881          }
1882       }
1883 #endif
1884 
1885       if (secondary->usage_flags &
1886           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1887          /* If we're continuing a render pass from the primary, we need to
1888           * copy the surface states for the current subpass into the storage
1889           * we allocated for them in BeginCommandBuffer.
1890           */
1891          struct anv_bo *ss_bo =
1892             primary->device->surface_state_pool.block_pool.bo;
1893          struct anv_state src_state = primary->state.gfx.att_states;
1894          struct anv_state dst_state = secondary->state.gfx.att_states;
1895          assert(src_state.alloc_size == dst_state.alloc_size);
1896 
1897          genX(cmd_buffer_so_memcpy)(primary,
1898                                     (struct anv_address) {
1899                                        .bo = ss_bo,
1900                                        .offset = dst_state.offset,
1901                                     },
1902                                     (struct anv_address) {
1903                                        .bo = ss_bo,
1904                                        .offset = src_state.offset,
1905                                     },
1906                                     src_state.alloc_size);
1907       }
1908 
1909       anv_cmd_buffer_add_secondary(primary, secondary);
1910 
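      /* A primary and its secondaries may only target a single performance
       * query pool between them (asserted below), so adopt the secondary's
       * pool for the primary.
       */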
1911       assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
1912              secondary->perf_query_pool == primary->perf_query_pool);
1913       if (secondary->perf_query_pool)
1914          primary->perf_query_pool = secondary->perf_query_pool;
1915 
1916 #if GFX_VERx10 == 120
1917       if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
1918          primary->state.depth_reg_mode = secondary->state.depth_reg_mode;
1919 #endif
1920    }
1921 
1922    /* The secondary isn't counted in our VF cache tracking so we need to
1923     * invalidate the whole thing.
1924     */
1925    if (GFX_VER >= 8 && GFX_VER <= 9) {
1926       anv_add_pending_pipe_bits(primary,
1927                                 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1928                                 "Secondary cmd buffer not tracked in VF cache");
1929    }
1930 
1931    /* The secondary may have selected a different pipeline (3D or compute) and
1932     * may have changed the current L3$ configuration.  Reset our tracking
1933     * variables to invalid values to ensure that we re-emit these in the case
1934     * where we do any draws or compute dispatches from the primary after the
1935     * secondary has returned.
1936     */
1937    primary->state.current_pipeline = UINT32_MAX;
1938    primary->state.current_l3_config = NULL;
1939    primary->state.current_hash_scale = 0;
1940    primary->state.gfx.push_constant_stages = 0;
1941    vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
1942 
1943    /* Each of the secondary command buffers will use its own state base
1944     * address.  We need to re-emit state base address for the primary after
1945     * all of the secondaries are done.
1946     *
1947     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
1948     * address calls?
1949     */
1950    genX(cmd_buffer_emit_state_base_address)(primary);
1951 }
1952 
1953 /**
1954  * Program the hardware to use the specified L3 configuration.
1955  */
1956 void
1957 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1958                            const struct intel_l3_config *cfg)
1959 {
1960    assert(cfg || GFX_VER >= 12);
1961    if (cfg == cmd_buffer->state.current_l3_config)
1962       return;
1963 
1964 #if GFX_VER >= 11
1965    /* On Gfx11+ we use only one config, so verify it remains the same and skip
1966     * the stalling programming entirely.
1967     */
1968    assert(cfg == cmd_buffer->device->l3_config);
1969 #else
1970    if (INTEL_DEBUG(DEBUG_L3)) {
1971       mesa_logd("L3 config transition: ");
1972       intel_dump_l3_config(cfg, stderr);
1973    }
1974 
1975    /* According to the hardware docs, the L3 partitioning can only be changed
1976     * while the pipeline is completely drained and the caches are flushed,
1977     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1978     */
1979    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1980       pc.DCFlushEnable = true;
1981       pc.PostSyncOperation = NoWrite;
1982       pc.CommandStreamerStallEnable = true;
1983       anv_debug_dump_pc(pc);
1984    }
1985 
1986    /* ...followed by a second pipelined PIPE_CONTROL that initiates
1987     * invalidation of the relevant caches.  Note that because RO invalidation
1988     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1989     * command is processed by the CS) we cannot combine it with the previous
1990     * stalling flush as the hardware documentation suggests, because that
1991     * would cause the CS to stall on previous rendering *after* RO
1992     * invalidation and wouldn't prevent the RO caches from being polluted by
1993     * concurrent rendering before the stall completes.  This intentionally
1994     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1995     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1996     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1997     * already guarantee that there is no concurrent GPGPU kernel execution
1998     * (see SKL HSD 2132585).
1999     */
2000    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2001       pc.TextureCacheInvalidationEnable = true;
2002       pc.ConstantCacheInvalidationEnable = true;
2003       pc.InstructionCacheInvalidateEnable = true;
2004       pc.StateCacheInvalidationEnable = true;
2005       pc.PostSyncOperation = NoWrite;
2006       anv_debug_dump_pc(pc);
2007    }
2008 
2009    /* Now send a third stalling flush to make sure that invalidation is
2010     * complete when the L3 configuration registers are modified.
2011     */
2012    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2013       pc.DCFlushEnable = true;
2014       pc.PostSyncOperation = NoWrite;
2015       pc.CommandStreamerStallEnable = true;
2016       anv_debug_dump_pc(pc);
2017    }
2018 
2019    genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
2020 #endif /* GFX_VER >= 11 */
2021    cmd_buffer->state.current_l3_config = cfg;
2022 }
2023 
2024 enum anv_pipe_bits
2025 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
2026                               struct anv_device *device,
2027                               uint32_t current_pipeline,
2028                               enum anv_pipe_bits bits)
2029 {
2030    /*
2031     * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
2032     *
2033     *    Write synchronization is a special case of end-of-pipe
2034     *    synchronization that requires that the render cache and/or depth
2035     *    related caches are flushed to memory, where the data will become
2036     *    globally visible. This type of synchronization is required prior to
2037     *    SW (CPU) actually reading the result data from memory, or initiating
2038     *    an operation that will use as a read surface (such as a texture
2039     *    surface) a previous render target and/or depth/stencil buffer
2040     *
2041     *
2042     * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2043     *
2044     *    Exercising the write cache flush bits (Render Target Cache Flush
2045     *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
2046     *    ensures the write caches are flushed and doesn't guarantee the data
2047     *    is globally visible.
2048     *
2049     *    SW can track the completion of the end-of-pipe-synchronization by
2050     *    using "Notify Enable" and "PostSync Operation - Write Immediate
2051     *    Data" in the PIPE_CONTROL command.
2052     *
2053     * In other words, flushes are pipelined while invalidations are handled
2054     * immediately.  Therefore, if we're flushing anything then we need to
2055     * schedule an end-of-pipe sync before any invalidations can happen.
2056     */
2057    if (bits & ANV_PIPE_FLUSH_BITS)
2058       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2059 
2060 
2061    /* HSD 1209978178: docs say that before programming the aux table:
2062     *
2063     *    "Driver must ensure that the engine is IDLE but ensure it doesn't
2064     *    add extra flushes in the case it knows that the engine is already
2065     *    IDLE."
2066     */
2067    if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))
2068       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2069 
2070    /* If we're going to do an invalidate and we have a pending end-of-pipe
2071     * sync that has yet to be resolved, we do the end-of-pipe sync now.
2072     */
2073    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
2074        (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
2075       bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
2076       bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2077    }
2078 
2079    /* Project: SKL / Argument: LRI Post Sync Operation [23]
2080     *
2081     * "PIPECONTROL command with “Command Streamer Stall Enable” must be
2082     *  programmed prior to programming a PIPECONTROL command with "LRI
2083     *  Post Sync Operation" in GPGPU mode of operation (i.e when
2084     *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
2085     *
2086     * The same text exists a few rows below for Post Sync Op.
2087     */
2088    if (bits & ANV_PIPE_POST_SYNC_BIT) {
2089       if (GFX_VER == 9 && current_pipeline == GPGPU)
2090          bits |= ANV_PIPE_CS_STALL_BIT;
2091       bits &= ~ANV_PIPE_POST_SYNC_BIT;
2092    }
2093 
2094    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2095                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
2096       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2097 #if GFX_VER >= 12
2098          pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2099          pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2100 #else
2101          /* Flushing HDC pipeline requires DC Flush on earlier HW. */
2102          pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2103 #endif
2104          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2105          pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2106          pipe.RenderTargetCacheFlushEnable =
2107             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2108 
2109          /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2110           * be set with any PIPE_CONTROL with Depth Flush Enable bit set."
2111           */
2112 #if GFX_VER >= 12
2113          pipe.DepthStallEnable =
2114             pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);
2115 #else
2116          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2117 #endif
2118 
2119 #if GFX_VERx10 >= 125
2120          pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2121 #endif
2122 
2123          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2124 #if GFX_VER == 8
2125          /* From Broadwell PRM, volume 2a:
2126           *    PIPE_CONTROL: Command Streamer Stall Enable:
2127           *
2128           *    "This bit must be always set when PIPE_CONTROL command is
2129           *     programmed by GPGPU and MEDIA workloads, except for the cases
2130           *     when only Read Only Cache Invalidation bits are set (State
2131           *     Cache Invalidation Enable, Instruction cache Invalidation
2132           *     Enable, Texture Cache Invalidation Enable, Constant Cache
2133           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
2134           *     need not implemented when FF_DOP_CG is disabled."
2135           *
2136           *    Since we do all the invalidation in the following PIPE_CONTROL,
2137           *    if we got here, we need a stall.
2138           */
2139          pipe.CommandStreamerStallEnable |= current_pipeline == GPGPU;
2140 #endif
2141 
2142          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2143 
2144          /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
2145           *
2146           *    "The most common action to perform upon reaching a
2147           *    synchronization point is to write a value out to memory. An
2148           *    immediate value (included with the synchronization command) may
2149           *    be written."
2150           *
2151           *
2152           * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
2153           *
2154           *    "In case the data flushed out by the render engine is to be
2155           *    read back in to the render engine in coherent manner, then the
2156           *    render engine has to wait for the fence completion before
2157           *    accessing the flushed data. This can be achieved by following
2158           *    means on various products: PIPE_CONTROL command with CS Stall
2159           *    and the required write caches flushed with Post-Sync-Operation
2160           *    as Write Immediate Data.
2161           *
2162           *    Example:
2163           *       - Workload-1 (3D/GPGPU/MEDIA)
2164           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2165           *         Immediate Data, Required Write Cache Flush bits set)
2166           *       - Workload-2 (Can use the data produce or output by
2167           *         Workload-1)
2168           */
2169          if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
2170             pipe.CommandStreamerStallEnable = true;
2171             pipe.PostSyncOperation = WriteImmediateData;
2172             pipe.Address = device->workaround_address;
2173          }
2174 
2175          /*
2176           * According to the Broadwell documentation, any PIPE_CONTROL with the
2177           * "Command Streamer Stall" bit set must also have at least one of
2178           * the following bits set:
2179           *
2180           *  - Render Target Cache Flush
2181           *  - Depth Cache Flush
2182           *  - Stall at Pixel Scoreboard
2183           *  - Post-Sync Operation
2184           *  - Depth Stall
2185           *  - DC Flush Enable
2186           *
2187           * I chose "Stall at Pixel Scoreboard" since that's what we use in
2188           * mesa and it seems to work fine. The choice is fairly arbitrary.
2189           */
2190          if (pipe.CommandStreamerStallEnable &&
2191              !pipe.RenderTargetCacheFlushEnable &&
2192              !pipe.DepthCacheFlushEnable &&
2193              !pipe.StallAtPixelScoreboard &&
2194              !pipe.PostSyncOperation &&
2195              !pipe.DepthStallEnable &&
2196              !pipe.DCFlushEnable)
2197             pipe.StallAtPixelScoreboard = true;
2198          anv_debug_dump_pc(pipe);
2199       }
2200 
2201       /* If a render target flush was emitted, then we can toggle off the bit
2202        * saying that render target writes are ongoing.
2203        */
2204       if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
2205          bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
2206 
2207       if (GFX_VERx10 == 75) {
2208          /* Haswell needs additional workarounds:
2209           *
2210           * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2211           *
2212           *    Option 1:
2213           *    PIPE_CONTROL command with the CS Stall and the required write
2214           *    caches flushed with Post-SyncOperation as Write Immediate Data
2215           *    followed by eight dummy MI_STORE_DATA_IMM (write to scratch
2216           *    space) commands.
2217           *
2218           *    Example:
2219           *       - Workload-1
2220           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2221           *         Immediate Data, Required Write Cache Flush bits set)
2222           *       - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
2223           *       - Workload-2 (Can use the data produce or output by
2224           *         Workload-1)
2225           *
2226           * Unfortunately, both the PRMs and the internal docs are a bit
2227           * out-of-date in this regard.  What the windows driver does (and
2228           * this appears to actually work) is to emit a register read from the
2229           * memory address written by the pipe control above.
2230           *
2231           * What register we load into doesn't matter.  We choose an indirect
2232           * rendering register because we know it always exists and it's one
2233           * of the first registers the command parser allows us to write.  If
2234           * you don't have command parser support in your kernel (pre-4.2),
2235           * this will get turned into MI_NOOP and you won't get the
2236           * workaround.  Unfortunately, there's just not much we can do in
2237           * that case.  This register is perfectly safe to write since we
2238           * always re-load all of the indirect draw registers right before
2239           * 3DPRIMITIVE when needed anyway.
2240           */
2241          anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2242             lrm.RegisterAddress  = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
2243             lrm.MemoryAddress = device->workaround_address;
2244          }
2245       }
2246 
2247       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2248                 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2249    }
2250 
2251    if (bits & ANV_PIPE_INVALIDATE_BITS) {
2252       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2253        *
2254        *    "If the VF Cache Invalidation Enable is set to a 1 in a
2255        *    PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
2256        *    0, with the VF Cache Invalidation Enable set to 0 needs to be sent
2257        *    prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
2258        *    a 1."
2259        *
2260        * This appears to hang Broadwell, so we restrict it to just gfx9.
2261        */
2262       if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
2263          anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
2264 
2265       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2266          pipe.StateCacheInvalidationEnable =
2267             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2268          pipe.ConstantCacheInvalidationEnable =
2269             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2270 #if GFX_VER >= 12
2271          /* Invalidates the L3 cache part in which index & vertex data is loaded
2272           * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2273           */
2274          pipe.L3ReadOnlyCacheInvalidationEnable =
2275             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2276 #endif
2277          pipe.VFCacheInvalidationEnable =
2278             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2279          pipe.TextureCacheInvalidationEnable =
2280             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2281          pipe.InstructionCacheInvalidateEnable =
2282             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2283 
2284 #if GFX_VER >= 9 && GFX_VER <= 11
2285          /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2286           *
2287           *    "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
2288           *     always set for GPGPU workloads when “Texture Cache
2289           *     Invalidation Enable” bit is set".
2290           *
2291           * Workaround stopped appearing in TGL PRMs.
2292           */
2293          if (current_pipeline == GPGPU && pipe.TextureCacheInvalidationEnable)
2294             pipe.CommandStreamerStallEnable = true;
2295 #endif
2296 
2297          /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2298           *
2299           *    "When VF Cache Invalidate is set “Post Sync Operation” must be
2300           *    enabled to “Write Immediate Data” or “Write PS Depth Count” or
2301           *    “Write Timestamp”.
2302           */
2303          if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {
2304             pipe.PostSyncOperation = WriteImmediateData;
2305             pipe.Address = device->workaround_address;
2306          }
2307          anv_debug_dump_pc(pipe);
2308       }
2309 
2310 #if GFX_VER == 12
2311       if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info.has_aux_map) {
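         /* Writing 1 to GFX_CCS_AUX_INV invalidates the render engine's
          * aux-table cache so that the updated mappings are picked up.
          */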
2312          anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2313             lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
2314             lri.DataDWord = 1;
2315          }
2316       }
2317 #endif
2318 
2319       bits &= ~ANV_PIPE_INVALIDATE_BITS;
2320    }
2321 
2322    return bits;
2323 }
2324 
2325 void
2326 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
2327 {
2328 #if GFX_VERx10 == 120
2329    /* If we're changing the state of the RHWO optimization, we need to have
2330     * sb_stall+cs_stall.
2331     */
2332    const bool rhwo_opt_change =
2333       cmd_buffer->state.rhwo_optimization_enabled !=
2334       cmd_buffer->state.pending_rhwo_optimization_enabled;
2335    if (rhwo_opt_change) {
2336       anv_add_pending_pipe_bits(cmd_buffer,
2337                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
2338                                 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
2339                                 "change RHWO optimization");
2340    }
2341 #endif
2342 
2343    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
2344 
2345    if (unlikely(cmd_buffer->device->physical->always_flush_cache))
2346       bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
2347    else if (bits == 0)
2348       return;
2349 
2350    bool trace_flush =
2351       (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0;
2352    if (trace_flush)
2353       trace_intel_begin_stall(&cmd_buffer->trace);
2354 
2355    if ((GFX_VER >= 8 && GFX_VER <= 9) &&
2356        (bits & ANV_PIPE_CS_STALL_BIT) &&
2357        (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
2358       /* If we are doing a VF cache invalidate AND a CS stall (it must be
2359        * both) then we can reset our vertex cache tracking.
2360        */
2361       memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
2362              sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
2363       memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
2364              sizeof(cmd_buffer->state.gfx.ib_dirty_range));
2365    }
2366 
2367    cmd_buffer->state.pending_pipe_bits =
2368       genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
2369                                     cmd_buffer->device,
2370                                     cmd_buffer->state.current_pipeline,
2371                                     bits);
2372 
2373 #if GFX_VERx10 == 120
2374    /* Wa_1508744258 handling */
2375    if (rhwo_opt_change) {
2376       anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
2377          c1.RCCRHWOOptimizationDisable =
2378             !cmd_buffer->state.pending_rhwo_optimization_enabled;
2379          c1.RCCRHWOOptimizationDisableMask = true;
2380       }
2381       cmd_buffer->state.rhwo_optimization_enabled =
2382          cmd_buffer->state.pending_rhwo_optimization_enabled;
2383    }
2384 #endif
2385 
2386    if (trace_flush) {
2387       trace_intel_end_stall(&cmd_buffer->trace, bits,
2388                             anv_pipe_flush_bit_to_ds_stall_flag, NULL);
2389    }
2390 }
2391 
2392 static void
2393 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
2394                    const VkDependencyInfo *dep_info,
2395                    const char *reason)
2396 {
2397    /* XXX: Right now, we're really dumb and just flush whatever categories
2398     * the app asks for.  One of these days we may make this a bit better
2399     * but right now that's all the hardware allows for in most areas.
2400     */
2401    VkAccessFlags2 src_flags = 0;
2402    VkAccessFlags2 dst_flags = 0;
2403 
2404    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
2405       src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
2406       dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
2407    }
2408 
2409    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
2410       src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
2411       dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
2412    }
2413 
2414    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
2415       const VkImageMemoryBarrier2 *img_barrier =
2416          &dep_info->pImageMemoryBarriers[i];
2417 
2418       src_flags |= img_barrier->srcAccessMask;
2419       dst_flags |= img_barrier->dstAccessMask;
2420 
2421       ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
2422       const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
2423 
2424       uint32_t base_layer, layer_count;
2425       if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
2426          base_layer = 0;
2427          layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel);
2428       } else {
2429          base_layer = range->baseArrayLayer;
2430          layer_count = vk_image_subresource_layer_count(&image->vk, range);
2431       }
2432       const uint32_t level_count =
2433          vk_image_subresource_level_count(&image->vk, range);
2434 
2435       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2436          transition_depth_buffer(cmd_buffer, image,
2437                                  base_layer, layer_count,
2438                                  img_barrier->oldLayout,
2439                                  img_barrier->newLayout,
2440                                  false /* will_full_fast_clear */);
2441       }
2442 
2443       if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2444          transition_stencil_buffer(cmd_buffer, image,
2445                                    range->baseMipLevel, level_count,
2446                                    base_layer, layer_count,
2447                                    img_barrier->oldLayout,
2448                                    img_barrier->newLayout,
2449                                    false /* will_full_fast_clear */);
2450       }
2451 
2452       if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
2453          VkImageAspectFlags color_aspects =
2454             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
2455          anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
2456             transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
2457                                     range->baseMipLevel, level_count,
2458                                     base_layer, layer_count,
2459                                     img_barrier->oldLayout,
2460                                     img_barrier->newLayout,
2461                                     img_barrier->srcQueueFamilyIndex,
2462                                     img_barrier->dstQueueFamilyIndex,
2463                                     false /* will_full_fast_clear */);
2464          }
2465       }
2466    }
2467 
2468    enum anv_pipe_bits bits =
2469       anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
2470       anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
2471 
2472    anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2473 }
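/* Illustrative mapping (a sketch, not an exhaustive list): a barrier with
 * srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT and
 * dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT ends up requesting roughly a
 * render target cache flush on the source side and a texture cache
 * invalidate on the destination side; the precise bit translation lives in
 * anv_pipe_flush_bits_for_access_flags() and
 * anv_pipe_invalidate_bits_for_access_flags() used above.
 */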
2474 
2475 void genX(CmdPipelineBarrier2)(
2476     VkCommandBuffer                             commandBuffer,
2477     const VkDependencyInfo*                     pDependencyInfo)
2478 {
2479    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2480 
2481    cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2482 }
2483 
2484 static void
2485 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2486 {
2487    VkShaderStageFlags stages =
2488       cmd_buffer->state.gfx.pipeline->active_stages;
2489 
2490    /* In order to avoid thrash, we assume that vertex and fragment stages
2491     * always exist.  In the rare case where one is missing *and* the other
2492     * uses push constants, this may be suboptimal.  However, avoiding stalls
2493     * seems more important.
2494     */
2495    stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2496    if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline))
2497       stages |= VK_SHADER_STAGE_VERTEX_BIT;
2498 
2499    if (stages == cmd_buffer->state.gfx.push_constant_stages)
2500       return;
2501 
2502    const unsigned push_constant_kb =
2503       cmd_buffer->device->info.max_constant_urb_size_kb;
2504 
2505    const unsigned num_stages =
2506       util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2507    unsigned size_per_stage = push_constant_kb / num_stages;
2508 
2509    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2510     * units of 2KB.  Incidentally, these are the same platforms that have
2511     * 32KB worth of push constant space.
2512     */
2513    if (push_constant_kb == 32)
2514       size_per_stage &= ~1u;
2515 
2516    uint32_t kb_used = 0;
2517    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2518       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2519       anv_batch_emit(&cmd_buffer->batch,
2520                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
2521          alloc._3DCommandSubOpcode  = 18 + i;
2522          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2523          alloc.ConstantBufferSize   = push_size;
2524       }
2525       kb_used += push_size;
2526    }
2527 
2528    anv_batch_emit(&cmd_buffer->batch,
2529                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2530       alloc.ConstantBufferOffset = kb_used;
2531       alloc.ConstantBufferSize = push_constant_kb - kb_used;
2532    }
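   /* Worked example (assuming a 32 KB constant URB and all five graphics
    * stages active): num_stages = 5 and size_per_stage = 32 / 5 = 6 KB,
    * already a multiple of 2 KB, so VS/HS/DS/GS are each allocated 6 KB at
    * offsets 0/6/12/18 and the PS allocation above receives the remaining
    * 32 - 24 = 8 KB.
    */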
2533 
2534 #if GFX_VERx10 == 125
2535    /* Wa_22011440098
2536     *
2537     * In 3D mode, after programming push constant alloc command immediately
2538     * program push constant command(ZERO length) without any commit between
2539     * them.
2540     */
2541    if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
2542       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
2543          /* Update empty push constants for all stages (bitmask = 11111b) */
2544          c.ShaderUpdateEnable = 0x1f;
2545          c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
2546       }
2547    }
2548 #endif
2549 
2550    cmd_buffer->state.gfx.push_constant_stages = stages;
2551 
2552    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2553     *
2554     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2555     *    the next 3DPRIMITIVE command after programming the
2556     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2557     *
2558     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2559     * pipeline setup, we need to dirty push constants.
2560     */
2561    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2562 }
2563 
2564 static VkResult
2565 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2566                    struct anv_cmd_pipeline_state *pipe_state,
2567                    struct anv_shader_bin *shader,
2568                    struct anv_state *bt_state)
2569 {
2570    uint32_t state_offset;
2571 
2572    struct anv_pipeline_bind_map *map = &shader->bind_map;
2573    if (map->surface_count == 0) {
2574       *bt_state = (struct anv_state) { 0, };
2575       return VK_SUCCESS;
2576    }
2577 
2578    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2579                                                   map->surface_count,
2580                                                   &state_offset);
2581    uint32_t *bt_map = bt_state->map;
2582 
2583    if (bt_state->map == NULL)
2584       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2585 
2586    /* We only need to emit relocs if we're not using softpin.  If we are using
2587     * softpin then we always keep all user-allocated memory objects resident.
2588     */
2589    const bool need_client_mem_relocs =
2590       anv_use_relocations(cmd_buffer->device->physical);
2591    struct anv_push_constants *push = &pipe_state->push_constants;
2592 
2593    for (uint32_t s = 0; s < map->surface_count; s++) {
2594       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2595 
2596       struct anv_state surface_state;
2597 
2598       switch (binding->set) {
2599       case ANV_DESCRIPTOR_SET_NULL:
2600          bt_map[s] = 0;
2601          break;
2602 
2603       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2604          /* Color attachment binding */
2605          assert(shader->stage == MESA_SHADER_FRAGMENT);
2606          if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2607             const struct anv_attachment *att =
2608                &cmd_buffer->state.gfx.color_att[binding->index];
2609             surface_state = att->surface_state.state;
2610          } else {
2611             surface_state = cmd_buffer->state.gfx.null_surface_state;
2612          }
2613          assert(surface_state.map);
2614          bt_map[s] = surface_state.offset + state_offset;
2615          break;
2616 
2617       case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2618          struct anv_state surface_state =
2619             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2620 
2621          struct anv_address constant_data = {
2622             .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2623             .offset = shader->kernel.offset +
2624                       shader->prog_data->const_data_offset,
2625          };
2626          unsigned constant_data_size = shader->prog_data->const_data_size;
2627 
2628          const enum isl_format format =
2629             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2630                                                VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2631          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2632                                        format, ISL_SWIZZLE_IDENTITY,
2633                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2634                                        constant_data, constant_data_size, 1);
2635 
2636          assert(surface_state.map);
2637          bt_map[s] = surface_state.offset + state_offset;
2638          add_surface_reloc(cmd_buffer, surface_state, constant_data);
2639          break;
2640       }
2641 
2642       case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2643          /* This is always the first binding for compute shaders */
2644          assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2645 
2646          struct anv_state surface_state =
2647             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2648 
2649          const enum isl_format format =
2650             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2651                                                VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2652          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2653                                        format, ISL_SWIZZLE_IDENTITY,
2654                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2655                                        cmd_buffer->state.compute.num_workgroups,
2656                                        12, 1);
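         /* The 12-byte size above covers the x/y/z workgroup counts (three
          * 32-bit values) that live at
          * cmd_buffer->state.compute.num_workgroups.
          */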
2657 
2658          assert(surface_state.map);
2659          bt_map[s] = surface_state.offset + state_offset;
2660          if (need_client_mem_relocs) {
2661             add_surface_reloc(cmd_buffer, surface_state,
2662                               cmd_buffer->state.compute.num_workgroups);
2663          }
2664          break;
2665       }
2666 
2667       case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2668          /* This is a descriptor set buffer so the set index is actually
2669           * given by binding->binding.  (Yes, that's confusing.)
2670           */
2671          struct anv_descriptor_set *set =
2672             pipe_state->descriptors[binding->index];
2673          assert(set->desc_mem.alloc_size);
2674          assert(set->desc_surface_state.alloc_size);
2675          bt_map[s] = set->desc_surface_state.offset + state_offset;
2676          add_surface_reloc(cmd_buffer, set->desc_surface_state,
2677                            anv_descriptor_set_address(set));
2678          break;
2679       }
2680 
2681       default: {
2682          assert(binding->set < MAX_SETS);
2683          const struct anv_descriptor_set *set =
2684             pipe_state->descriptors[binding->set];
2685          if (binding->index >= set->descriptor_count) {
2686             /* From the Vulkan spec section entitled "DescriptorSet and
2687              * Binding Assignment":
2688              *
2689              *    "If the array is runtime-sized, then array elements greater
2690              *    than or equal to the size of that binding in the bound
2691              *    descriptor set must not be used."
2692              *
2693              * Unfortunately, the compiler isn't smart enough to figure out
2694              * when a dynamic binding isn't used so it may grab the whole
2695              * array and stick it in the binding table.  In this case, it's
2696              * safe to just skip those bindings that are OOB.
2697              */
2698             assert(binding->index < set->layout->descriptor_count);
2699             continue;
2700          }
2701          const struct anv_descriptor *desc = &set->descriptors[binding->index];
2702 
2703          switch (desc->type) {
2704          case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
2705          case VK_DESCRIPTOR_TYPE_SAMPLER:
2706             /* Nothing for us to do here */
2707             continue;
2708 
2709          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2710          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2711          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
2712             if (desc->image_view) {
2713                struct anv_surface_state sstate =
2714                   (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2715                   desc->image_view->planes[binding->plane].general_sampler_surface_state :
2716                   desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2717                surface_state = sstate.state;
2718                assert(surface_state.alloc_size);
2719                if (need_client_mem_relocs)
2720                   add_surface_state_relocs(cmd_buffer, sstate);
2721             } else {
2722                surface_state = cmd_buffer->device->null_surface_state;
2723             }
2724             break;
2725          }
2726 
2727          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2728             if (desc->image_view) {
2729                struct anv_surface_state sstate =
2730                   binding->lowered_storage_surface
2731                   ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2732                   : desc->image_view->planes[binding->plane].storage_surface_state;
2733                surface_state = sstate.state;
2734                assert(surface_state.alloc_size);
2735                if (surface_state.offset == 0) {
2736                   mesa_loge("Bound an image to a descriptor where the "
2737                             "descriptor does not have NonReadable "
2738                             "set and the image does not have a "
2739                             "corresponding SPIR-V format enum.");
2740                   vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2741                                   VK_DEBUG_REPORT_ERROR_BIT_EXT,
2742                                   &desc->image_view->vk.base,
2743                                   __LINE__, 0, "anv",
2744                                   "Bound an image to a descriptor where the "
2745                                   "descriptor does not have NonReadable "
2746                                   "set and the image does not have a "
2747                                   "corresponding SPIR-V format enum.");
2748                }
2749                if (surface_state.offset && need_client_mem_relocs)
2750                   add_surface_state_relocs(cmd_buffer, sstate);
2751             } else {
2752                surface_state = cmd_buffer->device->null_surface_state;
2753             }
2754             break;
2755          }
2756 
2757          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2758          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2759             if (desc->set_buffer_view) {
2760                surface_state = desc->set_buffer_view->surface_state;
2761                assert(surface_state.alloc_size);
2762                if (need_client_mem_relocs) {
2763                   add_surface_reloc(cmd_buffer, surface_state,
2764                                     desc->set_buffer_view->address);
2765                }
2766             } else {
2767                surface_state = cmd_buffer->device->null_surface_state;
2768             }
2769             break;
2770 
2771          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2772             if (desc->buffer_view) {
2773                surface_state = desc->buffer_view->surface_state;
2774                assert(surface_state.alloc_size);
2775                if (need_client_mem_relocs) {
2776                   add_surface_reloc(cmd_buffer, surface_state,
2777                                     desc->buffer_view->address);
2778                }
2779             } else {
2780                surface_state = cmd_buffer->device->null_surface_state;
2781             }
2782             break;
2783 
2784          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2785          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2786             if (desc->buffer) {
2787                /* Compute the offset within the buffer */
2788                uint32_t dynamic_offset =
2789                   push->dynamic_offsets[binding->dynamic_offset_index];
2790                uint64_t offset = desc->offset + dynamic_offset;
2791                /* Clamp to the buffer size */
2792                offset = MIN2(offset, desc->buffer->vk.size);
2793                /* Clamp the range to the buffer size */
2794                uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
2795 
2796                /* Align the range for consistency */
2797                if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2798                   range = align_u32(range, ANV_UBO_ALIGNMENT);
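               /* Example of the clamping above (hypothetical numbers): for a
                * 4 KB buffer with desc->offset = 3 KB, dynamic_offset = 2 KB
                * and desc->range = 1 KB, offset clamps to 4 KB and range to
                * 0, so an out-of-range dynamic offset degenerates into an
                * empty binding instead of reading past the end of the buffer.
                */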
2799 
2800                struct anv_address address =
2801                   anv_address_add(desc->buffer->address, offset);
2802 
2803                surface_state =
2804                   anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2805                enum isl_format format =
2806                   anv_isl_format_for_descriptor_type(cmd_buffer->device,
2807                                                      desc->type);
2808 
2809                isl_surf_usage_flags_t usage =
2810                   desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2811                   ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2812                   ISL_SURF_USAGE_STORAGE_BIT;
2813 
2814                anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2815                                              format, ISL_SWIZZLE_IDENTITY,
2816                                              usage, address, range, 1);
2817                if (need_client_mem_relocs)
2818                   add_surface_reloc(cmd_buffer, surface_state, address);
2819             } else {
2820                surface_state = cmd_buffer->device->null_surface_state;
2821             }
2822             break;
2823          }
2824 
2825          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2826             if (desc->buffer_view) {
2827                surface_state = binding->lowered_storage_surface
2828                   ? desc->buffer_view->lowered_storage_surface_state
2829                   : desc->buffer_view->storage_surface_state;
2830                assert(surface_state.alloc_size);
2831                if (need_client_mem_relocs) {
2832                   add_surface_reloc(cmd_buffer, surface_state,
2833                                     desc->buffer_view->address);
2834                }
2835             } else {
2836                surface_state = cmd_buffer->device->null_surface_state;
2837             }
2838             break;
2839 
2840          default:
2841             assert(!"Invalid descriptor type");
2842             continue;
2843          }
2844          assert(surface_state.map);
2845          bt_map[s] = surface_state.offset + state_offset;
2846          break;
2847       }
2848       }
2849    }
2850 
2851    return VK_SUCCESS;
2852 }
2853 
2854 static VkResult
2855 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2856               struct anv_cmd_pipeline_state *pipe_state,
2857               struct anv_shader_bin *shader,
2858               struct anv_state *state)
2859 {
2860    struct anv_pipeline_bind_map *map = &shader->bind_map;
2861    if (map->sampler_count == 0) {
2862       *state = (struct anv_state) { 0, };
2863       return VK_SUCCESS;
2864    }
2865 
2866    uint32_t size = map->sampler_count * 16;
2867    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
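   /* Each SAMPLER_STATE entry is 4 dwords (16 bytes), hence the * 16 above;
    * the table as a whole needs 32-byte alignment.
    */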
2868 
2869    if (state->map == NULL)
2870       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2871 
2872    for (uint32_t s = 0; s < map->sampler_count; s++) {
2873       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2874       const struct anv_descriptor *desc =
2875          &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2876 
2877       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2878           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2879          continue;
2880 
2881       struct anv_sampler *sampler = desc->sampler;
2882 
2883       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2884        * happens to be zero.
2885        */
2886       if (sampler == NULL)
2887          continue;
2888 
2889       memcpy(state->map + (s * 16),
2890              sampler->state[binding->plane], sizeof(sampler->state[0]));
2891    }
2892 
2893    return VK_SUCCESS;
2894 }
2895 
2896 static uint32_t
2897 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2898                       struct anv_cmd_pipeline_state *pipe_state,
2899                       const VkShaderStageFlags dirty,
2900                       struct anv_shader_bin **shaders,
2901                       uint32_t num_shaders)
2902 {
2903    VkShaderStageFlags flushed = 0;
2904 
2905    VkResult result = VK_SUCCESS;
2906    for (uint32_t i = 0; i < num_shaders; i++) {
2907       if (!shaders[i])
2908          continue;
2909 
2910       gl_shader_stage stage = shaders[i]->stage;
2911       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2912       if ((vk_stage & dirty) == 0)
2913          continue;
2914 
2915       assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2916       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2917                              &cmd_buffer->state.samplers[stage]);
2918       if (result != VK_SUCCESS)
2919          break;
2920 
2921       assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2922       result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2923                                   &cmd_buffer->state.binding_tables[stage]);
2924       if (result != VK_SUCCESS)
2925          break;
2926 
2927       flushed |= vk_stage;
2928    }
2929 
2930    if (result != VK_SUCCESS) {
2931       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2932 
2933       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2934       if (result != VK_SUCCESS)
2935          return 0;
2936 
2937       /* Re-emit state base addresses so we get the new surface state base
2938        * address before we start emitting binding tables etc.
2939        */
2940       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2941 
2942       /* Re-emit all active binding tables */
2943       flushed = 0;
2944 
2945       for (uint32_t i = 0; i < num_shaders; i++) {
2946          if (!shaders[i])
2947             continue;
2948 
2949          gl_shader_stage stage = shaders[i]->stage;
2950 
2951          result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2952                                 &cmd_buffer->state.samplers[stage]);
2953          if (result != VK_SUCCESS) {
2954             anv_batch_set_error(&cmd_buffer->batch, result);
2955             return 0;
2956          }
2957          result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2958                                      &cmd_buffer->state.binding_tables[stage]);
2959          if (result != VK_SUCCESS) {
2960             anv_batch_set_error(&cmd_buffer->batch, result);
2961             return 0;
2962          }
2963 
2964          flushed |= mesa_to_vk_shader_stage(stage);
2965       }
2966    }
2967 
2968    return flushed;
2969 }
2970 
2971 static void
2972 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2973                                     uint32_t stages)
2974 {
2975    static const uint32_t sampler_state_opcodes[] = {
2976       [MESA_SHADER_VERTEX]                      = 43,
2977       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
2978       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
2979       [MESA_SHADER_GEOMETRY]                    = 46,
2980       [MESA_SHADER_FRAGMENT]                    = 47,
2981    };
2982 
2983    static const uint32_t binding_table_opcodes[] = {
2984       [MESA_SHADER_VERTEX]                      = 38,
2985       [MESA_SHADER_TESS_CTRL]                   = 39,
2986       [MESA_SHADER_TESS_EVAL]                   = 40,
2987       [MESA_SHADER_GEOMETRY]                    = 41,
2988       [MESA_SHADER_FRAGMENT]                    = 42,
2989    };
2990 
2991    anv_foreach_stage(s, stages) {
2992       assert(s < ARRAY_SIZE(binding_table_opcodes));
2993 
2994       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
2995          anv_batch_emit(&cmd_buffer->batch,
2996                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
2997             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
2998             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
2999          }
3000       }
3001 
3002       /* Always emit binding table pointers if we're asked to, since on SKL
3003        * this is what flushes push constants. */
3004       anv_batch_emit(&cmd_buffer->batch,
3005                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
3006          btp._3DCommandSubOpcode = binding_table_opcodes[s];
3007          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
3008       }
3009    }
3010 }
3011 
3012 static struct anv_address
3013 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
3014                        const struct anv_shader_bin *shader,
3015                        const struct anv_push_range *range)
3016 {
3017    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3018    switch (range->set) {
3019    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3020       /* This is a descriptor set buffer so the set index is
3021        * actually given by binding->binding.  (Yes, that's
3022        * confusing.)
3023        */
3024       struct anv_descriptor_set *set =
3025          gfx_state->base.descriptors[range->index];
3026       return anv_descriptor_set_address(set);
3027    }
3028 
3029    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
3030       if (gfx_state->base.push_constants_state.alloc_size == 0) {
3031          gfx_state->base.push_constants_state =
3032             anv_cmd_buffer_gfx_push_constants(cmd_buffer);
3033       }
3034       return (struct anv_address) {
3035          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3036          .offset = gfx_state->base.push_constants_state.offset,
3037       };
3038    }
3039 
3040    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3041       return (struct anv_address) {
3042          .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
3043          .offset = shader->kernel.offset +
3044                    shader->prog_data->const_data_offset,
3045       };
3046 
3047    default: {
3048       assert(range->set < MAX_SETS);
3049       struct anv_descriptor_set *set =
3050          gfx_state->base.descriptors[range->set];
3051       const struct anv_descriptor *desc =
3052          &set->descriptors[range->index];
3053 
3054       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3055          if (desc->buffer_view)
3056             return desc->buffer_view->address;
3057       } else {
3058          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3059          if (desc->buffer) {
3060             const struct anv_push_constants *push =
3061                &gfx_state->base.push_constants;
3062             uint32_t dynamic_offset =
3063                push->dynamic_offsets[range->dynamic_offset_index];
3064             return anv_address_add(desc->buffer->address,
3065                                    desc->offset + dynamic_offset);
3066          }
3067       }
3068 
3069       /* For NULL UBOs, we just return an address in the workaround BO.  We do
3070        * writes to it for workarounds but always at the bottom.  The higher
3071        * bytes should be all zeros.
3072        */
3073       assert(range->length * 32 <= 2048);
3074       return (struct anv_address) {
3075          .bo = cmd_buffer->device->workaround_bo,
3076          .offset = 1024,
3077       };
3078    }
3079    }
3080 }
3081 
3082 
3083 /** Returns the size in bytes of the bound buffer
3084  *
3085  * The range is relative to the start of the buffer, not the start of the
3086  * range.  The returned range may be smaller than
3087  *
3088  *    (range->start + range->length) * 32;
3089  */
3090 static uint32_t
3091 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
3092                           const struct anv_shader_bin *shader,
3093                           const struct anv_push_range *range)
3094 {
3095    assert(shader->stage != MESA_SHADER_COMPUTE);
3096    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3097    switch (range->set) {
3098    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3099       struct anv_descriptor_set *set =
3100          gfx_state->base.descriptors[range->index];
3101       assert(range->start * 32 < set->desc_mem.alloc_size);
3102       assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
3103       return set->desc_mem.alloc_size;
3104    }
3105 
3106    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
3107       return (range->start + range->length) * 32;
3108 
3109    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3110       return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
3111 
3112    default: {
3113       assert(range->set < MAX_SETS);
3114       struct anv_descriptor_set *set =
3115          gfx_state->base.descriptors[range->set];
3116       const struct anv_descriptor *desc =
3117          &set->descriptors[range->index];
3118 
3119       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3120          /* Here we promote a UBO to a binding table entry so that we can
3121           * avoid a layer of indirection.  We use the descriptor set's
3122           * internally allocated surface state to fill the binding table entry. */
3123          if (!desc->set_buffer_view)
3124             return 0;
3125 
3126          if (range->start * 32 > desc->set_buffer_view->range)
3127             return 0;
3128 
3129          return desc->set_buffer_view->range;
3130       } else {
3131          if (!desc->buffer)
3132             return 0;
3133 
3134          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3135          /* Compute the offset within the buffer */
3136          const struct anv_push_constants *push =
3137             &gfx_state->base.push_constants;
3138          uint32_t dynamic_offset =
3139             push->dynamic_offsets[range->dynamic_offset_index];
3140          uint64_t offset = desc->offset + dynamic_offset;
3141          /* Clamp to the buffer size */
3142          offset = MIN2(offset, desc->buffer->vk.size);
3143          /* Clamp the range to the buffer size */
3144          uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
3145 
3146          /* Align the range for consistency */
3147          bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);
3148 
3149          return bound_range;
3150       }
3151    }
3152    }
3153 }
3154 
3155 static void
3156 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
3157                               gl_shader_stage stage,
3158                               struct anv_address *buffers,
3159                               unsigned buffer_count)
3160 {
3161    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3162    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3163 
3164    static const uint32_t push_constant_opcodes[] = {
3165       [MESA_SHADER_VERTEX]                      = 21,
3166       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3167       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3168       [MESA_SHADER_GEOMETRY]                    = 22,
3169       [MESA_SHADER_FRAGMENT]                    = 23,
3170    };
3171 
3172    assert(stage < ARRAY_SIZE(push_constant_opcodes));
3173 
3174    UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
3175 
3176    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
3177       c._3DCommandSubOpcode = push_constant_opcodes[stage];
3178 
3179       /* Set MOCS, except on Gfx8, because the Broadwell PRM says:
3180        *
3181        *    "Constant Buffer Object Control State must be always
3182        *     programmed to zero."
3183        *
3184        * This restriction does not exist on any newer platforms.
3185        *
3186        * We only have one MOCS field for the whole packet, not one per
3187        * buffer.  We could go out of our way here to walk over all of
3188        * the buffers and see if any of them are used externally and use
3189        * the external MOCS.  However, the notion that someone would use
3190        * the same bit of memory for both scanout and a UBO is nuts.
3191        *
3192        * Let's not bother and assume it's all internal.
3193        */
3194 #if GFX_VER >= 9
3195       c.MOCS = mocs;
3196 #elif GFX_VER < 8
3197       c.ConstantBody.MOCS = mocs;
3198 #endif
3199 
3200       if (anv_pipeline_has_stage(pipeline, stage)) {
3201          const struct anv_pipeline_bind_map *bind_map =
3202             &pipeline->shaders[stage]->bind_map;
3203 
3204 #if GFX_VERx10 >= 75
3205          /* The Skylake PRM contains the following restriction:
3206           *
3207           *    "The driver must ensure The following case does not occur
3208           *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3209           *     buffer 3 read length equal to zero committed followed by a
3210           *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3211           *     zero committed."
3212           *
3213           * To avoid this, we program the buffers in the highest slots.
3214           * This way, slot 0 is only used if slot 3 is also used.
3215           */
3216          assert(buffer_count <= 4);
3217          const unsigned shift = 4 - buffer_count;
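         /* E.g. with buffer_count == 2, shift == 2 and the two ranges are
          * programmed into constant buffer slots 2 and 3, leaving slots 0
          * and 1 with a read length of zero, which satisfies the restriction
          * quoted above.
          */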
3218          for (unsigned i = 0; i < buffer_count; i++) {
3219             const struct anv_push_range *range = &bind_map->push_ranges[i];
3220 
3221             /* At this point we only have non-empty ranges */
3222             assert(range->length > 0);
3223 
3224             /* For Ivy Bridge, make sure we only set the first range (actual
3225              * push constants)
3226              */
3227             assert((GFX_VERx10 >= 75) || i == 0);
3228 
3229             c.ConstantBody.ReadLength[i + shift] = range->length;
3230             c.ConstantBody.Buffer[i + shift] =
3231                anv_address_add(buffers[i], range->start * 32);
3232          }
3233 #else
3234          /* For Ivy Bridge, push constants are relative to dynamic state
3235           * base address and we only ever push actual push constants.
3236           */
3237          if (bind_map->push_ranges[0].length > 0) {
3238             assert(buffer_count == 1);
3239             assert(bind_map->push_ranges[0].set ==
3240                    ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
3241             assert(buffers[0].bo ==
3242                    cmd_buffer->device->dynamic_state_pool.block_pool.bo);
3243             c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
3244             c.ConstantBody.Buffer[0].bo = NULL;
3245             c.ConstantBody.Buffer[0].offset = buffers[0].offset;
3246          }
3247          assert(bind_map->push_ranges[1].length == 0);
3248          assert(bind_map->push_ranges[2].length == 0);
3249          assert(bind_map->push_ranges[3].length == 0);
3250 #endif
3251       }
3252    }
3253 }
3254 
3255 #if GFX_VER >= 12
3256 static void
3257 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
3258                                   uint32_t shader_mask,
3259                                   struct anv_address *buffers,
3260                                   uint32_t buffer_count)
3261 {
3262    if (buffer_count == 0) {
3263       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
3264          c.ShaderUpdateEnable = shader_mask;
3265          c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
3266       }
3267       return;
3268    }
3269 
3270    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3271    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3272 
3273    static const UNUSED uint32_t push_constant_opcodes[] = {
3274       [MESA_SHADER_VERTEX]                      = 21,
3275       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3276       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3277       [MESA_SHADER_GEOMETRY]                    = 22,
3278       [MESA_SHADER_FRAGMENT]                    = 23,
3279    };
3280 
3281    gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
3282    assert(stage < ARRAY_SIZE(push_constant_opcodes));
3283 
3284    const struct anv_pipeline_bind_map *bind_map =
3285       &pipeline->shaders[stage]->bind_map;
3286 
3287    uint32_t *dw;
3288    const uint32_t buffer_mask = (1 << buffer_count) - 1;
3289    const uint32_t num_dwords = 2 + 2 * buffer_count;
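   /* 3DSTATE_CONSTANT_ALL is two header dwords followed by a two-dword
    * (pointer + read length) entry per buffer, hence 2 + 2 * buffer_count
    * dwords here and the dw + 2 + i * 2 addressing in the loop below.
    */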
3290 
3291    dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3292                         GENX(3DSTATE_CONSTANT_ALL),
3293                         .ShaderUpdateEnable = shader_mask,
3294                         .PointerBufferMask = buffer_mask,
3295                         .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
3296 
3297    for (int i = 0; i < buffer_count; i++) {
3298       const struct anv_push_range *range = &bind_map->push_ranges[i];
3299       GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
3300          &cmd_buffer->batch, dw + 2 + i * 2,
3301          &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
3302             .PointerToConstantBuffer =
3303                anv_address_add(buffers[i], range->start * 32),
3304             .ConstantBufferReadLength = range->length,
3305          });
3306    }
3307 }
3308 #endif
3309 
3310 static void
3311 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
3312                                 VkShaderStageFlags dirty_stages)
3313 {
3314    VkShaderStageFlags flushed = 0;
3315    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3316    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3317 
3318 #if GFX_VER >= 12
3319    uint32_t nobuffer_stages = 0;
3320 #endif
3321 
3322    /* Compute robust pushed register access mask for each stage. */
3323    if (cmd_buffer->device->robust_buffer_access) {
3324       anv_foreach_stage(stage, dirty_stages) {
3325          if (!anv_pipeline_has_stage(pipeline, stage))
3326             continue;
3327 
3328          const struct anv_shader_bin *shader = pipeline->shaders[stage];
3329          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3330          struct anv_push_constants *push = &gfx_state->base.push_constants;
3331 
3332          push->push_reg_mask[stage] = 0;
3333          /* Start of the current range in the shader, relative to the start of
3334           * push constants in the shader.
3335           */
3336          unsigned range_start_reg = 0;
3337          for (unsigned i = 0; i < 4; i++) {
3338             const struct anv_push_range *range = &bind_map->push_ranges[i];
3339             if (range->length == 0)
3340                continue;
3341 
3342             unsigned bound_size =
3343                get_push_range_bound_size(cmd_buffer, shader, range);
3344             if (bound_size >= range->start * 32) {
3345                unsigned bound_regs =
3346                   MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
3347                        range->length);
3348                assert(range_start_reg + bound_regs <= 64);
3349                push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
3350                                                               bound_regs);
3351             }
3352 
3353             cmd_buffer->state.push_constants_dirty |=
3354                mesa_to_vk_shader_stage(stage);
3355 
3356             range_start_reg += range->length;
3357          }
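         /* At this point push_reg_mask has one bit per pushed 32-byte
          * register that is actually backed by its buffer; with
          * robustBufferAccess enabled this mask is intended to let the
          * compiled shader treat the unbacked registers as zero rather than
          * reading stale data.
          */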
3358       }
3359    }
3360 
3361    /* Resets the push constant state so that we allocate a new one if
3362     * needed.
3363     */
3364    gfx_state->base.push_constants_state = ANV_STATE_NULL;
3365 
3366    anv_foreach_stage(stage, dirty_stages) {
3367       unsigned buffer_count = 0;
3368       flushed |= mesa_to_vk_shader_stage(stage);
3369       UNUSED uint32_t max_push_range = 0;
3370 
3371       struct anv_address buffers[4] = {};
3372       if (anv_pipeline_has_stage(pipeline, stage)) {
3373          const struct anv_shader_bin *shader = pipeline->shaders[stage];
3374          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3375 
3376          /* We have to gather buffer addresses as a second step because the
3377           * loop above puts data into the push constant area and the call to
3378           * get_push_range_address is what locks our push constants and copies
3379           * them into the actual GPU buffer.  If we did the two loops at the
3380           * same time, we'd risk only having some of the sizes in the push
3381           * constant buffer when we did the copy.
3382           */
3383          for (unsigned i = 0; i < 4; i++) {
3384             const struct anv_push_range *range = &bind_map->push_ranges[i];
3385             if (range->length == 0)
3386                break;
3387 
3388             buffers[i] = get_push_range_address(cmd_buffer, shader, range);
3389             max_push_range = MAX2(max_push_range, range->length);
3390             buffer_count++;
3391          }
3392 
3393          /* We have at most 4 buffers but they should be tightly packed */
3394          for (unsigned i = buffer_count; i < 4; i++)
3395             assert(bind_map->push_ranges[i].length == 0);
3396       }
3397 
3398 #if GFX_VER >= 12
3399       /* If this stage doesn't have any push constants, emit it later in a
3400        * single CONSTANT_ALL packet.
3401        */
3402       if (buffer_count == 0) {
3403          nobuffer_stages |= 1 << stage;
3404          continue;
3405       }
3406 
3407       /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
3408        * contains only 5 bits, so we can only use it for buffers smaller than
3409        * 32.
3410        */
3411       if (max_push_range < 32) {
3412          cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
3413                                            buffers, buffer_count);
3414          continue;
3415       }
3416 #endif
3417 
3418       cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
3419    }
3420 
3421 #if GFX_VER >= 12
3422    if (nobuffer_stages)
3423       cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
3424 #endif
3425 
3426    cmd_buffer->state.push_constants_dirty &= ~flushed;
3427 }
3428 
3429 #if GFX_VERx10 >= 125
3430 static void
3431 cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
3432                                   VkShaderStageFlags dirty_stages)
3433 {
3434    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3435    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3436 
3437    if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_NV &&
3438        anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
3439 
3440       const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_TASK];
3441       const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3442 
3443       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
3444          const struct anv_push_range *range = &bind_map->push_ranges[0];
3445          if (range->length > 0) {
3446             struct anv_address buffer =
3447                get_push_range_address(cmd_buffer, shader, range);
3448 
3449             uint64_t addr = anv_address_physical(buffer);
3450             data.InlineData[0] = addr & 0xffffffff;
3451             data.InlineData[1] = addr >> 32;
3452 
3453             memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
3454                    cmd_buffer->state.gfx.base.push_constants.client_data,
3455                    BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
3456          }
3457       }
3458    }
3459 
3460    if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_NV &&
3461        anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
3462 
3463       const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_MESH];
3464       const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3465 
3466       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
3467          const struct anv_push_range *range = &bind_map->push_ranges[0];
3468          if (range->length > 0) {
3469             struct anv_address buffer =
3470                get_push_range_address(cmd_buffer, shader, range);
3471 
3472             uint64_t addr = anv_address_physical(buffer);
3473             data.InlineData[0] = addr & 0xffffffff;
3474             data.InlineData[1] = addr >> 32;
3475 
3476             memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
3477                    cmd_buffer->state.gfx.base.push_constants.client_data,
3478                    BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
3479          }
3480       }
3481    }
3482 
3483    cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
3484 }
3485 #endif
3486 
3487 static void
3488 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
3489 {
3490    const struct vk_dynamic_graphics_state *dyn =
3491       &cmd_buffer->vk.dynamic_graphics_state;
3492 
3493    if (!(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) &&
3494        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) &&
3495 #if GFX_VER <= 7
3496        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) &&
3497        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) &&
3498 #endif
3499        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
3500       return;
3501 
3502    /* Take dynamic primitive topology in to account with
3503     *    3DSTATE_CLIP::ViewportXYClipTestEnable
3504     */
3505    VkPolygonMode dynamic_raster_mode =
3506       genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
3507                                 dyn->ia.primitive_topology);
3508    bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
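   /* Rationale (as understood here): XY clip testing is only safe when
    * polygons are rasterized as solid fills; wide lines and points produced
    * by the other polygon modes can legitimately extend past the viewport
    * rectangle, so screen-space clipping is left to the scissor for them.
    */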
3509 
3510    struct GENX(3DSTATE_CLIP) clip = {
3511       GENX(3DSTATE_CLIP_header),
3512 #if GFX_VER <= 7
3513       .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face],
3514       .CullMode     = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode],
3515 #endif
3516       .ViewportXYClipTestEnable = xy_clip_test_enable,
3517    };
3518    uint32_t dwords[GENX(3DSTATE_CLIP_length)];
3519 
3520    /* TODO(mesh): Multiview. */
3521    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3522    if (anv_pipeline_is_primitive(pipeline)) {
3523       const struct brw_vue_prog_data *last =
3524          anv_pipeline_get_last_vue_prog_data(pipeline);
3525       if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3526          clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ?
3527                                dyn->vp.viewport_count - 1 : 0;
3528       }
3529    } else if (anv_pipeline_is_mesh(pipeline)) {
3530       const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
3531       if (mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
3532          clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ?
3533                                dyn->vp.viewport_count - 1 : 0;
3534       }
3535    }
3536 
3537    GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
3538    anv_batch_emit_merge(&cmd_buffer->batch, dwords,
3539                         pipeline->gfx7.clip);
3540 }
3541 
3542 static void
3543 cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
3544 {
3545    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3546    const struct vk_dynamic_graphics_state *dyn =
3547       &cmd_buffer->vk.dynamic_graphics_state;
3548    uint32_t count = dyn->vp.viewport_count;
3549    const VkViewport *viewports = dyn->vp.viewports;
3550    struct anv_state sf_clip_state =
3551       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
3552 
3553    bool negative_one_to_one =
3554       cmd_buffer->state.gfx.pipeline->negative_one_to_one;
3555 
3556    float scale = negative_one_to_one ? 0.5f : 1.0f;
3557 
3558    for (uint32_t i = 0; i < count; i++) {
3559       const VkViewport *vp = &viewports[i];
3560 
3561       /* The gfx7 state struct has just the matrix and guardband fields, the
3562        * gfx8 struct adds the min/max viewport fields. */
3563       struct GENX(SF_CLIP_VIEWPORT) sfv = {
3564          .ViewportMatrixElementm00 = vp->width / 2,
3565          .ViewportMatrixElementm11 = vp->height / 2,
3566          .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
3567          .ViewportMatrixElementm30 = vp->x + vp->width / 2,
3568          .ViewportMatrixElementm31 = vp->y + vp->height / 2,
3569          .ViewportMatrixElementm32 = negative_one_to_one ?
3570             (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
3571          .XMinClipGuardband = -1.0f,
3572          .XMaxClipGuardband = 1.0f,
3573          .YMinClipGuardband = -1.0f,
3574          .YMaxClipGuardband = 1.0f,
3575 #if GFX_VER >= 8
3576          .XMinViewPort = vp->x,
3577          .XMaxViewPort = vp->x + vp->width - 1,
3578          .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
3579          .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
3580 #endif
3581       };
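      /* Depth mapping sanity check (assuming minDepth = 0.0 and
       * maxDepth = 1.0): in the default zero-to-one clip space m22 = 1.0 and
       * m32 = 0.0, so z passes through unchanged; with
       * VK_EXT_depth_clip_control's negative-one-to-one space m22 = 0.5 and
       * m32 = 0.5, mapping z = -1..1 onto 0..1.
       */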
3582 
3583       const uint32_t fb_size_max = 1 << 14;
3584       uint32_t x_min = 0, x_max = fb_size_max;
3585       uint32_t y_min = 0, y_max = fb_size_max;
3586 
3587       /* If we have a valid renderArea, include that */
3588       if (gfx->render_area.extent.width > 0 &&
3589           gfx->render_area.extent.height > 0) {
3590          x_min = MAX2(x_min, gfx->render_area.offset.x);
3591          x_max = MIN2(x_max, gfx->render_area.offset.x +
3592                              gfx->render_area.extent.width);
3593          y_min = MAX2(y_min, gfx->render_area.offset.y);
3594          y_max = MIN2(y_max, gfx->render_area.offset.y +
3595                              gfx->render_area.extent.height);
3596       }
3597 
3598       /* The client is required to have enough scissors for whatever it sets
3599        * as ViewportIndex but it's possible that they've got more viewports
3600        * set from a previous command.  Also, from the Vulkan 1.3.207:
3601        *
3602        *    "The application must ensure (using scissor if necessary) that
3603        *    all rendering is contained within the render area."
3604        *
3605        * If the client doesn't set a scissor, that basically means it
3606        * guarantees everything is in-bounds already.  If we end up using a
3607        * guardband of [-1, 1] in that case, there shouldn't be much loss.
3608        * It's theoretically possible that they could do all their clipping
3609        * with clip planes but that'd be a bit odd.
3610        */
3611       if (i < dyn->vp.scissor_count) {
3612          const VkRect2D *scissor = &dyn->vp.scissors[i];
3613          x_min = MAX2(x_min, scissor->offset.x);
3614          x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
3615          y_min = MAX2(y_min, scissor->offset.y);
3616          y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
3617       }
3618 
3619       /* Only bother calculating the guardband if our known render area is
3620        * less than the maximum size.  Otherwise, it will calculate [-1, 1]
3621        * anyway but possibly with precision loss.
3622        */
3623       if (x_min > 0 || x_max < fb_size_max ||
3624           y_min > 0 || y_max < fb_size_max) {
3625          intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
3626                                         sfv.ViewportMatrixElementm00,
3627                                         sfv.ViewportMatrixElementm11,
3628                                         sfv.ViewportMatrixElementm30,
3629                                         sfv.ViewportMatrixElementm31,
3630                                         &sfv.XMinClipGuardband,
3631                                         &sfv.XMaxClipGuardband,
3632                                         &sfv.YMinClipGuardband,
3633                                         &sfv.YMaxClipGuardband);
3634       }
3635 
3636       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
3637    }
3638 
3639    anv_batch_emit(&cmd_buffer->batch,
3640                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
3641       clip.SFClipViewportPointer = sf_clip_state.offset;
3642    }
3643 }
3644 
3645 static void
3646 cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
3647                                bool depth_clamp_enable)
3648 {
3649    const struct vk_dynamic_graphics_state *dyn =
3650       &cmd_buffer->vk.dynamic_graphics_state;
3651    uint32_t count = dyn->vp.viewport_count;
3652    const VkViewport *viewports = dyn->vp.viewports;
3653    struct anv_state cc_state =
3654       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
3655 
3656    for (uint32_t i = 0; i < count; i++) {
3657       const VkViewport *vp = &viewports[i];
3658 
3659       /* From the Vulkan spec:
3660        *
3661        *    "It is valid for minDepth to be greater than or equal to
3662        *    maxDepth."
3663        */
3664       float min_depth = MIN2(vp->minDepth, vp->maxDepth);
3665       float max_depth = MAX2(vp->minDepth, vp->maxDepth);
3666 
3667       struct GENX(CC_VIEWPORT) cc_viewport = {
3668          .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
3669          .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
3670       };
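      /* When depth clamping is disabled we still program the full [0,1]
       * range here, which presumably leaves only the standard depth range
       * clamp in effect rather than the viewport's min/max.
       */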
3671 
3672       GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
3673    }
3674 
3675    anv_batch_emit(&cmd_buffer->batch,
3676                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
3677       cc.CCViewportPointer = cc_state.offset;
3678    }
3679 }
3680 
3681 static int64_t
3682 clamp_int64(int64_t x, int64_t min, int64_t max)
3683 {
3684    if (x < min)
3685       return min;
3686    else if (x < max)
3687       return x;
3688    else
3689       return max;
3690 }
3691 
3692 static void
3693 cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
3694 {
3695    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3696    const struct vk_dynamic_graphics_state *dyn =
3697       &cmd_buffer->vk.dynamic_graphics_state;
3698    uint32_t count = dyn->vp.scissor_count;
3699    const VkRect2D *scissors = dyn->vp.scissors;
3700    const VkViewport *viewports = dyn->vp.viewports;
3701 
3702    /* Wa_1409725701:
3703     *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
3704     *    stored as an array of up to 16 elements. The location of first
3705     *    element of the array, as specified by Pointer to SCISSOR_RECT, should
3706     *    be aligned to a 64-byte boundary."
3707     */
3708    uint32_t alignment = 64;
3709    struct anv_state scissor_state =
3710       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
3711 
3712    for (uint32_t i = 0; i < count; i++) {
3713       const VkRect2D *s = &scissors[i];
3714       const VkViewport *vp = &viewports[i];
3715 
3716       /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
3717        * ymax < ymin for empty clips.  In case clip x, y, width, height are all
3718        * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
3719        * what we want. Just special case empty clips and produce a canonical
3720        * empty clip. */
3721       static const struct GENX(SCISSOR_RECT) empty_scissor = {
3722          .ScissorRectangleYMin = 1,
3723          .ScissorRectangleXMin = 1,
3724          .ScissorRectangleYMax = 0,
3725          .ScissorRectangleXMax = 0
3726       };
3727 
3728       const int max = 0xffff;
3729 
3730       uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
3731       uint32_t x_min = MAX2(s->offset.x, vp->x);
3732       uint32_t y_max = MIN2(s->offset.y + s->extent.height - 1,
3733                        MAX2(vp->y, vp->y + vp->height) - 1);
3734       uint32_t x_max = MIN2(s->offset.x + s->extent.width - 1,
3735                        vp->x + vp->width - 1);
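      /* For example, a scissor of offset (10, 10), extent (100, 100) against
       * a viewport at (0, 0) with width/height 64 yields the inclusive rect
       * x:[10, 63], y:[10, 63] here.
       */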
3736 
3737       /* Do this math using int64_t so overflow gets clamped correctly. */
3738       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3739          y_min = clamp_int64((uint64_t) y_min, gfx->render_area.offset.y, max);
3740          x_min = clamp_int64((uint64_t) x_min, gfx->render_area.offset.x, max);
3741          y_max = clamp_int64((uint64_t) y_max, 0,
3742                              gfx->render_area.offset.y +
3743                              gfx->render_area.extent.height - 1);
3744          x_max = clamp_int64((uint64_t) x_max, 0,
3745                              gfx->render_area.offset.x +
3746                              gfx->render_area.extent.width - 1);
3747       }
3748 
3749       struct GENX(SCISSOR_RECT) scissor = {
3750          .ScissorRectangleYMin = y_min,
3751          .ScissorRectangleXMin = x_min,
3752          .ScissorRectangleYMax = y_max,
3753          .ScissorRectangleXMax = x_max
3754       };
3755 
3756       if (s->extent.width <= 0 || s->extent.height <= 0) {
3757          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8,
3758                                  &empty_scissor);
3759       } else {
3760          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
3761       }
3762    }
3763 
3764    anv_batch_emit(&cmd_buffer->batch,
3765                   GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
3766       ssp.ScissorRectPointer = scissor_state.offset;
3767    }
3768 }
3769 
3770 static void
3771 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3772 {
3773    const struct vk_dynamic_graphics_state *dyn =
3774       &cmd_buffer->vk.dynamic_graphics_state;
3775    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3776 
3777 #if GFX_VER == 7
3778 #  define streamout_state_dw pipeline->gfx7.streamout_state
3779 #else
3780 #  define streamout_state_dw pipeline->gfx8.streamout_state
3781 #endif
3782 
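   /* Most of 3DSTATE_STREAMOUT is baked into the pipeline; only the
    * RenderingDisable bit is packed here and then ORed with the pipeline's
    * precomputed DWords by anv_batch_emit_merge() below.
    */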
3783    uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3784 
3785    struct GENX(3DSTATE_STREAMOUT) so = {
3786       GENX(3DSTATE_STREAMOUT_header),
3787       .RenderingDisable = dyn->rs.rasterizer_discard_enable,
3788    };
3789    GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3790    anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3791 }
3792 
3793 void
3794 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
3795 {
3796    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3797    const struct vk_dynamic_graphics_state *dyn =
3798       &cmd_buffer->vk.dynamic_graphics_state;
3799    uint32_t *p;
3800 
3801    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3802 
3803    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3804 
3805    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
3806 
3807    genX(flush_pipeline_select_3d)(cmd_buffer);
3808 
3809    /* Apply any pending pipeline flushes we may have.  We want to apply them
3810     * now because, if any of those flushes are for things like push constants,
3811     * the GPU will read the state at weird times.
3812     */
3813    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3814 
3815    uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3816    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3817       vb_emit |= pipeline->vb_used;
3818 
3819    if (vb_emit) {
3820       const uint32_t num_buffers = __builtin_popcount(vb_emit);
3821       const uint32_t num_dwords = 1 + num_buffers * 4;
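      /* 3DSTATE_VERTEX_BUFFERS is one header DWord followed by a 4-DWord
       * VERTEX_BUFFER_STATE per dirty binding, e.g. three dirty buffers
       * take 1 + 3 * 4 = 13 DWords.
       */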
3822 
3823       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3824                           GENX(3DSTATE_VERTEX_BUFFERS));
3825       uint32_t i = 0;
3826       u_foreach_bit(vb, vb_emit) {
3827          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3828          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3829 
3830          struct GENX(VERTEX_BUFFER_STATE) state;
3831          if (buffer) {
3832             uint32_t stride = dyn->vi_binding_strides[vb];
3833             UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
3834 
3835 #if GFX_VER <= 7
3836             bool per_instance = pipeline->vb[vb].instanced;
3837             uint32_t divisor = pipeline->vb[vb].instance_divisor *
3838                                pipeline->instance_multiplier;
3839 #endif
3840 
3841             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3842                .VertexBufferIndex = vb,
3843 
3844                .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3845                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3846 #if GFX_VER <= 7
3847                .BufferAccessType = per_instance ? INSTANCEDATA : VERTEXDATA,
3848                .InstanceDataStepRate = per_instance ? divisor : 1,
3849 #endif
3850                .AddressModifyEnable = true,
3851                .BufferPitch = stride,
3852                .BufferStartingAddress = anv_address_add(buffer->address, offset),
3853                .NullVertexBuffer = offset >= buffer->vk.size,
3854 #if GFX_VER >= 12
3855                .L3BypassDisable = true,
3856 #endif
3857 
3858 #if GFX_VER >= 8
3859                .BufferSize = size,
3860 #else
3861                /* XXX: to handle dynamic offset for older gens we might want
3862                 * to modify Endaddress, but there are issues when doing so:
3863                 *
3864                 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3865                 */
3866                .EndAddress = anv_address_add(buffer->address, buffer->vk.size - 1),
3867 #endif
3868             };
3869          } else {
3870             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3871                .VertexBufferIndex = vb,
3872                .NullVertexBuffer = true,
3873                .MOCS = anv_mocs(cmd_buffer->device, NULL,
3874                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3875             };
3876          }
3877 
3878 #if GFX_VER >= 8 && GFX_VER <= 9
3879          genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3880                                                         state.BufferStartingAddress,
3881                                                         state.BufferSize);
3882 #endif
3883 
3884          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3885          i++;
3886       }
3887    }
3888 
3889    cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3890 
3891    uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3892                                 pipeline->active_stages;
3893    if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3894        !vk_dynamic_graphics_state_any_dirty(dyn) &&
3895        !cmd_buffer->state.push_constants_dirty)
3896       return;
3897 
3898    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3899        (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3900                          ANV_CMD_DIRTY_PIPELINE))) {
3901       /* Wa_16011411144:
3902        *
3903        * SW must insert a PIPE_CONTROL cmd before and after the
3904        * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
3905        * state is not combined with other state changes.
3906        */
3907       if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
3908          anv_add_pending_pipe_bits(cmd_buffer,
3909                                    ANV_PIPE_CS_STALL_BIT,
3910                                    "before SO_BUFFER change WA");
3911          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3912       }
3913 
3914       /* We don't need any per-buffer dirty tracking because you're not
3915        * allowed to bind different XFB buffers while XFB is enabled.
3916        */
3917       for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3918          struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3919          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3920 #if GFX_VER < 12
3921             sob.SOBufferIndex = idx;
3922 #else
3923             sob._3DCommandOpcode = 0;
3924             sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
3925 #endif
3926 
3927             if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3928                sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0);
3929                sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3930                                                         xfb->offset);
3931 #if GFX_VER >= 8
3932                sob.SOBufferEnable = true;
3933                sob.StreamOffsetWriteEnable = false;
3934                /* Size is in DWords - 1 */
3935                sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
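               /* e.g. xfb->size == 1000 bytes -> 250 DWords -> SurfaceSize
                * of 249.
                */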
3936 #else
3937                /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
3938                 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3939                 * default for an empty SO_BUFFER packet) to disable them.
3940                 */
3941                sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3942                sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3943                                                        xfb->offset + xfb->size);
3944 #endif
3945             } else {
3946                sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
3947             }
3948          }
3949       }
3950 
3951       if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
3952          /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
3953          anv_add_pending_pipe_bits(cmd_buffer,
3954                                    ANV_PIPE_CS_STALL_BIT,
3955                                    "after SO_BUFFER change WA");
3956          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3957       } else if (GFX_VER >= 10) {
3958          /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
3959          anv_add_pending_pipe_bits(cmd_buffer,
3960                                    ANV_PIPE_CS_STALL_BIT,
3961                                    "after 3DSTATE_SO_BUFFER call");
3962       }
3963    }
3964 
3965    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3966       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3967 
3968       /* If the pipeline changed, we may need to re-allocate push constant
3969        * space in the URB.
3970        */
3971       cmd_buffer_alloc_push_constants(cmd_buffer);
3972    }
3973 
3974 #if GFX_VER <= 7
3975    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3976        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3977       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3978        *
3979        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3980        *    stall needs to be sent just prior to any 3DSTATE_VS,
3981        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3982        *    3DSTATE_BINDING_TABLE_POINTER_VS,
3983        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
3984        *    PIPE_CONTROL needs to be sent before any combination of VS
3985        *    associated 3DSTATE."
3986        */
3987       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3988          pc.DepthStallEnable  = true;
3989          pc.PostSyncOperation = WriteImmediateData;
3990          pc.Address           = cmd_buffer->device->workaround_address;
3991          anv_debug_dump_pc(pc);
3992       }
3993    }
3994 #endif
3995 
3996    /* Render targets live in the same binding table as fragment descriptors */
3997    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3998       descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
3999 
4000    /* We emit the binding tables and sampler tables first, then emit push
4001     * constants and then finally emit binding table and sampler table
4002     * pointers.  It has to happen in this order, since emitting the binding
4003     * tables may change the push constants (in case of storage images). After
4004     * emitting push constants, on SKL+ we have to emit the corresponding
4005     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
4006     */
4007    uint32_t dirty = 0;
4008    if (descriptors_dirty) {
4009       dirty = flush_descriptor_sets(cmd_buffer,
4010                                     &cmd_buffer->state.gfx.base,
4011                                     descriptors_dirty,
4012                                     pipeline->shaders,
4013                                     ARRAY_SIZE(pipeline->shaders));
4014       cmd_buffer->state.descriptors_dirty &= ~dirty;
4015    }
4016 
4017    if (dirty || cmd_buffer->state.push_constants_dirty) {
4018       /* Because we're pushing UBOs, we have to push whenever either
4019        * descriptors or push constants is dirty.
4020        */
4021       dirty |= cmd_buffer->state.push_constants_dirty;
4022       cmd_buffer_flush_push_constants(cmd_buffer,
4023                                       dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
4024 #if GFX_VERx10 >= 125
4025       cmd_buffer_flush_mesh_inline_data(
4026          cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_NV |
4027                               VK_SHADER_STAGE_MESH_BIT_NV));
4028 #endif
4029    }
4030 
4031    if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
4032       cmd_buffer_emit_descriptor_pointers(cmd_buffer,
4033                                           dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
4034    }
4035 
4036    cmd_buffer_emit_clip(cmd_buffer);
4037 
4038    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
4039                                        ANV_CMD_DIRTY_XFB_ENABLE)) ||
4040        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
4041       cmd_buffer_emit_streamout(cmd_buffer);
4042 
4043    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
4044                                        ANV_CMD_DIRTY_RENDER_TARGETS)) ||
4045        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
4046        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
4047       cmd_buffer_emit_viewport(cmd_buffer);
4048       cmd_buffer_emit_depth_viewport(cmd_buffer,
4049                                      pipeline->depth_clamp_enable);
4050       cmd_buffer_emit_scissor(cmd_buffer);
4051    }
4052 
4053    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
4054        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
4055       uint32_t topology;
4056       if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
4057          topology = _3DPRIM_PATCHLIST(pipeline->patch_control_points);
4058       else
4059          topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
4060 
4061       cmd_buffer->state.gfx.primitive_topology = topology;
4062 
4063 #if (GFX_VER >= 8)
4064       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
4065          vft.PrimitiveTopologyType = topology;
4066       }
4067 #endif
4068    }
4069 
4070    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
4071 }
4072 
4073 static void
4074 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
4075                struct anv_address addr,
4076                uint32_t size, uint32_t index)
4077 {
4078    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
4079                                  GENX(3DSTATE_VERTEX_BUFFERS));
4080 
4081    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
4082       &(struct GENX(VERTEX_BUFFER_STATE)) {
4083          .VertexBufferIndex = index,
4084          .AddressModifyEnable = true,
4085          .BufferPitch = 0,
4086          .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
4087                           ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
4088          .NullVertexBuffer = size == 0,
4089 #if GFX_VER >= 12
4090          .L3BypassDisable = true,
4091 #endif
4092 #if (GFX_VER >= 8)
4093          .BufferStartingAddress = addr,
4094          .BufferSize = size
4095 #else
4096          .BufferStartingAddress = addr,
4097          .EndAddress = anv_address_add(addr, size),
4098 #endif
4099       });
4100 
4101    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
4102                                                   index, addr, size);
4103 }
4104 
4105 static void
4106 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
4107                              struct anv_address addr)
4108 {
4109    emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
4110 }
4111 
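/* The base vertex/instance values are made visible to the VS by binding a
 * tiny two-dword buffer at ANV_SVGS_VB_INDEX; the shader fetches them as
 * extra vertex attributes when uses_firstvertex/uses_baseinstance are set,
 * which is how gl_BaseVertex/gl_BaseInstance style values are sourced on
 * this path.
 */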
4112 static void
4113 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
4114                           uint32_t base_vertex, uint32_t base_instance)
4115 {
4116    if (base_vertex == 0 && base_instance == 0) {
4117       emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
4118    } else {
4119       struct anv_state id_state =
4120          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
4121 
4122       ((uint32_t *)id_state.map)[0] = base_vertex;
4123       ((uint32_t *)id_state.map)[1] = base_instance;
4124 
4125       struct anv_address addr = {
4126          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4127          .offset = id_state.offset,
4128       };
4129 
4130       emit_base_vertex_instance_bo(cmd_buffer, addr);
4131    }
4132 }
4133 
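/* Similarly, the draw index (gl_DrawID for multi-draw) is delivered through
 * a one-dword vertex buffer at ANV_DRAWID_VB_INDEX when uses_drawid is set.
 */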
4134 static void
4135 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
4136 {
4137    struct anv_state state =
4138       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
4139 
4140    ((uint32_t *)state.map)[0] = draw_index;
4141 
4142    struct anv_address addr = {
4143       .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4144       .offset = state.offset,
4145    };
4146 
4147    emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
4148 }
4149 
4150 static void
4151 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
4152                                    uint32_t access_type)
4153 {
4154    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4155    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4156 
4157    uint64_t vb_used = pipeline->vb_used;
4158    if (vs_prog_data->uses_firstvertex ||
4159        vs_prog_data->uses_baseinstance)
4160       vb_used |= 1ull << ANV_SVGS_VB_INDEX;
4161    if (vs_prog_data->uses_drawid)
4162       vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
4163 
4164    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
4165                                                        access_type == RANDOM,
4166                                                        vb_used);
4167 }
4168 
4169 ALWAYS_INLINE static void
4170 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
4171                                            const struct brw_vs_prog_data *vs_prog_data,
4172                                            uint32_t base_vertex,
4173                                            uint32_t base_instance,
4174                                            uint32_t draw_id,
4175                                            bool force_flush)
4176 {
4177    bool emitted = false;
4178    if (vs_prog_data->uses_firstvertex ||
4179        vs_prog_data->uses_baseinstance) {
4180       emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
4181       emitted = true;
4182    }
4183    if (vs_prog_data->uses_drawid) {
4184       emit_draw_index(cmd_buffer, draw_id);
4185       emitted = true;
4186    }
4187    /* Emitting draw index or vertex index BOs may result in needing
4188     * additional VF cache flushes.
4189     */
4190    if (emitted || force_flush)
4191       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4192 }
4193 
4194 void genX(CmdDraw)(
4195     VkCommandBuffer                             commandBuffer,
4196     uint32_t                                    vertexCount,
4197     uint32_t                                    instanceCount,
4198     uint32_t                                    firstVertex,
4199     uint32_t                                    firstInstance)
4200 {
4201    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4202    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4203    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4204 
4205    if (anv_batch_has_error(&cmd_buffer->batch))
4206       return;
4207 
4208    const uint32_t count =
4209       vertexCount * instanceCount * pipeline->instance_multiplier;
4210    anv_measure_snapshot(cmd_buffer,
4211                         INTEL_SNAPSHOT_DRAW,
4212                         "draw", count);
4213    trace_intel_begin_draw(&cmd_buffer->trace);
4214 
4215    genX(cmd_buffer_flush_state)(cmd_buffer);
4216 
4217    if (cmd_buffer->state.conditional_render_enabled)
4218       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4219 
4220    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4221                                               firstVertex, firstInstance, 0,
4222                                               true);
4223 
4224    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4225       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4226       prim.VertexAccessType         = SEQUENTIAL;
4227       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4228       prim.VertexCountPerInstance   = vertexCount;
4229       prim.StartVertexLocation      = firstVertex;
4230       prim.InstanceCount            = instanceCount *
4231                                       pipeline->instance_multiplier;
4232       prim.StartInstanceLocation    = firstInstance;
4233       prim.BaseVertexLocation       = 0;
4234    }
4235 
4236    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4237 
4238    trace_intel_end_draw(&cmd_buffer->trace, count);
4239 }
4240 
4241 void genX(CmdDrawMultiEXT)(
4242     VkCommandBuffer                             commandBuffer,
4243     uint32_t                                    drawCount,
4244     const VkMultiDrawInfoEXT                   *pVertexInfo,
4245     uint32_t                                    instanceCount,
4246     uint32_t                                    firstInstance,
4247     uint32_t                                    stride)
4248 {
4249    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4250    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4251    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4252 
4253    if (anv_batch_has_error(&cmd_buffer->batch))
4254       return;
4255 
4256    const uint32_t count =
4257       drawCount * instanceCount * pipeline->instance_multiplier;
4258    anv_measure_snapshot(cmd_buffer,
4259                         INTEL_SNAPSHOT_DRAW,
4260                         "draw_multi", count);
4261    trace_intel_begin_draw_multi(&cmd_buffer->trace);
4262 
4263    genX(cmd_buffer_flush_state)(cmd_buffer);
4264 
4265    if (cmd_buffer->state.conditional_render_enabled)
4266       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4267 
4268    uint32_t i = 0;
4269    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
4270       cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4271                                                  draw->firstVertex,
4272                                                  firstInstance, i, !i);
4273 
4274       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4275          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4276          prim.VertexAccessType         = SEQUENTIAL;
4277          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4278          prim.VertexCountPerInstance   = draw->vertexCount;
4279          prim.StartVertexLocation      = draw->firstVertex;
4280          prim.InstanceCount            = instanceCount *
4281                                          pipeline->instance_multiplier;
4282          prim.StartInstanceLocation    = firstInstance;
4283          prim.BaseVertexLocation       = 0;
4284       }
4285    }
4286 
4287    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4288 
4289    trace_intel_end_draw_multi(&cmd_buffer->trace, count);
4290 }
4291 
4292 void genX(CmdDrawIndexed)(
4293     VkCommandBuffer                             commandBuffer,
4294     uint32_t                                    indexCount,
4295     uint32_t                                    instanceCount,
4296     uint32_t                                    firstIndex,
4297     int32_t                                     vertexOffset,
4298     uint32_t                                    firstInstance)
4299 {
4300    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4301    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4302    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4303 
4304    if (anv_batch_has_error(&cmd_buffer->batch))
4305       return;
4306 
4307    const uint32_t count =
4308       indexCount * instanceCount * pipeline->instance_multiplier;
4309    anv_measure_snapshot(cmd_buffer,
4310                         INTEL_SNAPSHOT_DRAW,
4311                         "draw indexed",
4312                         count);
4313    trace_intel_begin_draw_indexed(&cmd_buffer->trace);
4314 
4315    genX(cmd_buffer_flush_state)(cmd_buffer);
4316 
4317    if (cmd_buffer->state.conditional_render_enabled)
4318       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4319 
4320    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);
4321 
4322    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4323       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4324       prim.VertexAccessType         = RANDOM;
4325       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4326       prim.VertexCountPerInstance   = indexCount;
4327       prim.StartVertexLocation      = firstIndex;
4328       prim.InstanceCount            = instanceCount *
4329                                       pipeline->instance_multiplier;
4330       prim.StartInstanceLocation    = firstInstance;
4331       prim.BaseVertexLocation       = vertexOffset;
4332    }
4333 
4334    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4335 
4336    trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
4337 }
4338 
4339 void genX(CmdDrawMultiIndexedEXT)(
4340     VkCommandBuffer                             commandBuffer,
4341     uint32_t                                    drawCount,
4342     const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
4343     uint32_t                                    instanceCount,
4344     uint32_t                                    firstInstance,
4345     uint32_t                                    stride,
4346     const int32_t                              *pVertexOffset)
4347 {
4348    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4349    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4350    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4351 
4352    if (anv_batch_has_error(&cmd_buffer->batch))
4353       return;
4354 
4355    const uint32_t count =
4356       drawCount * instanceCount * pipeline->instance_multiplier;
4357    anv_measure_snapshot(cmd_buffer,
4358                         INTEL_SNAPSHOT_DRAW,
4359                         "draw indexed_multi",
4360                         count);
4361    trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
4362 
4363    genX(cmd_buffer_flush_state)(cmd_buffer);
4364 
4365    if (cmd_buffer->state.conditional_render_enabled)
4366       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4367 
4368    uint32_t i = 0;
4369    if (pVertexOffset) {
4370       if (vs_prog_data->uses_drawid) {
4371          bool emitted = true;
4372          if (vs_prog_data->uses_firstvertex ||
4373              vs_prog_data->uses_baseinstance) {
4374             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4375             emitted = true;
4376          }
4377          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4378             if (vs_prog_data->uses_drawid) {
4379                emit_draw_index(cmd_buffer, i);
4380                emitted = true;
4381             }
4382             /* Emitting draw index or vertex index BOs may result in needing
4383              * additional VF cache flushes.
4384              */
4385             if (emitted)
4386                genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4387 
4388             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4389                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4390                prim.VertexAccessType         = RANDOM;
4391                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4392                prim.VertexCountPerInstance   = draw->indexCount;
4393                prim.StartVertexLocation      = draw->firstIndex;
4394                prim.InstanceCount            = instanceCount *
4395                                                pipeline->instance_multiplier;
4396                prim.StartInstanceLocation    = firstInstance;
4397                prim.BaseVertexLocation       = *pVertexOffset;
4398             }
4399             emitted = false;
4400          }
4401       } else {
4402          if (vs_prog_data->uses_firstvertex ||
4403              vs_prog_data->uses_baseinstance) {
4404             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4405             /* Emitting draw index or vertex index BOs may result in needing
4406              * additional VF cache flushes.
4407              */
4408             genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4409          }
4410          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4411             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4412                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4413                prim.VertexAccessType         = RANDOM;
4414                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4415                prim.VertexCountPerInstance   = draw->indexCount;
4416                prim.StartVertexLocation      = draw->firstIndex;
4417                prim.InstanceCount            = instanceCount *
4418                                                pipeline->instance_multiplier;
4419                prim.StartInstanceLocation    = firstInstance;
4420                prim.BaseVertexLocation       = *pVertexOffset;
4421             }
4422          }
4423       }
4424    } else {
4425       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4426          cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4427                                                     draw->vertexOffset,
4428                                                     firstInstance, i, i != 0);
4429 
4430          anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4431             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4432             prim.VertexAccessType         = RANDOM;
4433             prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4434             prim.VertexCountPerInstance   = draw->indexCount;
4435             prim.StartVertexLocation      = draw->firstIndex;
4436             prim.InstanceCount            = instanceCount *
4437                                             pipeline->instance_multiplier;
4438             prim.StartInstanceLocation    = firstInstance;
4439             prim.BaseVertexLocation       = draw->vertexOffset;
4440          }
4441       }
4442    }
4443 
4444    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4445 
4446    trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
4447 }
4448 
4449 /* Auto-Draw / Indirect Registers */
4450 #define GFX7_3DPRIM_END_OFFSET          0x2420
4451 #define GFX7_3DPRIM_START_VERTEX        0x2430
4452 #define GFX7_3DPRIM_VERTEX_COUNT        0x2434
4453 #define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
4454 #define GFX7_3DPRIM_START_INSTANCE      0x243C
4455 #define GFX7_3DPRIM_BASE_VERTEX         0x2440
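/* Indirect draws work by loading these registers with MI commands (see
 * load_indirect_parameters() and the mi_store() calls below) and then
 * issuing 3DPRIMITIVE with IndirectParameterEnable set.
 */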
4456 
4457 void genX(CmdDrawIndirectByteCountEXT)(
4458     VkCommandBuffer                             commandBuffer,
4459     uint32_t                                    instanceCount,
4460     uint32_t                                    firstInstance,
4461     VkBuffer                                    counterBuffer,
4462     VkDeviceSize                                counterBufferOffset,
4463     uint32_t                                    counterOffset,
4464     uint32_t                                    vertexStride)
4465 {
4466 #if GFX_VERx10 >= 75
4467    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4468    ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
4469    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4470    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4471 
4472    /* firstVertex is always zero for this draw function */
4473    const uint32_t firstVertex = 0;
4474 
4475    if (anv_batch_has_error(&cmd_buffer->batch))
4476       return;
4477 
4478    anv_measure_snapshot(cmd_buffer,
4479                         INTEL_SNAPSHOT_DRAW,
4480                         "draw indirect byte count",
4481                         instanceCount * pipeline->instance_multiplier);
4482    trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
4483 
4484    genX(cmd_buffer_flush_state)(cmd_buffer);
4485 
4486    if (cmd_buffer->state.conditional_render_enabled)
4487       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4488 
4489    if (vs_prog_data->uses_firstvertex ||
4490        vs_prog_data->uses_baseinstance)
4491       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
4492    if (vs_prog_data->uses_drawid)
4493       emit_draw_index(cmd_buffer, 0);
4494 
4495    /* Emitting draw index or vertex index BOs may result in needing
4496     * additional VF cache flushes.
4497     */
4498    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4499 
4500    struct mi_builder b;
4501    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4502    struct mi_value count =
4503       mi_mem32(anv_address_add(counter_buffer->address,
4504                                    counterBufferOffset));
4505    if (counterOffset)
4506       count = mi_isub(&b, count, mi_imm(counterOffset));
4507    count = mi_udiv32_imm(&b, count, vertexStride);
4508    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
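   /* For example, a counter value of 1200 bytes with counterOffset 0 and a
    * vertexStride of 12 results in a vertex count of 100.
    */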
4509 
4510    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
4511    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
4512             mi_imm(instanceCount * pipeline->instance_multiplier));
4513    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
4514    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4515 
4516    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4517       prim.IndirectParameterEnable  = true;
4518       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4519       prim.VertexAccessType         = SEQUENTIAL;
4520       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4521    }
4522 
4523    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4524 
4525    trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
4526       instanceCount * pipeline->instance_multiplier);
4527 #endif /* GFX_VERx10 >= 75 */
4528 }
4529 
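/* Loads the 3DPRIM_* registers from an indirect draw parameter struct:
 * VkDrawIndirectCommand is { vertexCount, instanceCount, firstVertex,
 * firstInstance } while VkDrawIndexedIndirectCommand is { indexCount,
 * instanceCount, firstIndex, vertexOffset, firstInstance }, which is why the
 * 12/16 byte offsets below differ between the indexed and non-indexed paths.
 */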
4530 static void
4531 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
4532                          struct anv_address addr,
4533                          bool indexed)
4534 {
4535    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4536 
4537    struct mi_builder b;
4538    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4539 
4540    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
4541                 mi_mem32(anv_address_add(addr, 0)));
4542 
4543    struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
4544    if (pipeline->instance_multiplier > 1) {
4545 #if GFX_VERx10 >= 75
4546       instance_count = mi_imul_imm(&b, instance_count,
4547                                    pipeline->instance_multiplier);
4548 #else
4549       anv_finishme("Multiview + indirect draw requires MI_MATH; "
4550                    "MI_MATH is not supported on Ivy Bridge");
4551 #endif
4552    }
4553    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
4554 
4555    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
4556                 mi_mem32(anv_address_add(addr, 8)));
4557 
4558    if (indexed) {
4559       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
4560                    mi_mem32(anv_address_add(addr, 12)));
4561       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4562                    mi_mem32(anv_address_add(addr, 16)));
4563    } else {
4564       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4565                    mi_mem32(anv_address_add(addr, 12)));
4566       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4567    }
4568 }
4569 
4570 void genX(CmdDrawIndirect)(
4571     VkCommandBuffer                             commandBuffer,
4572     VkBuffer                                    _buffer,
4573     VkDeviceSize                                offset,
4574     uint32_t                                    drawCount,
4575     uint32_t                                    stride)
4576 {
4577    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4578    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4579    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4580    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4581 
4582    if (anv_batch_has_error(&cmd_buffer->batch))
4583       return;
4584 
4585    anv_measure_snapshot(cmd_buffer,
4586                         INTEL_SNAPSHOT_DRAW,
4587                         "draw indirect",
4588                         drawCount);
4589    trace_intel_begin_draw_indirect(&cmd_buffer->trace);
4590 
4591    genX(cmd_buffer_flush_state)(cmd_buffer);
4592 
4593    if (cmd_buffer->state.conditional_render_enabled)
4594       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4595 
4596    for (uint32_t i = 0; i < drawCount; i++) {
4597       struct anv_address draw = anv_address_add(buffer->address, offset);
4598 
4599       if (vs_prog_data->uses_firstvertex ||
4600           vs_prog_data->uses_baseinstance)
4601          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4602       if (vs_prog_data->uses_drawid)
4603          emit_draw_index(cmd_buffer, i);
4604 
4605       /* Emitting draw index or vertex index BOs may result in needing
4606        * additional VF cache flushes.
4607        */
4608       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4609 
4610       load_indirect_parameters(cmd_buffer, draw, false);
4611 
4612       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4613          prim.IndirectParameterEnable  = true;
4614          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4615          prim.VertexAccessType         = SEQUENTIAL;
4616          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4617       }
4618 
4619       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4620 
4621       offset += stride;
4622    }
4623 
4624    trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
4625 }
4626 
4627 void genX(CmdDrawIndexedIndirect)(
4628     VkCommandBuffer                             commandBuffer,
4629     VkBuffer                                    _buffer,
4630     VkDeviceSize                                offset,
4631     uint32_t                                    drawCount,
4632     uint32_t                                    stride)
4633 {
4634    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4635    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4636    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4637    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4638 
4639    if (anv_batch_has_error(&cmd_buffer->batch))
4640       return;
4641 
4642    anv_measure_snapshot(cmd_buffer,
4643                         INTEL_SNAPSHOT_DRAW,
4644                         "draw indexed indirect",
4645                         drawCount);
4646    trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
4647 
4648    genX(cmd_buffer_flush_state)(cmd_buffer);
4649 
4650    if (cmd_buffer->state.conditional_render_enabled)
4651       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4652 
4653    for (uint32_t i = 0; i < drawCount; i++) {
4654       struct anv_address draw = anv_address_add(buffer->address, offset);
4655 
4656       /* TODO: We need to stomp base vertex to 0 somehow */
4657       if (vs_prog_data->uses_firstvertex ||
4658           vs_prog_data->uses_baseinstance)
4659          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4660       if (vs_prog_data->uses_drawid)
4661          emit_draw_index(cmd_buffer, i);
4662 
4663       /* Emitting draw index or vertex index BOs may result in needing
4664        * additional VF cache flushes.
4665        */
4666       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4667 
4668       load_indirect_parameters(cmd_buffer, draw, true);
4669 
4670       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4671          prim.IndirectParameterEnable  = true;
4672          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4673          prim.VertexAccessType         = RANDOM;
4674          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4675       }
4676 
4677       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4678 
4679       offset += stride;
4680    }
4681 
4682    trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
4683 }
4684 
4685 static struct mi_value
4686 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4687                                  struct mi_builder *b,
4688                                  struct anv_buffer *count_buffer,
4689                                  uint64_t countBufferOffset)
4690 {
4691    struct anv_address count_address =
4692          anv_address_add(count_buffer->address, countBufferOffset);
4693 
4694    struct mi_value ret = mi_imm(0);
4695 
4696    if (cmd_buffer->state.conditional_render_enabled) {
4697 #if GFX_VERx10 >= 75
4698       ret = mi_new_gpr(b);
4699       mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4700 #endif
4701    } else {
4702       /* Upload the current draw count from the draw parameters buffer to
4703        * MI_PREDICATE_SRC0.
4704        */
4705       mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
4706       mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
4707    }
4708 
4709    return ret;
4710 }
4711 
4712 static void
4713 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4714                           struct mi_builder *b,
4715                           uint32_t draw_index)
4716 {
4717    /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4718    mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4719 
4720    if (draw_index == 0) {
4721       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4722          mip.LoadOperation    = LOAD_LOADINV;
4723          mip.CombineOperation = COMBINE_SET;
4724          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4725       }
4726    } else {
4727       /* While draw_index < draw_count the predicate's result will be
4728        *  (draw_index == draw_count) ^ TRUE = TRUE
4729        * When draw_index == draw_count the result is
4730        *  (TRUE) ^ TRUE = FALSE
4731        * After this all results will be:
4732        *  (FALSE) ^ FALSE = FALSE
4733        */
4734       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4735          mip.LoadOperation    = LOAD_LOAD;
4736          mip.CombineOperation = COMBINE_XOR;
4737          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4738       }
4739    }
4740 }
4741 
4742 #if GFX_VERx10 >= 75
4743 static void
4744 emit_draw_count_predicate_with_conditional_render(
4745                           struct anv_cmd_buffer *cmd_buffer,
4746                           struct mi_builder *b,
4747                           uint32_t draw_index,
4748                           struct mi_value max)
4749 {
4750    struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4751    pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
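   /* The final predicate is (draw_index < draw count from the count buffer)
    * AND the conditional rendering result, so a draw is skipped when either
    * check fails.
    */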
4752 
4753 #if GFX_VER >= 8
4754    mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4755 #else
4756    /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
4757     * so we emit MI_PREDICATE to set it.
4758     */
4759 
4760    mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4761    mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4762 
4763    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4764       mip.LoadOperation    = LOAD_LOADINV;
4765       mip.CombineOperation = COMBINE_SET;
4766       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4767    }
4768 #endif
4769 }
4770 #endif
4771 
4772 static void
4773 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4774                                struct mi_builder *b,
4775                                uint32_t draw_index,
4776                                struct mi_value max)
4777 {
4778 #if GFX_VERx10 >= 75
4779    if (cmd_buffer->state.conditional_render_enabled) {
4780       emit_draw_count_predicate_with_conditional_render(
4781             cmd_buffer, b, draw_index, mi_value_ref(b, max));
4782    } else {
4783       emit_draw_count_predicate(cmd_buffer, b, draw_index);
4784    }
4785 #else
4786    emit_draw_count_predicate(cmd_buffer, b, draw_index);
4787 #endif
4788 }
4789 
4790 void genX(CmdDrawIndirectCount)(
4791     VkCommandBuffer                             commandBuffer,
4792     VkBuffer                                    _buffer,
4793     VkDeviceSize                                offset,
4794     VkBuffer                                    _countBuffer,
4795     VkDeviceSize                                countBufferOffset,
4796     uint32_t                                    maxDrawCount,
4797     uint32_t                                    stride)
4798 {
4799    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4800    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4801    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4802    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4803    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4804    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4805 
4806    if (anv_batch_has_error(&cmd_buffer->batch))
4807       return;
4808 
4809    anv_measure_snapshot(cmd_buffer,
4810                         INTEL_SNAPSHOT_DRAW,
4811                         "draw indirect count",
4812                         0);
4813    trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
4814 
4815    genX(cmd_buffer_flush_state)(cmd_buffer);
4816 
4817    struct mi_builder b;
4818    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4819    struct mi_value max =
4820       prepare_for_draw_count_predicate(cmd_buffer, &b,
4821                                        count_buffer, countBufferOffset);
4822 
4823    for (uint32_t i = 0; i < maxDrawCount; i++) {
4824       struct anv_address draw = anv_address_add(buffer->address, offset);
4825 
4826       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4827 
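      /* In VkDrawIndirectCommand, firstVertex and firstInstance start at
       * byte offset 8.
       */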
4828       if (vs_prog_data->uses_firstvertex ||
4829           vs_prog_data->uses_baseinstance)
4830          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4831       if (vs_prog_data->uses_drawid)
4832          emit_draw_index(cmd_buffer, i);
4833 
4834       /* Emitting draw index or vertex index BOs may result in needing
4835        * additional VF cache flushes.
4836        */
4837       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4838 
4839       load_indirect_parameters(cmd_buffer, draw, false);
4840 
4841       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4842          prim.IndirectParameterEnable  = true;
4843          prim.PredicateEnable          = true;
4844          prim.VertexAccessType         = SEQUENTIAL;
4845          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4846       }
4847 
4848       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4849 
4850       offset += stride;
4851    }
4852 
4853    mi_value_unref(&b, max);
4854 
4855    trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
4856 }
4857 
4858 void genX(CmdDrawIndexedIndirectCount)(
4859     VkCommandBuffer                             commandBuffer,
4860     VkBuffer                                    _buffer,
4861     VkDeviceSize                                offset,
4862     VkBuffer                                    _countBuffer,
4863     VkDeviceSize                                countBufferOffset,
4864     uint32_t                                    maxDrawCount,
4865     uint32_t                                    stride)
4866 {
4867    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4868    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4869    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4870    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4871    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4872    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4873 
4874    if (anv_batch_has_error(&cmd_buffer->batch))
4875       return;
4876 
4877    anv_measure_snapshot(cmd_buffer,
4878                         INTEL_SNAPSHOT_DRAW,
4879                         "draw indexed indirect count",
4880                         0);
4881    trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
4882 
4883    genX(cmd_buffer_flush_state)(cmd_buffer);
4884 
4885    struct mi_builder b;
4886    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4887    struct mi_value max =
4888       prepare_for_draw_count_predicate(cmd_buffer, &b,
4889                                        count_buffer, countBufferOffset);
4890 
4891    for (uint32_t i = 0; i < maxDrawCount; i++) {
4892       struct anv_address draw = anv_address_add(buffer->address, offset);
4893 
4894       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4895 
4896       /* TODO: We need to stomp base vertex to 0 somehow */
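      /* In VkDrawIndexedIndirectCommand, vertexOffset and firstInstance
       * start at byte offset 12.
       */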
4897       if (vs_prog_data->uses_firstvertex ||
4898           vs_prog_data->uses_baseinstance)
4899          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4900       if (vs_prog_data->uses_drawid)
4901          emit_draw_index(cmd_buffer, i);
4902 
4903       /* Emitting draw index or vertex index BOs may result in needing
4904        * additional VF cache flushes.
4905        */
4906       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4907 
4908       load_indirect_parameters(cmd_buffer, draw, true);
4909 
4910       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4911          prim.IndirectParameterEnable  = true;
4912          prim.PredicateEnable          = true;
4913          prim.VertexAccessType         = RANDOM;
4914          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4915       }
4916 
4917       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4918 
4919       offset += stride;
4920    }
4921 
4922    mi_value_unref(&b, max);
4923 
4924    trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
4925 
4926 }
4927 
4928 void genX(CmdBeginTransformFeedbackEXT)(
4929     VkCommandBuffer                             commandBuffer,
4930     uint32_t                                    firstCounterBuffer,
4931     uint32_t                                    counterBufferCount,
4932     const VkBuffer*                             pCounterBuffers,
4933     const VkDeviceSize*                         pCounterBufferOffsets)
4934 {
4935    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4936 
4937    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4938    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4939    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4940 
4941    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4942     *
4943     *    "Software must ensure that no HW stream output operations can be in
4944     *    process or otherwise pending at the point that the MI_LOAD/STORE
4945     *    commands are processed. This will likely require a pipeline flush."
4946     */
4947    anv_add_pending_pipe_bits(cmd_buffer,
4948                              ANV_PIPE_CS_STALL_BIT,
4949                              "begin transform feedback");
4950    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4951 
4952    for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4953       /* If we have a counter buffer, this is a resume so we need to load the
4954        * value into the streamout offset register.  Otherwise, this is a begin
4955        * and we need to reset it to zero.
4956        */
4957       if (pCounterBuffers &&
4958           idx >= firstCounterBuffer &&
4959           idx - firstCounterBuffer < counterBufferCount &&
4960           pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4961          uint32_t cb_idx = idx - firstCounterBuffer;
4962          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4963          uint64_t offset = pCounterBufferOffsets ?
4964                            pCounterBufferOffsets[cb_idx] : 0;
4965 
4966          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4967             lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4968             lrm.MemoryAddress    = anv_address_add(counter_buffer->address,
4969                                                    offset);
4970          }
4971       } else {
4972          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4973             lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4974             lri.DataDWord        = 0;
4975          }
4976       }
4977    }
4978 
4979    cmd_buffer->state.xfb_enabled = true;
4980    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4981 }
4982 
4983 void genX(CmdEndTransformFeedbackEXT)(
4984     VkCommandBuffer                             commandBuffer,
4985     uint32_t                                    firstCounterBuffer,
4986     uint32_t                                    counterBufferCount,
4987     const VkBuffer*                             pCounterBuffers,
4988     const VkDeviceSize*                         pCounterBufferOffsets)
4989 {
4990    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4991 
4992    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4993    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4994    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4995 
4996    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4997     *
4998     *    "Software must ensure that no HW stream output operations can be in
4999     *    process or otherwise pending at the point that the MI_LOAD/STORE
5000     *    commands are processed. This will likely require a pipeline flush."
5001     */
5002    anv_add_pending_pipe_bits(cmd_buffer,
5003                              ANV_PIPE_CS_STALL_BIT,
5004                              "end transform feedback");
5005    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5006 
5007    for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
5008       unsigned idx = firstCounterBuffer + cb_idx;
5009 
5010       /* If we have a counter buffer, this is a pause, so we need to store the
5011        * current value of the streamout offset register into the counter
5012        * buffer.  Otherwise, there is nothing to save and we can skip it.
5013        */
5014       if (pCounterBuffers &&
5015           cb_idx < counterBufferCount &&
5016           pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
5017          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
5018          uint64_t offset = pCounterBufferOffsets ?
5019                            pCounterBufferOffsets[cb_idx] : 0;
5020 
5021          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
5022             srm.MemoryAddress    = anv_address_add(counter_buffer->address,
5023                                                    offset);
5024             srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
5025          }
5026       }
5027    }
5028 
5029    cmd_buffer->state.xfb_enabled = false;
5030    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
5031 }
5032 
5033 #if GFX_VERx10 >= 125
5034 void
5035 genX(CmdDrawMeshTasksNV)(
5036     VkCommandBuffer                             commandBuffer,
5037     uint32_t                                    taskCount,
5038     uint32_t                                    firstTask)
5039 {
5040    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5041 
5042    if (anv_batch_has_error(&cmd_buffer->batch))
5043       return;
5044 
5045    /* TODO(mesh): Check if this is not emitting more packets than we need. */
5046    genX(cmd_buffer_flush_state)(cmd_buffer);
5047 
5048    if (cmd_buffer->state.conditional_render_enabled)
5049       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5050 
5051    /* BSpec 54016 says: "The values passed for Starting ThreadGroup ID X
5052     * and ThreadGroup Count X shall not cause TGIDs to exceed (2^32)-1."
5053     */
5054    assert((int64_t)firstTask + taskCount - 1 <= UINT32_MAX);
5055 
5056    anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_1D), m) {
5057       m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
5058       m.ThreadGroupCountX = taskCount;
5059       m.StartingThreadGroupIDX = firstTask;
5060    }
5061 }
5062 
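/* MMIO registers used to supply the parameters of an indirect 3DMESH_1D:
 * thread group count, starting thread group ID, and extended parameters.
 */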
5063 #define GFX125_3DMESH_TG_COUNT 0x26F0
5064 #define GFX125_3DMESH_STARTING_TGID 0x26F4
5065 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
5066 
5067 static void
5068 mesh_load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
5069                               struct mi_builder *b,
5070                               struct anv_address addr,
5071                               bool emit_xp0,
5072                               uint32_t xp0)
5073 {
5074    const size_t taskCountOff = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
5075    const size_t firstTaskOff = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask);
5076 
5077    mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
5078                mi_mem32(anv_address_add(addr, taskCountOff)));
5079 
5080    mi_store(b, mi_reg32(GFX125_3DMESH_STARTING_TGID),
5081                mi_mem32(anv_address_add(addr, firstTaskOff)));
5082 
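   /* Extended parameter 0 is used to pass the draw index to task/mesh
    * shaders that read gl_DrawID.
    */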
5083    if (emit_xp0)
5084       mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
5085 }
5086 
5087 static void
5088 emit_indirect_3dmesh_1d(struct anv_batch *batch,
5089                         bool predicate_enable,
5090                         bool uses_drawid)
5091 {
5092    uint32_t len = GENX(3DMESH_1D_length) + uses_drawid;
5093    uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_1D),
5094                    .PredicateEnable           = predicate_enable,
5095                    .IndirectParameterEnable   = true,
5096                    .ExtendedParameter0Present = uses_drawid);
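   /* The extended-parameter dword appended to the packet is only zero-filled
    * here; for these indirect draws the draw index is supplied through the
    * 3DPRIM_XP(0) register written by mesh_load_indirect_parameters().
    */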
5097    if (uses_drawid)
5098       dw[len - 1] = 0;
5099 }
5100 
5101 void
5102 genX(CmdDrawMeshTasksIndirectNV)(
5103     VkCommandBuffer                             commandBuffer,
5104     VkBuffer                                    _buffer,
5105     VkDeviceSize                                offset,
5106     uint32_t                                    drawCount,
5107     uint32_t                                    stride)
5108 {
5109    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5110    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5111    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
5112    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
5113    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
5114    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5115 
5116    if (anv_batch_has_error(&cmd_buffer->batch))
5117       return;
5118 
5119    genX(cmd_buffer_flush_state)(cmd_buffer);
5120 
5121    if (cmd_state->conditional_render_enabled)
5122       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5123 
5124    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
5125                        mesh_prog_data->uses_drawid;
5126    struct mi_builder b;
5127    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5128 
5129    for (uint32_t i = 0; i < drawCount; i++) {
5130       struct anv_address draw = anv_address_add(buffer->address, offset);
5131 
5132       mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i);
5133 
5134       emit_indirect_3dmesh_1d(&cmd_buffer->batch,
5135             cmd_state->conditional_render_enabled, uses_drawid);
5136 
5137       offset += stride;
5138    }
5139 }
5140 
5141 void
5142 genX(CmdDrawMeshTasksIndirectCountNV)(
5143     VkCommandBuffer                             commandBuffer,
5144     VkBuffer                                    _buffer,
5145     VkDeviceSize                                offset,
5146     VkBuffer                                    _countBuffer,
5147     VkDeviceSize                                countBufferOffset,
5148     uint32_t                                    maxDrawCount,
5149     uint32_t                                    stride)
5150 {
5151    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5152    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5153    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
5154    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
5155    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
5156    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
5157 
5158    if (anv_batch_has_error(&cmd_buffer->batch))
5159       return;
5160 
5161    genX(cmd_buffer_flush_state)(cmd_buffer);
5162 
5163    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
5164                        mesh_prog_data->uses_drawid;
5165 
5166    struct mi_builder b;
5167    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5168 
5169    struct mi_value max =
5170          prepare_for_draw_count_predicate(cmd_buffer, &b,
5171                                           count_buffer, countBufferOffset);
5172 
5173    for (uint32_t i = 0; i < maxDrawCount; i++) {
5174       struct anv_address draw = anv_address_add(buffer->address, offset);
5175 
5176       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
5177 
5178       mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i);
5179 
5180       emit_indirect_3dmesh_1d(&cmd_buffer->batch, true, uses_drawid);
5181 
5182       offset += stride;
5183    }
5184 }
5185 #endif /* GFX_VERx10 >= 125 */
5186 
5187 void
5188 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
5189 {
5190    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
5191    struct anv_compute_pipeline *pipeline = comp_state->pipeline;
5192 
5193    assert(pipeline->cs);
5194 
5195    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5196 
5197    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5198 
5199    /* Apply any pending pipeline flushes we may have.  We want to apply them
5200     * now because, if any of those flushes are for things like push constants,
5201     * the GPU will read the state at weird times.
5202     */
5203    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5204 
5205    if (cmd_buffer->state.compute.pipeline_dirty) {
5206       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
5207        *
5208        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
5209        *    the only bits that are changed are scoreboard related: Scoreboard
5210        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
5211        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
5212        *    sufficient."
5213        */
5214       anv_add_pending_pipe_bits(cmd_buffer,
5215                               ANV_PIPE_CS_STALL_BIT,
5216                               "flush compute state");
5217       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5218 
5219       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
5220 
5221       /* The workgroup size of the pipeline affects our push constant layout
5222        * so flag push constants as dirty if we change the pipeline.
5223        */
5224       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5225    }
5226 
5227    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
5228        cmd_buffer->state.compute.pipeline_dirty) {
5229       flush_descriptor_sets(cmd_buffer,
5230                             &cmd_buffer->state.compute.base,
5231                             VK_SHADER_STAGE_COMPUTE_BIT,
5232                             &pipeline->cs, 1);
5233       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
5234 
5235 #if GFX_VERx10 < 125
5236       uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
5237       struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
5238          .BindingTablePointer =
5239             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
5240          .SamplerStatePointer =
5241             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
5242       };
5243       GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
5244 
5245       struct anv_state state =
5246          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
5247                                       pipeline->interface_descriptor_data,
5248                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
5249                                       64);
5250 
5251       uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
5252       anv_batch_emit(&cmd_buffer->batch,
5253                      GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
5254          mid.InterfaceDescriptorTotalLength        = size;
5255          mid.InterfaceDescriptorDataStartAddress   = state.offset;
5256       }
5257 #endif
5258    }
5259 
5260    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
5261       comp_state->push_data =
5262          anv_cmd_buffer_cs_push_constants(cmd_buffer);
5263 
5264 #if GFX_VERx10 < 125
5265       if (comp_state->push_data.alloc_size) {
5266          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
5267             curbe.CURBETotalDataLength    = comp_state->push_data.alloc_size;
5268             curbe.CURBEDataStartAddress   = comp_state->push_data.offset;
5269          }
5270       }
5271 #endif
5272 
5273       cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
5274    }
5275 
5276    cmd_buffer->state.compute.pipeline_dirty = false;
5277 
5278    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5279 }
5280 
5281 #if GFX_VER == 7
5282 
5283 static VkResult
5284 verify_cmd_parser(const struct anv_device *device,
5285                   int required_version,
5286                   const char *function)
5287 {
5288    if (device->physical->cmd_parser_version < required_version) {
5289       return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
5290                        "cmd parser version %d is required for %s",
5291                        required_version, function);
5292    } else {
5293       return VK_SUCCESS;
5294    }
5295 }
5296 
5297 #endif
5298 
5299 static void
5300 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
5301                                   uint32_t baseGroupX,
5302                                   uint32_t baseGroupY,
5303                                   uint32_t baseGroupZ)
5304 {
5305    if (anv_batch_has_error(&cmd_buffer->batch))
5306       return;
5307 
5308    struct anv_push_constants *push =
5309       &cmd_buffer->state.compute.base.push_constants;
5310    if (push->cs.base_work_group_id[0] != baseGroupX ||
5311        push->cs.base_work_group_id[1] != baseGroupY ||
5312        push->cs.base_work_group_id[2] != baseGroupZ) {
5313       push->cs.base_work_group_id[0] = baseGroupX;
5314       push->cs.base_work_group_id[1] = baseGroupY;
5315       push->cs.base_work_group_id[2] = baseGroupZ;
5316 
5317       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5318    }
5319 }
5320 
5321 void genX(CmdDispatch)(
5322     VkCommandBuffer                             commandBuffer,
5323     uint32_t                                    x,
5324     uint32_t                                    y,
5325     uint32_t                                    z)
5326 {
5327    genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
5328 }
5329 
5330 #if GFX_VERx10 >= 125
5331 
5332 static inline void
5333 emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
5334                     const struct anv_compute_pipeline *pipeline, bool indirect,
5335                     const struct brw_cs_prog_data *prog_data,
5336                     uint32_t groupCountX, uint32_t groupCountY,
5337                     uint32_t groupCountZ)
5338 {
5339    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
5340    const struct anv_shader_bin *cs_bin = pipeline->cs;
5341    bool predicate = cmd_buffer->state.conditional_render_enabled;
5342 
5343    const struct intel_device_info *devinfo = &pipeline->base.device->info;
5344    const struct brw_cs_dispatch_info dispatch =
5345       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
5346 
5347    anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5348       cw.IndirectParameterEnable        = indirect;
5349       cw.PredicateEnable                = predicate;
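      /* SIMDSize encodes SIMD8/16/32 as 0/1/2, hence the divide by 16. */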
5350       cw.SIMDSize                       = dispatch.simd_size / 16;
5351       cw.IndirectDataStartAddress       = comp_state->push_data.offset;
5352       cw.IndirectDataLength             = comp_state->push_data.alloc_size;
5353       cw.LocalXMaximum                  = prog_data->local_size[0] - 1;
5354       cw.LocalYMaximum                  = prog_data->local_size[1] - 1;
5355       cw.LocalZMaximum                  = prog_data->local_size[2] - 1;
5356       cw.ThreadGroupIDXDimension        = groupCountX;
5357       cw.ThreadGroupIDYDimension        = groupCountY;
5358       cw.ThreadGroupIDZDimension        = groupCountZ;
5359       cw.ExecutionMask                  = dispatch.right_mask;
5360       cw.PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0);
5361 
5362       cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5363          .KernelStartPointer = cs_bin->kernel.offset,
5364          .SamplerStatePointer =
5365             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
5366          .BindingTablePointer =
5367             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
5368          .BindingTableEntryCount =
5369             1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
5370          .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
5371          .SharedLocalMemorySize = encode_slm_size(GFX_VER,
5372                                                   prog_data->base.total_shared),
5373          .NumberOfBarriers = prog_data->uses_barrier,
5374       };
5375    }
5376 }
5377 
5378 #else /* #if GFX_VERx10 >= 125 */
5379 
5380 static inline void
5381 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
5382                   const struct anv_compute_pipeline *pipeline, bool indirect,
5383                   const struct brw_cs_prog_data *prog_data,
5384                   uint32_t groupCountX, uint32_t groupCountY,
5385                   uint32_t groupCountZ)
5386 {
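   /* On Gfx7, indirect dispatches are predicated by the MI_PREDICATE set up
    * in CmdDispatchIndirect so that zero-sized dispatches are skipped.
    */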
5387    bool predicate = (GFX_VER <= 7 && indirect) ||
5388       cmd_buffer->state.conditional_render_enabled;
5389 
5390    const struct intel_device_info *devinfo = &pipeline->base.device->info;
5391    const struct brw_cs_dispatch_info dispatch =
5392       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
5393 
5394    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
5395       ggw.IndirectParameterEnable      = indirect;
5396       ggw.PredicateEnable              = predicate;
5397       ggw.SIMDSize                     = dispatch.simd_size / 16;
5398       ggw.ThreadDepthCounterMaximum    = 0;
5399       ggw.ThreadHeightCounterMaximum   = 0;
5400       ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
5401       ggw.ThreadGroupIDXDimension      = groupCountX;
5402       ggw.ThreadGroupIDYDimension      = groupCountY;
5403       ggw.ThreadGroupIDZDimension      = groupCountZ;
5404       ggw.RightExecutionMask           = dispatch.right_mask;
5405       ggw.BottomExecutionMask          = 0xffffffff;
5406    }
5407 
5408    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
5409 }
5410 
5411 #endif /* #if GFX_VERx10 >= 125 */
5412 
5413 static inline void
5414 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
5415                const struct anv_compute_pipeline *pipeline, bool indirect,
5416                const struct brw_cs_prog_data *prog_data,
5417                uint32_t groupCountX, uint32_t groupCountY,
5418                uint32_t groupCountZ)
5419 {
5420 #if GFX_VERx10 >= 125
5421    emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
5422                        groupCountY, groupCountZ);
5423 #else
5424    emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
5425                      groupCountY, groupCountZ);
5426 #endif
5427 }
5428 
5429 void genX(CmdDispatchBase)(
5430     VkCommandBuffer                             commandBuffer,
5431     uint32_t                                    baseGroupX,
5432     uint32_t                                    baseGroupY,
5433     uint32_t                                    baseGroupZ,
5434     uint32_t                                    groupCountX,
5435     uint32_t                                    groupCountY,
5436     uint32_t                                    groupCountZ)
5437 {
5438    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5439    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5440    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5441 
5442    anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
5443                                      baseGroupY, baseGroupZ);
5444 
5445    if (anv_batch_has_error(&cmd_buffer->batch))
5446       return;
5447 
5448    anv_measure_snapshot(cmd_buffer,
5449                         INTEL_SNAPSHOT_COMPUTE,
5450                         "compute",
5451                         groupCountX * groupCountY * groupCountZ *
5452                         prog_data->local_size[0] * prog_data->local_size[1] *
5453                         prog_data->local_size[2]);
5454 
5455    trace_intel_begin_compute(&cmd_buffer->trace);
5456 
5457    if (prog_data->uses_num_work_groups) {
5458       struct anv_state state =
5459          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
5460       uint32_t *sizes = state.map;
5461       sizes[0] = groupCountX;
5462       sizes[1] = groupCountY;
5463       sizes[2] = groupCountZ;
5464       cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
5465          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5466          .offset = state.offset,
5467       };
5468 
5469       /* The num_workgroups buffer goes in the binding table */
5470       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5471    }
5472 
5473    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5474 
5475    if (cmd_buffer->state.conditional_render_enabled)
5476       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5477 
5478    emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
5479                   groupCountY, groupCountZ);
5480 
5481    trace_intel_end_compute(&cmd_buffer->trace,
5482                            groupCountX, groupCountY, groupCountZ);
5483 }
5484 
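/* MMIO registers the walker reads its indirect thread group counts from. */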
5485 #define GPGPU_DISPATCHDIMX 0x2500
5486 #define GPGPU_DISPATCHDIMY 0x2504
5487 #define GPGPU_DISPATCHDIMZ 0x2508
5488 
5489 void genX(CmdDispatchIndirect)(
5490     VkCommandBuffer                             commandBuffer,
5491     VkBuffer                                    _buffer,
5492     VkDeviceSize                                offset)
5493 {
5494    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5495    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5496    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5497    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5498    struct anv_address addr = anv_address_add(buffer->address, offset);
5499    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
5500 
5501    anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
5502 
5503 #if GFX_VER == 7
5504    /* Linux 4.4 added command parser version 5 which allows the GPGPU
5505     * indirect dispatch registers to be written.
5506     */
5507    if (verify_cmd_parser(cmd_buffer->device, 5,
5508                          "vkCmdDispatchIndirect") != VK_SUCCESS)
5509       return;
5510 #endif
5511 
5512    anv_measure_snapshot(cmd_buffer,
5513                         INTEL_SNAPSHOT_COMPUTE,
5514                         "compute indirect",
5515                         0);
5516    trace_intel_begin_compute(&cmd_buffer->trace);
5517 
5518    if (prog_data->uses_num_work_groups) {
5519       cmd_buffer->state.compute.num_workgroups = addr;
5520 
5521       /* The num_workgroups buffer goes in the binding table */
5522       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5523    }
5524 
5525    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5526 
5527    struct mi_builder b;
5528    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5529 
5530    struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
5531    struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
5532    struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
5533 
5534    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
5535    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
5536    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
5537 
5538 #if GFX_VER <= 7
5539    /* predicate = (compute_dispatch_indirect_x_size == 0); */
5540    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
5541    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5542    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5543       mip.LoadOperation    = LOAD_LOAD;
5544       mip.CombineOperation = COMBINE_SET;
5545       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5546    }
5547 
5548    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
5549    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
5550    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5551       mip.LoadOperation    = LOAD_LOAD;
5552       mip.CombineOperation = COMBINE_OR;
5553       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5554    }
5555 
5556    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
5557    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
5558    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5559       mip.LoadOperation    = LOAD_LOAD;
5560       mip.CombineOperation = COMBINE_OR;
5561       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5562    }
5563 
5564    /* predicate = !predicate; */
5565    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5566       mip.LoadOperation    = LOAD_LOADINV;
5567       mip.CombineOperation = COMBINE_OR;
5568       mip.CompareOperation = COMPARE_FALSE;
5569    }
5570 
5571 #if GFX_VERx10 == 75
5572    if (cmd_buffer->state.conditional_render_enabled) {
5573       /* predicate &= !(conditional_rendering_predicate == 0); */
5574       mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
5575                    mi_reg32(ANV_PREDICATE_RESULT_REG));
5576       anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5577          mip.LoadOperation    = LOAD_LOADINV;
5578          mip.CombineOperation = COMBINE_AND;
5579          mip.CompareOperation = COMPARE_SRCS_EQUAL;
5580       }
5581    }
5582 #endif
5583 
5584 #else /* GFX_VER > 7 */
5585    if (cmd_buffer->state.conditional_render_enabled)
5586       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5587 #endif
5588 
5589    emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
5590 
5591    trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
5592 }
5593 
5594 struct anv_state
5595 genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
5596 {
5597 #if GFX_VERx10 >= 125
5598    struct anv_device *device = cmd_buffer->device;
5599 
5600    struct anv_state state =
5601       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5602                                          BRW_RT_DISPATCH_GLOBALS_SIZE,
5603                                          64);
5604    struct brw_rt_scratch_layout layout;
5605    uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
5606                                        * some cases?
5607                                        */
5608    brw_rt_compute_scratch_layout(&layout, &device->info,
5609                                  stack_ids_per_dss, 1 << 10);
5610 
5611    struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5612       .MemBaseAddress = (struct anv_address) {
5613          /* The ray query HW computes offsets from the top of the buffer, so
5614           * point the address at the end of the buffer.
5615           */
5616          .bo = device->ray_query_bo,
5617          .offset = device->ray_query_bo->size
5618       },
5619       .AsyncRTStackSize = layout.ray_stack_stride / 64,
5620       .NumDSSRTStacks = layout.stack_ids_per_dss,
5621       .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5622       .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5623       .ResumeShaderTable = (struct anv_address) {
5624          .bo = cmd_buffer->state.ray_query_shadow_bo,
5625       },
5626    };
5627    GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg);
5628 
5629    return state;
5630 #else
5631    unreachable("Not supported");
5632 #endif
5633 }
5634 
5635 #if GFX_VERx10 >= 125
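/* Choose per-dimension local sizes (as power-of-two shifts) for a trace-rays
 * dispatch: grow each dimension round-robin until the combined local size
 * reaches 2^3 = 8 invocations (one SIMD8 thread) or every dimension already
 * covers its global size, then give any leftover factors to X.
 */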
5636 static void
5637 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
5638 {
5639    unsigned total_shift = 0;
5640    memset(local_shift, 0, 3);
5641 
5642    bool progress;
5643    do {
5644       progress = false;
5645       for (unsigned i = 0; i < 3; i++) {
5646          assert(global[i] > 0);
5647          if ((1 << local_shift[i]) < global[i]) {
5648             progress = true;
5649             local_shift[i]++;
5650             total_shift++;
5651          }
5652 
5653          if (total_shift == 3)
5654             return;
5655       }
5656    } while (progress);
5657 
5658    /* Assign whatever's left to x */
5659    local_shift[0] += 3 - total_shift;
5660 }
5661 
5662 static struct GFX_RT_SHADER_TABLE
5663 vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
5664 {
5665    return (struct GFX_RT_SHADER_TABLE) {
5666       .BaseAddress = anv_address_from_u64(region->deviceAddress),
5667       .Stride = region->stride,
5668    };
5669 }
5670 
5671 static void
5672 cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
5673                       const VkStridedDeviceAddressRegionKHR *raygen_sbt,
5674                       const VkStridedDeviceAddressRegionKHR *miss_sbt,
5675                       const VkStridedDeviceAddressRegionKHR *hit_sbt,
5676                       const VkStridedDeviceAddressRegionKHR *callable_sbt,
5677                       bool is_indirect,
5678                       uint32_t launch_width,
5679                       uint32_t launch_height,
5680                       uint32_t launch_depth,
5681                       uint64_t launch_size_addr)
5682 {
5683    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
5684    struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
5685 
5686    if (anv_batch_has_error(&cmd_buffer->batch))
5687       return;
5688 
5689    /* If we have a known degenerate launch size, just bail */
5690    if (!is_indirect &&
5691        (launch_width == 0 || launch_height == 0 || launch_depth == 0))
5692       return;
5693 
5694    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5695    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5696 
5697    cmd_buffer->state.rt.pipeline_dirty = false;
5698 
5699    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5700 
5701    /* Manually add these to the reloc list: they're internal buffers that
5702     * don't have any relocs that would otherwise pick them up.
5703     *
5704     * TODO(RT): This is a bit of a hack
5705     */
5706    anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
5707                          cmd_buffer->batch.alloc,
5708                          rt->scratch.bo);
5709 
5710    /* Allocate and set up our RT_DISPATCH_GLOBALS */
5711    struct anv_state rtdg_state =
5712       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5713                                          BRW_RT_PUSH_CONST_OFFSET +
5714                                          sizeof(struct anv_push_constants),
5715                                          64);
5716 
5717    struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5718       .MemBaseAddress = (struct anv_address) {
5719          .bo = rt->scratch.bo,
5720          .offset = rt->scratch.layout.ray_stack_start,
5721       },
5722       .CallStackHandler =
5723          anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),
5724       .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
5725       .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
5726       .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5727       .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5728       .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
5729       .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
5730       .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
5731       .LaunchWidth = launch_width,
5732       .LaunchHeight = launch_height,
5733       .LaunchDepth = launch_depth,
5734       .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
5735    };
5736    GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
5737 
5738    /* Push constants go after the RT_DISPATCH_GLOBALS */
5739    assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
5740    memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
5741           &cmd_buffer->state.rt.base.push_constants,
5742           sizeof(struct anv_push_constants));
5743 
5744    struct anv_address rtdg_addr = {
5745       .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5746       .offset = rtdg_state.offset,
5747    };
5748 
5749    uint8_t local_size_log2[3];
5750    uint32_t global_size[3] = {};
5751    if (is_indirect) {
5752       /* Pick a local size that's probably ok.  We assume most TraceRays calls
5753        * will use a two-dimensional dispatch size.  Worst case, our initial
5754        * dispatch will be a little slower than it has to be.
5755        */
5756       local_size_log2[0] = 2;
5757       local_size_log2[1] = 1;
5758       local_size_log2[2] = 0;
5759 
5760       struct mi_builder b;
5761       mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5762 
5763       struct mi_value launch_size[3] = {
5764          mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
5765          mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
5766          mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
5767       };
5768 
5769       /* Store the original launch size into RT_DISPATCH_GLOBALS
5770        *
5771        * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
5772        * moved into a genX version.
5773        */
5774       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
5775                mi_value_ref(&b, launch_size[0]));
5776       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
5777                mi_value_ref(&b, launch_size[1]));
5778       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
5779                mi_value_ref(&b, launch_size[2]));
5780 
5781       /* Compute the global dispatch size */
5782       for (unsigned i = 0; i < 3; i++) {
5783          if (local_size_log2[i] == 0)
5784             continue;
5785 
5786          /* global_size = DIV_ROUND_UP(launch_size, local_size)
5787           *
5788        * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm
5789        * has the semantics of shifting the entire 64-bit value and taking
5790        * the bottom 32 bits, so we don't have to worry about roll-over.
5791           */
5792          uint32_t local_size = 1 << local_size_log2[i];
5793          launch_size[i] = mi_iadd(&b, launch_size[i],
5794                                       mi_imm(local_size - 1));
5795          launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
5796                                             local_size_log2[i]);
5797       }
5798 
5799       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
5800       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
5801       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
5802    } else {
5803       uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
5804       calc_local_trace_size(local_size_log2, launch_size);
5805 
5806       for (unsigned i = 0; i < 3; i++) {
5807          /* We have to be a bit careful here because the addition DIV_ROUND_UP
5808           * does on the numerator may overflow.  Cast to uint64_t to avoid this.
5809           */
5810          uint32_t local_size = 1 << local_size_log2[i];
5811          global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
5812       }
5813    }
5814 
5815    anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5816       cw.IndirectParameterEnable        = is_indirect;
5817       cw.PredicateEnable                = false;
5818       cw.SIMDSize                       = SIMD8;
5819       cw.LocalXMaximum                  = (1 << local_size_log2[0]) - 1;
5820       cw.LocalYMaximum                  = (1 << local_size_log2[1]) - 1;
5821       cw.LocalZMaximum                  = (1 << local_size_log2[2]) - 1;
5822       cw.ThreadGroupIDXDimension        = global_size[0];
5823       cw.ThreadGroupIDYDimension        = global_size[1];
5824       cw.ThreadGroupIDZDimension        = global_size[2];
5825       cw.ExecutionMask                  = 0xff;
5826       cw.EmitInlineParameter            = true;
5827       cw.PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0);
5828 
5829       const gl_shader_stage s = MESA_SHADER_RAYGEN;
5830       struct anv_device *device = cmd_buffer->device;
5831       struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
5832       struct anv_state *samplers = &cmd_buffer->state.samplers[s];
5833       cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5834          .KernelStartPointer = device->rt_trampoline->kernel.offset,
5835          .SamplerStatePointer = samplers->offset,
5836          /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
5837          .SamplerCount = 0,
5838          .BindingTablePointer = surfaces->offset,
5839          .NumberofThreadsinGPGPUThreadGroup = 1,
5840          .BTDMode = true,
5841       };
5842 
5843       struct brw_rt_raygen_trampoline_params trampoline_params = {
5844          .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
5845          .raygen_bsr_addr = raygen_sbt->deviceAddress,
5846          .is_indirect = is_indirect,
5847          .local_group_size_log2 = {
5848             local_size_log2[0],
5849             local_size_log2[1],
5850             local_size_log2[2],
5851          },
5852       };
5853       STATIC_ASSERT(sizeof(trampoline_params) == 32);
5854       memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
5855    }
5856 }
5857 
5858 void
5859 genX(CmdTraceRaysKHR)(
5860     VkCommandBuffer                             commandBuffer,
5861     const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
5862     const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
5863     const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
5864     const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
5865     uint32_t                                    width,
5866     uint32_t                                    height,
5867     uint32_t                                    depth)
5868 {
5869    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5870 
5871    cmd_buffer_trace_rays(cmd_buffer,
5872                          pRaygenShaderBindingTable,
5873                          pMissShaderBindingTable,
5874                          pHitShaderBindingTable,
5875                          pCallableShaderBindingTable,
5876                          false /* is_indirect */,
5877                          width, height, depth,
5878                          0 /* launch_size_addr */);
5879 }
5880 
5881 void
5882 genX(CmdTraceRaysIndirectKHR)(
5883     VkCommandBuffer                             commandBuffer,
5884     const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
5885     const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
5886     const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
5887     const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
5888     VkDeviceAddress                             indirectDeviceAddress)
5889 {
5890    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5891 
5892    cmd_buffer_trace_rays(cmd_buffer,
5893                          pRaygenShaderBindingTable,
5894                          pMissShaderBindingTable,
5895                          pHitShaderBindingTable,
5896                          pCallableShaderBindingTable,
5897                          true /* is_indirect */,
5898                          0, 0, 0, /* width, height, depth, */
5899                          indirectDeviceAddress);
5900 }
5901 #endif /* GFX_VERx10 >= 125 */
5902 
5903 static void
5904 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
5905                             uint32_t pipeline)
5906 {
5907    UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
5908 
5909    if (cmd_buffer->state.current_pipeline == pipeline)
5910       return;
5911 
5912 #if GFX_VER >= 8 && GFX_VER < 10
5913    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
5914     *
5915     *   Software must clear the COLOR_CALC_STATE Valid field in
5916     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
5917     *   with Pipeline Select set to GPGPU.
5918     *
5919     * The internal hardware docs recommend the same workaround for Gfx9
5920     * hardware too.
5921     */
5922    if (pipeline == GPGPU)
5923       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
5924 #endif
5925 
5926 #if GFX_VER == 9
5927    if (pipeline == _3D) {
5928       /* There is a mid-object preemption workaround which requires you to
5929        * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D.  However,
5930        * even without preemption, we have issues with geometry flickering when
5931        * GPGPU and 3D are back-to-back and this seems to fix it.  We don't
5932        * really know why.
5933        */
5934       anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
5935          vfe.MaximumNumberofThreads =
5936             devinfo->max_cs_threads * devinfo->subslice_total - 1;
5937          vfe.NumberofURBEntries     = 2;
5938          vfe.URBEntryAllocationSize = 2;
5939       }
5940 
5941       /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
5942        * invalid. Set the compute pipeline to dirty to force a re-emit of the
5943        * pipeline in case we get back-to-back dispatch calls with the same
5944        * pipeline and a PIPELINE_SELECT in between.
5945        */
5946       cmd_buffer->state.compute.pipeline_dirty = true;
5947    }
5948 #endif
5949 
5950    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
5951     * PIPELINE_SELECT [DevBWR+]":
5952     *
5953     *   Project: DEVSNB+
5954     *
5955     *   Software must ensure all the write caches are flushed through a
5956     *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
5957     *   command to invalidate read only caches prior to programming
5958     *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
5959     *
5960     * Note that cmd_buffer_apply_pipe_flushes will split this into two
5961     * PIPE_CONTROLs.
5962     */
5963    anv_add_pending_pipe_bits(cmd_buffer,
5964                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
5965                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
5966                              ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
5967                              ANV_PIPE_CS_STALL_BIT |
5968                              ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5969                              ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
5970                              ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
5971                              ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT,
5972                              "flush and invalidate for PIPELINE_SELECT");
5973    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5974 
5975    anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
5976 #if GFX_VER >= 9
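      /* MaskBits write-enables the fields below: 0x3 covers PipelineSelection
       * and 0x13 additionally covers MediaSamplerDOPClockGateEnable on
       * Gfx12+.
       */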
5977       ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
5978       ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
5979 #endif
5980       ps.PipelineSelection = pipeline;
5981    }
5982 
5983 #if GFX_VER == 9
5984    if (devinfo->platform == INTEL_PLATFORM_GLK) {
5985       /* Project: DevGLK
5986        *
5987        * "This chicken bit works around a hardware issue with barrier logic
5988        *  encountered when switching between GPGPU and 3D pipelines.  To
5989        *  workaround the issue, this mode bit should be set after a pipeline
5990        *  is selected."
5991        */
5992       anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
5993          scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
5994                                                   : GLK_BARRIER_MODE_3D_HULL;
5995          scec1.GLKBarrierModeMask = 1;
5996       }
5997    }
5998 #endif
5999 
6000    cmd_buffer->state.current_pipeline = pipeline;
6001 }
6002 
6003 void
6004 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
6005 {
6006    genX(flush_pipeline_select)(cmd_buffer, _3D);
6007 }
6008 
6009 void
6010 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
6011 {
6012    genX(flush_pipeline_select)(cmd_buffer, GPGPU);
6013 }
6014 
6015 void
6016 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
6017 {
6018    if (GFX_VER >= 8)
6019       return;
6020 
6021    /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
6022     *
6023     *    "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
6024     *    combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
6025     *    3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
6026     *    issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
6027     *    set), followed by a pipelined depth cache flush (PIPE_CONTROL with
6028     *    Depth Flush Bit set, followed by another pipelined depth stall
6029     *    (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
6030     *    guarantee that the pipeline from WM onwards is already flushed (e.g.,
6031     *    via a preceding MI_FLUSH)."
6032     */
6033    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6034       pipe.DepthStallEnable = true;
6035       anv_debug_dump_pc(pipe);
6036    }
6037    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6038       pipe.DepthCacheFlushEnable = true;
6039 #if GFX_VER >= 12
6040       pipe.TileCacheFlushEnable = true;
6041 #endif
6042       anv_debug_dump_pc(pipe);
6043    }
6044    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6045       pipe.DepthStallEnable = true;
6046       anv_debug_dump_pc(pipe);
6047    }
6048 }
6049 
6050 void
6051 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
6052                                      const struct isl_surf *surf)
6053 {
6054 #if GFX_VERx10 == 120
6055    const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
6056                                surf->samples == 1;
6057 
6058    switch (cmd_buffer->state.depth_reg_mode) {
6059    case ANV_DEPTH_REG_MODE_HW_DEFAULT:
6060       if (!is_d16_1x_msaa)
6061          return;
6062       break;
6063    case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
6064       if (is_d16_1x_msaa)
6065          return;
6066       break;
6067    case ANV_DEPTH_REG_MODE_UNKNOWN:
6068       break;
6069    }
6070 
6071    /* We'll change some CHICKEN registers depending on the depth surface
6072     * format. Do a depth flush and stall so the pipeline is not using these
6073     * settings while we change the registers.
6074     */
6075    anv_add_pending_pipe_bits(cmd_buffer,
6076                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
6077                              ANV_PIPE_DEPTH_STALL_BIT |
6078                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
6079                              "Workaround: Stop pipeline for 14010455700");
6080    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6081 
6082    /* Wa_14010455700
6083     *
6084     * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6085     * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
6086     */
6087    anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6088       reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
6089       reg.HIZPlaneOptimizationdisablebitMask = true;
6090    }
6091 
6092    cmd_buffer->state.depth_reg_mode =
6093       is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
6094                        ANV_DEPTH_REG_MODE_HW_DEFAULT;
6095 #endif
6096 }
6097 
6098 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
6099  *
6100  *    "The VF cache needs to be invalidated before binding and then using
6101  *    Vertex Buffers that overlap with any previously bound Vertex Buffer
6102  *    (at a 64B granularity) since the last invalidation.  A VF cache
6103  *    invalidate is performed by setting the "VF Cache Invalidation Enable"
6104  *    bit in PIPE_CONTROL."
6105  *
6106  * This is implemented by carefully tracking all vertex and index buffer
6107  * bindings and flushing if the cache would ever end up containing a range
6108  * that exceeds 4 GiB.  The tracking works in three parts:
6109  *
6110  *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
6111  *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
6112  *       tracking code of the new binding.  If this new binding would cause
6113  *       the cache to have a too-large range on the next draw call, a pipeline
6114  *       stall and VF cache invalidate are added to pending_pipeline_bits.
6115  *
6116  *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
6117  *       empty whenever we emit a VF invalidate.
6118  *
6119  *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
6120  *       after every 3DPRIMITIVE and copies the bound range into the dirty
6121  *       range for each used buffer.  This has to be a separate step because
6122  *       we don't always re-bind all buffers and so 1. can't know which
6123  *       buffers are actually bound.
6124  */
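/* A rough sketch of the expected call pattern around a draw (illustrative
 * only; the real call sites live in the vertex-buffer and draw emit paths):
 *
 *    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb_index,
 *                                                   vb_address, vb_size);
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);  // may invalidate and
 *                                                      // reset the tracking
 *    ... emit 3DPRIMITIVE ...
 *    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
 *                                                        access_type, vb_used);
 */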
6125 void
6126 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
6127                                                int vb_index,
6128                                                struct anv_address vb_address,
6129                                                uint32_t vb_size)
6130 {
6131    if (GFX_VER < 8 || GFX_VER > 9 ||
6132        anv_use_relocations(cmd_buffer->device->physical))
6133       return;
6134 
6135    struct anv_vb_cache_range *bound, *dirty;
6136    if (vb_index == -1) {
6137       bound = &cmd_buffer->state.gfx.ib_bound_range;
6138       dirty = &cmd_buffer->state.gfx.ib_dirty_range;
6139    } else {
6140       assert(vb_index >= 0);
6141       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
6142       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
6143       bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
6144       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
6145    }
6146 
6147    if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
6148                                                   vb_address,
6149                                                   vb_size)) {
6150       anv_add_pending_pipe_bits(cmd_buffer,
6151                                 ANV_PIPE_CS_STALL_BIT |
6152                                 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
6153                                 "vb > 32b range");
6154    }
6155 }
6156 
6157 void
6158 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
6159                                                     uint32_t access_type,
6160                                                     uint64_t vb_used)
6161 {
6162    if (GFX_VER < 8 || GFX_VER > 9 ||
6163        anv_use_relocations(cmd_buffer->device->physical))
6164       return;
6165 
6166    if (access_type == RANDOM) {
6167       /* We have an index buffer */
6168       struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
6169       struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
6170 
6171       if (bound->end > bound->start) {
6172          dirty->start = MIN2(dirty->start, bound->start);
6173          dirty->end = MAX2(dirty->end, bound->end);
6174       }
6175    }
6176 
6177    uint64_t mask = vb_used;
6178    while (mask) {
6179       int i = u_bit_scan64(&mask);
6180       assert(i >= 0);
6181       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
6182       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
6183 
6184       struct anv_vb_cache_range *bound, *dirty;
6185       bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
6186       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
6187 
6188       if (bound->end > bound->start) {
6189          dirty->start = MIN2(dirty->start, bound->start);
6190          dirty->end = MAX2(dirty->end, bound->end);
6191       }
6192    }
6193 }
6194 
6195 /**
6196  * Update the pixel hashing modes that determine the balancing of PS threads
6197  * across subslices and slices.
6198  *
6199  * \param width Width bound of the rendering area (already scaled down if \p
6200  *              scale is greater than 1).
6201  * \param height Height bound of the rendering area (already scaled down if \p
6202  *               scale is greater than 1).
6203  * \param scale The number of framebuffer samples that could potentially be
6204  *              affected by an individual channel of the PS thread.  This is
6205  *              typically one for single-sampled rendering, but for operations
6206  *              like CCS resolves and fast clears a single PS invocation may
6207  *              update a huge number of pixels, in which case a finer
6208  *              balancing is desirable in order to maximally utilize the
6209  *              bandwidth available.  UINT_MAX can be used as shorthand for
6210  *              "finest hashing mode available".
6211  */
6212 void
6213 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
6214                                    unsigned width, unsigned height,
6215                                    unsigned scale)
6216 {
6217 #if GFX_VER == 9
6218    const struct intel_device_info *devinfo = &cmd_buffer->device->info;
6219    const unsigned slice_hashing[] = {
6220       /* Because all Gfx9 platforms with more than one slice require
6221        * three-way subslice hashing, a single "normal" 16x16 slice hashing
6222        * block is guaranteed to suffer from substantial imbalance, with one
6223        * subslice receiving twice as much work as the other two in the
6224        * slice.
6225        *
6226        * The performance impact of that would be particularly severe when
6227        * three-way hashing is also in use for slice balancing (which is the
6228        * case for all Gfx9 GT4 platforms), because one of the slices
6229        * receives one every three 16x16 blocks in either direction, which
6230        * is roughly the periodicity of the underlying subslice imbalance
6231        * pattern ("roughly" because in reality the hardware's
6232        * implementation of three-way hashing doesn't do exact modulo 3
6233        * arithmetic, which somewhat decreases the magnitude of this effect
6234        * in practice).  This leads to a systematic subslice imbalance
6235        * within that slice regardless of the size of the primitive.  The
6236        * 32x32 hashing mode guarantees that the subslice imbalance within a
6237        * single slice hashing block is minimal, largely eliminating this
6238        * effect.
6239        */
6240       _32x32,
6241       /* Finest slice hashing mode available. */
6242       NORMAL
6243    };
6244    const unsigned subslice_hashing[] = {
6245       /* 16x16 would provide a slight cache locality benefit especially
6246        * visible in the sampler L1 cache efficiency of low-bandwidth
6247        * non-LLC platforms, but it comes at the cost of greater subslice
6248        * imbalance for primitives of dimensions approximately intermediate
6249        * between 16x4 and 16x16.
6250        */
6251       _16x4,
6252       /* Finest subslice hashing mode available. */
6253       _8x4
6254    };
6255    /* Dimensions of the smallest hashing block of a given hashing mode.  If
6256     * the rendering area is smaller than this there can't possibly be any
6257     * benefit from switching to this mode, so we optimize out the
6258     * transition.
6259     */
6260    const unsigned min_size[][2] = {
6261          { 16, 4 },
6262          { 8, 4 }
6263    };
6264    const unsigned idx = scale > 1;
6265 
6266    if (cmd_buffer->state.current_hash_scale != scale &&
6267        (width > min_size[idx][0] || height > min_size[idx][1])) {
6268       anv_add_pending_pipe_bits(cmd_buffer,
6269                                 ANV_PIPE_CS_STALL_BIT |
6270                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6271                                 "change pixel hash mode");
6272       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6273 
6274       anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
6275          gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
6276          gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
6277          gt.SubsliceHashing = subslice_hashing[idx];
6278          gt.SubsliceHashingMask = -1;
6279       }
6280 
6281       cmd_buffer->state.current_hash_scale = scale;
6282    }
6283 #endif
6284 }
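/* Typical usage (a sketch based on the parameter description above): regular
 * rendering passes scale == 1 with the render-area bounds, while operations
 * like fast clears or CCS resolves may pass a larger scale (or UINT_MAX for
 * the finest hashing mode available) along with the scaled-down dimensions.
 */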
6285 
6286 static void
6287 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
6288 {
6289    struct anv_device *device = cmd_buffer->device;
6290    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6291 
6292    /* FIXME: Width and Height are wrong */
6293 
6294    genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
6295 
6296    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
6297                                         device->isl_dev.ds.size / 4);
6298    if (dw == NULL)
6299       return;
6300 
6301    struct isl_view isl_view = {};
6302    struct isl_depth_stencil_hiz_emit_info info = {
6303       .view = &isl_view,
6304       .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
6305    };
6306 
6307    if (gfx->depth_att.iview != NULL) {
6308       isl_view = gfx->depth_att.iview->planes[0].isl;
6309    } else if (gfx->stencil_att.iview != NULL) {
6310       isl_view = gfx->stencil_att.iview->planes[0].isl;
6311    }
6312 
6313    if (gfx->view_mask) {
6314       assert(isl_view.array_len == 0 ||
6315              isl_view.array_len >= util_last_bit(gfx->view_mask));
6316       isl_view.array_len = util_last_bit(gfx->view_mask);
6317    } else {
6318       assert(isl_view.array_len == 0 ||
6319              isl_view.array_len >= util_last_bit(gfx->layer_count));
6320       isl_view.array_len = gfx->layer_count;
6321    }
6322 
6323    if (gfx->depth_att.iview != NULL) {
6324       const struct anv_image_view *iview = gfx->depth_att.iview;
6325       const struct anv_image *image = iview->image;
6326 
6327       const uint32_t depth_plane =
6328          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
6329       const struct anv_surface *depth_surface =
6330          &image->planes[depth_plane].primary_surface;
6331       const struct anv_address depth_address =
6332          anv_image_address(image, &depth_surface->memory_range);
6333 
6334       info.depth_surf = &depth_surface->isl;
6335 
6336       info.depth_address =
6337          anv_batch_emit_reloc(&cmd_buffer->batch,
6338                               dw + device->isl_dev.ds.depth_offset / 4,
6339                               depth_address.bo, depth_address.offset);
6340       info.mocs =
6341          anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
6342 
6343       info.hiz_usage = gfx->depth_att.aux_usage;
6344       if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
6345          assert(isl_aux_usage_has_hiz(info.hiz_usage));
6346 
6347          const struct anv_surface *hiz_surface =
6348             &image->planes[depth_plane].aux_surface;
6349          const struct anv_address hiz_address =
6350             anv_image_address(image, &hiz_surface->memory_range);
6351 
6352          info.hiz_surf = &hiz_surface->isl;
6353 
6354          info.hiz_address =
6355             anv_batch_emit_reloc(&cmd_buffer->batch,
6356                                  dw + device->isl_dev.ds.hiz_offset / 4,
6357                                  hiz_address.bo, hiz_address.offset);
6358 
6359          info.depth_clear_value = ANV_HZ_FC_VAL;
6360       }
6361    }
6362 
6363    if (gfx->stencil_att.iview != NULL) {
6364       const struct anv_image_view *iview = gfx->stencil_att.iview;
6365       const struct anv_image *image = iview->image;
6366 
6367       const uint32_t stencil_plane =
6368          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
6369       const struct anv_surface *stencil_surface =
6370          &image->planes[stencil_plane].primary_surface;
6371       const struct anv_address stencil_address =
6372          anv_image_address(image, &stencil_surface->memory_range);
6373 
6374       info.stencil_surf = &stencil_surface->isl;
6375 
6376       info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
6377       info.stencil_address =
6378          anv_batch_emit_reloc(&cmd_buffer->batch,
6379                               dw + device->isl_dev.ds.stencil_offset / 4,
6380                               stencil_address.bo, stencil_address.offset);
6381       info.mocs =
6382          anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
6383    }
6384 
6385    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
6386 
6387    if (info.depth_surf)
6388       genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
6389 
6390    if (GFX_VER >= 12) {
6391       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6392       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6393 
6394       /* Wa_1408224581
6395        *
6396       * Workaround (Gfx12LP A-step only): an additional pipe control with
6397       * post-sync = store dword operation would be required.  (The w/a is to
6398       * have an additional pipe control after the stencil state whenever
6399       * the surface state bits of this state change.)
6400        *
6401        * This also seems sufficient to handle Wa_14014148106.
6402        */
6403       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6404          pc.PostSyncOperation = WriteImmediateData;
6405          pc.Address = cmd_buffer->device->workaround_address;
6406       }
6407    }
6408    cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
6409 }
6410 
6411 static void
6412 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
6413                                    const struct anv_image_view *fsr_iview)
6414 {
6415 #if GFX_VERx10 >= 125
6416    struct anv_device *device = cmd_buffer->device;
6417 
6418    if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
6419       return;
6420 
6421    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
6422                                         device->isl_dev.cpb.size / 4);
6423    if (dw == NULL)
6424       return;
6425 
6426    struct isl_cpb_emit_info info = { };
6427 
6428    if (fsr_iview) {
6429       info.view = &fsr_iview->planes[0].isl;
6430       info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
6431       info.address =
6432          anv_batch_emit_reloc(&cmd_buffer->batch,
6433                               dw + device->isl_dev.cpb.offset / 4,
6434                               fsr_iview->image->bindings[0].address.bo,
6435                               fsr_iview->image->bindings[0].address.offset +
6436                               fsr_iview->image->bindings[0].memory_range.offset);
6437       info.mocs =
6438          anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
6439                   ISL_SURF_USAGE_CPB_BIT);
6440    }
6441 
6442    isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
6443 #endif /* GFX_VERx10 >= 125 */
6444 }
6445 
6446 static VkImageLayout
6447 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
6448 {
6449    const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
6450       vk_find_struct_const(att->pNext,
6451                            RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
6452    if (layout_info != NULL)
6453       return layout_info->initialLayout;
6454 
6455    return att->imageLayout;
6456 }
6457 
6458 void genX(CmdBeginRendering)(
6459     VkCommandBuffer                             commandBuffer,
6460     const VkRenderingInfo*                      pRenderingInfo)
6461 {
6462    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6463    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6464    VkResult result;
6465 
6466    if (!is_render_queue_cmd_buffer(cmd_buffer)) {
6467       assert(!"Trying to start a render pass on non-render queue!");
6468       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
6469       return;
6470    }
6471 
6472    anv_measure_beginrenderpass(cmd_buffer);
6473    trace_intel_begin_render_pass(&cmd_buffer->trace);
6474 
6475    gfx->rendering_flags = pRenderingInfo->flags;
6476    gfx->render_area = pRenderingInfo->renderArea;
6477    gfx->view_mask = pRenderingInfo->viewMask;
6478    gfx->layer_count = pRenderingInfo->layerCount;
6479    gfx->samples = 0;
6480 
6481    const bool is_multiview = gfx->view_mask != 0;
6482    const VkRect2D render_area = gfx->render_area;
6483    const uint32_t layers =
6484       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
6485 
6486    /* The framebuffer size is at least large enough to contain the render
6487     * area.  Because a zero renderArea is possible, we MAX with 1.
6488     */
6489    struct isl_extent3d fb_size = {
6490       .w = MAX2(1, render_area.offset.x + render_area.extent.width),
6491       .h = MAX2(1, render_area.offset.y + render_area.extent.height),
6492       .d = layers,
6493    };
6494 
6495    const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
6496    result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
6497    if (result != VK_SUCCESS)
6498       return;
6499 
6500    genX(flush_pipeline_select_3d)(cmd_buffer);
6501 
6502    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
6503       if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
6504          continue;
6505 
6506       const VkRenderingAttachmentInfo *att =
6507          &pRenderingInfo->pColorAttachments[i];
6508       ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
6509       const VkImageLayout initial_layout = attachment_initial_layout(att);
6510 
6511       assert(render_area.offset.x + render_area.extent.width <=
6512              iview->vk.extent.width);
6513       assert(render_area.offset.y + render_area.extent.height <=
6514              iview->vk.extent.height);
6515       assert(layers <= iview->vk.layer_count);
6516 
6517       fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
6518       fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
6519 
6520       assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
6521       gfx->samples |= iview->vk.image->samples;
6522 
6523       enum isl_aux_usage aux_usage =
6524          anv_layout_to_aux_usage(&cmd_buffer->device->info,
6525                                  iview->image,
6526                                  VK_IMAGE_ASPECT_COLOR_BIT,
6527                                  VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
6528                                  att->imageLayout);
6529 
6530       union isl_color_value fast_clear_color = { .u32 = { 0, } };
6531 
6532       if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6533           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
6534          const union isl_color_value clear_color =
6535             vk_to_isl_color_with_format(att->clearValue.color,
6536                                         iview->planes[0].isl.format);
6537 
6538          /* We only support fast-clears on the first layer */
6539          const bool fast_clear =
6540             (!is_multiview || (gfx->view_mask & 1)) &&
6541             anv_can_fast_clear_color_view(cmd_buffer->device, iview,
6542                                           att->imageLayout, clear_color,
6543                                           layers, render_area);
6544 
6545          if (att->imageLayout != initial_layout) {
6546             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6547                    render_area.extent.width == iview->vk.extent.width &&
6548                    render_area.extent.height == iview->vk.extent.height);
6549             if (is_multiview) {
6550                u_foreach_bit(view, gfx->view_mask) {
6551                   transition_color_buffer(cmd_buffer, iview->image,
6552                                           VK_IMAGE_ASPECT_COLOR_BIT,
6553                                           iview->vk.base_mip_level, 1,
6554                                           iview->vk.base_array_layer + view,
6555                                           1, /* layer_count */
6556                                           initial_layout, att->imageLayout,
6557                                           VK_QUEUE_FAMILY_IGNORED,
6558                                           VK_QUEUE_FAMILY_IGNORED,
6559                                           fast_clear);
6560                }
6561             } else {
6562                transition_color_buffer(cmd_buffer, iview->image,
6563                                        VK_IMAGE_ASPECT_COLOR_BIT,
6564                                        iview->vk.base_mip_level, 1,
6565                                        iview->vk.base_array_layer,
6566                                        gfx->layer_count,
6567                                        initial_layout, att->imageLayout,
6568                                        VK_QUEUE_FAMILY_IGNORED,
6569                                        VK_QUEUE_FAMILY_IGNORED,
6570                                        fast_clear);
6571             }
6572          }
6573 
6574          uint32_t clear_view_mask = pRenderingInfo->viewMask;
6575          uint32_t base_clear_layer = iview->vk.base_array_layer;
6576          uint32_t clear_layer_count = gfx->layer_count;
6577          if (fast_clear) {
6578             /* We only support fast-clears on the first layer */
6579             assert(iview->vk.base_mip_level == 0 &&
6580                    iview->vk.base_array_layer == 0);
6581 
6582             fast_clear_color = clear_color;
6583 
6584             if (iview->image->vk.samples == 1) {
6585                anv_image_ccs_op(cmd_buffer, iview->image,
6586                                 iview->planes[0].isl.format,
6587                                 iview->planes[0].isl.swizzle,
6588                                 VK_IMAGE_ASPECT_COLOR_BIT,
6589                                 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
6590                                 &fast_clear_color,
6591                                 false);
6592             } else {
6593                anv_image_mcs_op(cmd_buffer, iview->image,
6594                                 iview->planes[0].isl.format,
6595                                 iview->planes[0].isl.swizzle,
6596                                 VK_IMAGE_ASPECT_COLOR_BIT,
6597                                 0, 1, ISL_AUX_OP_FAST_CLEAR,
6598                                 &fast_clear_color,
6599                                 false);
6600             }
6601             clear_view_mask &= ~1u;
6602             base_clear_layer++;
6603             clear_layer_count--;
6604 
6605             if (isl_color_value_is_zero(clear_color,
6606                                         iview->planes[0].isl.format)) {
6607                /* This image has the auxiliary buffer enabled. We can mark the
6608                 * subresource as not needing a resolve because the clear color
6609                 * will match what's in every RENDER_SURFACE_STATE object when
6610                 * it's being used for sampling.
6611                 */
6612                set_image_fast_clear_state(cmd_buffer, iview->image,
6613                                           VK_IMAGE_ASPECT_COLOR_BIT,
6614                                           ANV_FAST_CLEAR_DEFAULT_VALUE);
6615             } else {
6616                set_image_fast_clear_state(cmd_buffer, iview->image,
6617                                           VK_IMAGE_ASPECT_COLOR_BIT,
6618                                           ANV_FAST_CLEAR_ANY);
6619             }
6620          }
6621 
6622          if (is_multiview) {
6623             u_foreach_bit(view, clear_view_mask) {
6624                anv_image_clear_color(cmd_buffer, iview->image,
6625                                      VK_IMAGE_ASPECT_COLOR_BIT,
6626                                      aux_usage,
6627                                      iview->planes[0].isl.format,
6628                                      iview->planes[0].isl.swizzle,
6629                                      iview->vk.base_mip_level,
6630                                      iview->vk.base_array_layer + view, 1,
6631                                      render_area, clear_color);
6632             }
6633          } else {
6634             anv_image_clear_color(cmd_buffer, iview->image,
6635                                   VK_IMAGE_ASPECT_COLOR_BIT,
6636                                   aux_usage,
6637                                   iview->planes[0].isl.format,
6638                                   iview->planes[0].isl.swizzle,
6639                                   iview->vk.base_mip_level,
6640                                   base_clear_layer, clear_layer_count,
6641                                   render_area, clear_color);
6642          }
6643       } else {
6644          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
6645          assert(att->imageLayout == initial_layout);
6646       }
6647 
6648       gfx->color_att[i].vk_format = iview->vk.format;
6649       gfx->color_att[i].iview = iview;
6650       gfx->color_att[i].layout = att->imageLayout;
6651       gfx->color_att[i].aux_usage = aux_usage;
6652 
6653       struct isl_view isl_view = iview->planes[0].isl;
6654       if (pRenderingInfo->viewMask) {
6655          assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
6656          isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
6657       } else {
6658          assert(isl_view.array_len >= pRenderingInfo->layerCount);
6659          isl_view.array_len = pRenderingInfo->layerCount;
6660       }
6661 
6662       anv_image_fill_surface_state(cmd_buffer->device,
6663                                    iview->image,
6664                                    VK_IMAGE_ASPECT_COLOR_BIT,
6665                                    &isl_view,
6666                                    ISL_SURF_USAGE_RENDER_TARGET_BIT,
6667                                    aux_usage, &fast_clear_color,
6668                                    0, /* anv_image_view_state_flags */
6669                                    &gfx->color_att[i].surface_state,
6670                                    NULL);
6671 
6672       add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state);
6673 
6674       if (GFX_VER < 10 &&
6675           (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
6676            (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
6677           iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
6678           iview->planes[0].isl.base_level == 0 &&
6679           iview->planes[0].isl.base_array_layer == 0) {
6680          genX(copy_fast_clear_dwords)(cmd_buffer,
6681                                       gfx->color_att[i].surface_state.state,
6682                                       iview->image,
6683                                       VK_IMAGE_ASPECT_COLOR_BIT,
6684                                       false /* copy to ss */);
6685       }
6686 
6687       if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
6688          gfx->color_att[i].resolve_mode = att->resolveMode;
6689          gfx->color_att[i].resolve_iview =
6690             anv_image_view_from_handle(att->resolveImageView);
6691          gfx->color_att[i].resolve_layout = att->resolveImageLayout;
6692       }
6693    }
6694 
6695    const struct anv_image_view *fsr_iview = NULL;
6696    const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
6697       vk_find_struct_const(pRenderingInfo->pNext,
6698                            RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
6699    if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
6700       fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
6701       /* imageLayout and shadingRateAttachmentTexelSize are ignored */
6702    }
6703 
6704    const struct anv_image_view *ds_iview = NULL;
6705    const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
6706    const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
6707    if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
6708        (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
6709       const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
6710       VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6711       VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6712       VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6713       VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6714       enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
6715       enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
6716       float depth_clear_value = 0;
6717       uint32_t stencil_clear_value = 0;
6718 
6719       if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
6720          d_iview = anv_image_view_from_handle(d_att->imageView);
6721          initial_depth_layout = attachment_initial_layout(d_att);
6722          depth_layout = d_att->imageLayout;
6723          depth_aux_usage =
6724             anv_layout_to_aux_usage(&cmd_buffer->device->info,
6725                                     d_iview->image,
6726                                     VK_IMAGE_ASPECT_DEPTH_BIT,
6727                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6728                                     depth_layout);
6729          depth_clear_value = d_att->clearValue.depthStencil.depth;
6730       }
6731 
6732       if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
6733          s_iview = anv_image_view_from_handle(s_att->imageView);
6734          initial_stencil_layout = attachment_initial_layout(s_att);
6735          stencil_layout = s_att->imageLayout;
6736          stencil_aux_usage =
6737             anv_layout_to_aux_usage(&cmd_buffer->device->info,
6738                                     s_iview->image,
6739                                     VK_IMAGE_ASPECT_STENCIL_BIT,
6740                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6741                                     stencil_layout);
6742          stencil_clear_value = s_att->clearValue.depthStencil.stencil;
6743       }
6744 
6745       assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
6746       ds_iview = d_iview != NULL ? d_iview : s_iview;
6747       assert(ds_iview != NULL);
6748 
6749       assert(render_area.offset.x + render_area.extent.width <=
6750              ds_iview->vk.extent.width);
6751       assert(render_area.offset.y + render_area.extent.height <=
6752              ds_iview->vk.extent.height);
6753       assert(layers <= ds_iview->vk.layer_count);
6754 
6755       fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
6756       fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
6757 
6758       assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
6759       gfx->samples |= ds_iview->vk.image->samples;
6760 
6761       VkImageAspectFlags clear_aspects = 0;
6762       if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6763           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
6764          clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
6765       if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6766           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
6767          clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
6768 
6769       if (clear_aspects != 0) {
6770          const bool hiz_clear =
6771             anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
6772                                       depth_layout, clear_aspects,
6773                                       depth_clear_value,
6774                                       render_area);
6775 
6776          if (depth_layout != initial_depth_layout) {
6777             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6778                    render_area.extent.width == d_iview->vk.extent.width &&
6779                    render_area.extent.height == d_iview->vk.extent.height);
6780 
6781             if (is_multiview) {
6782                u_foreach_bit(view, gfx->view_mask) {
6783                   transition_depth_buffer(cmd_buffer, d_iview->image,
6784                                           d_iview->vk.base_array_layer + view,
6785                                           1 /* layer_count */,
6786                                           initial_depth_layout, depth_layout,
6787                                           hiz_clear);
6788                }
6789             } else {
6790                transition_depth_buffer(cmd_buffer, d_iview->image,
6791                                        d_iview->vk.base_array_layer,
6792                                        gfx->layer_count,
6793                                        initial_depth_layout, depth_layout,
6794                                        hiz_clear);
6795             }
6796          }
6797 
6798          if (stencil_layout != initial_stencil_layout) {
6799             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6800                    render_area.extent.width == s_iview->vk.extent.width &&
6801                    render_area.extent.height == s_iview->vk.extent.height);
6802 
6803             if (is_multiview) {
6804                u_foreach_bit(view, gfx->view_mask) {
6805                   transition_stencil_buffer(cmd_buffer, s_iview->image,
6806                                             s_iview->vk.base_mip_level, 1,
6807                                             s_iview->vk.base_array_layer + view,
6808                                             1 /* layer_count */,
6809                                             initial_stencil_layout,
6810                                             stencil_layout,
6811                                             hiz_clear);
6812                }
6813             } else {
6814                transition_stencil_buffer(cmd_buffer, s_iview->image,
6815                                          s_iview->vk.base_mip_level, 1,
6816                                          s_iview->vk.base_array_layer,
6817                                          gfx->layer_count,
6818                                          initial_stencil_layout,
6819                                          stencil_layout,
6820                                          hiz_clear);
6821             }
6822          }
6823 
6824          if (is_multiview) {
6825             uint32_t clear_view_mask = pRenderingInfo->viewMask;
6826             while (clear_view_mask) {
6827                int view = u_bit_scan(&clear_view_mask);
6828 
6829                uint32_t level = ds_iview->vk.base_mip_level;
6830                uint32_t layer = ds_iview->vk.base_array_layer + view;
6831 
6832                if (hiz_clear) {
6833                   anv_image_hiz_clear(cmd_buffer, ds_iview->image,
6834                                       clear_aspects,
6835                                       level, layer, 1,
6836                                       render_area,
6837                                       stencil_clear_value);
6838                } else {
6839                   anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
6840                                                 clear_aspects,
6841                                                 depth_aux_usage,
6842                                                 level, layer, 1,
6843                                                 render_area,
6844                                                 depth_clear_value,
6845                                                 stencil_clear_value);
6846                }
6847             }
6848          } else {
6849             uint32_t level = ds_iview->vk.base_mip_level;
6850             uint32_t base_layer = ds_iview->vk.base_array_layer;
6851             uint32_t layer_count = gfx->layer_count;
6852 
6853             if (hiz_clear) {
6854                anv_image_hiz_clear(cmd_buffer, ds_iview->image,
6855                                    clear_aspects,
6856                                    level, base_layer, layer_count,
6857                                    render_area,
6858                                    stencil_clear_value);
6859             } else {
6860                anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
6861                                              clear_aspects,
6862                                              depth_aux_usage,
6863                                              level, base_layer, layer_count,
6864                                              render_area,
6865                                              depth_clear_value,
6866                                              stencil_clear_value);
6867             }
6868          }
6869       } else {
6870          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
6871          assert(depth_layout == initial_depth_layout);
6872          assert(stencil_layout == initial_stencil_layout);
6873       }
6874 
6875       if (d_iview != NULL) {
6876          gfx->depth_att.vk_format = d_iview->vk.format;
6877          gfx->depth_att.iview = d_iview;
6878          gfx->depth_att.layout = depth_layout;
6879          gfx->depth_att.aux_usage = depth_aux_usage;
6880          if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
6881             assert(d_att->resolveImageView != VK_NULL_HANDLE);
6882             gfx->depth_att.resolve_mode = d_att->resolveMode;
6883             gfx->depth_att.resolve_iview =
6884                anv_image_view_from_handle(d_att->resolveImageView);
6885             gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
6886          }
6887       }
6888 
6889       if (s_iview != NULL) {
6890          gfx->stencil_att.vk_format = s_iview->vk.format;
6891          gfx->stencil_att.iview = s_iview;
6892          gfx->stencil_att.layout = stencil_layout;
6893          gfx->stencil_att.aux_usage = stencil_aux_usage;
6894          if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
6895             assert(s_att->resolveImageView != VK_NULL_HANDLE);
6896             gfx->stencil_att.resolve_mode = s_att->resolveMode;
6897             gfx->stencil_att.resolve_iview =
6898                anv_image_view_from_handle(s_att->resolveImageView);
6899             gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
6900          }
6901       }
6902    }
6903 
6904    /* Finally, now that we know the right size, set up the null surface */
6905    assert(util_bitcount(gfx->samples) <= 1);
6906    isl_null_fill_state(&cmd_buffer->device->isl_dev,
6907                        gfx->null_surface_state.map,
6908                        .size = fb_size);
6909 
6910    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
6911       if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
6912          continue;
6913 
6914       isl_null_fill_state(&cmd_buffer->device->isl_dev,
6915                           gfx->color_att[i].surface_state.state.map,
6916                           .size = fb_size);
6917    }
6918 
6919    /****** We can now start emitting code to begin the render pass ******/
6920 
6921    gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
6922 
6923    /* Our implementation of VK_KHR_multiview uses instancing to draw the
6924     * different views.  If the client asks for instancing, we need to use the
6925     * Instance Data Step Rate to ensure that we repeat the client's
6926     * per-instance data once for each view.  Since this bit is in
6927     * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
6928     * of each subpass.
6929     */
6930    if (GFX_VER == 7)
6931       gfx->vb_dirty |= ~0;
6932 
6933    /* It is possible to start a render pass with an old pipeline.  Because the
6934     * render pass and subpass index are both baked into the pipeline, this is
6935     * highly unlikely.  In order to do so, it requires that you have a render
6936     * pass with a single subpass and that you use that render pass twice
6937     * back-to-back and use the same pipeline at the start of the second render
6938     * pass as at the end of the first.  In order to avoid unpredictable issues
6939     * with this edge case, we just dirty the pipeline at the start of every
6940     * subpass.
6941     */
6942    gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
6943 
6944 #if GFX_VER >= 11
6945    /* The PIPE_CONTROL command description says:
6946     *
6947     *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
6948     *     points to a different RENDER_SURFACE_STATE, SW must issue a Render
6949     *     Target Cache Flush by enabling this bit. When render target flush
6950     *     is set due to new association of BTI, PS Scoreboard Stall bit must
6951     *     be set in this packet."
6952     */
6953    anv_add_pending_pipe_bits(cmd_buffer,
6954                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
6955                              ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6956                              "change RT");
6957 #endif
6958 
6959    cmd_buffer_emit_depth_stencil(cmd_buffer);
6960 
6961    cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
6962 }
6963 
6964 static void
6965 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
6966                                    struct anv_attachment *att,
6967                                    VkImageAspectFlagBits aspect)
6968 {
6969    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6970    const struct anv_image_view *iview = att->iview;
6971 
6972    if (gfx->view_mask == 0) {
6973       genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6974                                           aspect, att->aux_usage,
6975                                           iview->planes[0].isl.base_level,
6976                                           iview->planes[0].isl.base_array_layer,
6977                                           gfx->layer_count);
6978    } else {
6979       uint32_t res_view_mask = gfx->view_mask;
6980       while (res_view_mask) {
6981          int i = u_bit_scan(&res_view_mask);
6982 
6983          const uint32_t level = iview->planes[0].isl.base_level;
6984          const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
6985 
6986          genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6987                                              aspect, att->aux_usage,
6988                                              level, layer, 1);
6989       }
6990    }
6991 }
6992 
6993 static enum blorp_filter
6994 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
6995 {
6996    switch (vk_mode) {
6997    case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
6998       return BLORP_FILTER_SAMPLE_0;
6999    case VK_RESOLVE_MODE_AVERAGE_BIT:
7000       return BLORP_FILTER_AVERAGE;
7001    case VK_RESOLVE_MODE_MIN_BIT:
7002       return BLORP_FILTER_MIN_SAMPLE;
7003    case VK_RESOLVE_MODE_MAX_BIT:
7004       return BLORP_FILTER_MAX_SAMPLE;
7005    default:
7006       return BLORP_FILTER_NONE;
7007    }
7008 }
7009 
7010 static void
7011 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
7012                                    const struct anv_attachment *att,
7013                                    VkImageLayout layout,
7014                                    VkImageAspectFlagBits aspect)
7015 {
7016    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
7017    const struct anv_image_view *src_iview = att->iview;
7018    const struct anv_image_view *dst_iview = att->resolve_iview;
7019 
7020    enum isl_aux_usage src_aux_usage =
7021       anv_layout_to_aux_usage(&cmd_buffer->device->info,
7022                               src_iview->image, aspect,
7023                               VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
7024                               layout);
7025 
7026    enum isl_aux_usage dst_aux_usage =
7027       anv_layout_to_aux_usage(&cmd_buffer->device->info,
7028                               dst_iview->image, aspect,
7029                               VK_IMAGE_USAGE_TRANSFER_DST_BIT,
7030                               att->resolve_layout);
7031 
7032    enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
7033 
7034    const VkRect2D render_area = gfx->render_area;
7035    if (gfx->view_mask == 0) {
7036       anv_image_msaa_resolve(cmd_buffer,
7037                              src_iview->image, src_aux_usage,
7038                              src_iview->planes[0].isl.base_level,
7039                              src_iview->planes[0].isl.base_array_layer,
7040                              dst_iview->image, dst_aux_usage,
7041                              dst_iview->planes[0].isl.base_level,
7042                              dst_iview->planes[0].isl.base_array_layer,
7043                              aspect,
7044                              render_area.offset.x, render_area.offset.y,
7045                              render_area.offset.x, render_area.offset.y,
7046                              render_area.extent.width,
7047                              render_area.extent.height,
7048                              gfx->layer_count, filter);
7049    } else {
7050       uint32_t res_view_mask = gfx->view_mask;
7051       while (res_view_mask) {
7052          int i = u_bit_scan(&res_view_mask);
7053 
7054          anv_image_msaa_resolve(cmd_buffer,
7055                                 src_iview->image, src_aux_usage,
7056                                 src_iview->planes[0].isl.base_level,
7057                                 src_iview->planes[0].isl.base_array_layer + i,
7058                                 dst_iview->image, dst_aux_usage,
7059                                 dst_iview->planes[0].isl.base_level,
7060                                 dst_iview->planes[0].isl.base_array_layer + i,
7061                                 aspect,
7062                                 render_area.offset.x, render_area.offset.y,
7063                                 render_area.offset.x, render_area.offset.y,
7064                                 render_area.extent.width,
7065                                 render_area.extent.height,
7066                                 1, filter);
7067       }
7068    }
7069 }
7070 
7071 void genX(CmdEndRendering)(
7072     VkCommandBuffer                             commandBuffer)
7073 {
7074    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7075    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
7076 
7077    if (anv_batch_has_error(&cmd_buffer->batch))
7078       return;
7079 
7080    const bool is_multiview = gfx->view_mask != 0;
7081    const uint32_t layers =
7082       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
7083 
7084    bool has_color_resolve = false;
7085    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
7086       if (gfx->color_att[i].iview == NULL)
7087          continue;
7088 
7089       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
7090                                          VK_IMAGE_ASPECT_COLOR_BIT);
7091 
7092       /* Stash this off for later */
7093       if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
7094           !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
7095          has_color_resolve = true;
7096    }
7097 
7098    if (gfx->depth_att.iview != NULL) {
7099       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
7100                                          VK_IMAGE_ASPECT_DEPTH_BIT);
7101    }
7102 
7103    if (gfx->stencil_att.iview != NULL) {
7104       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
7105                                          VK_IMAGE_ASPECT_STENCIL_BIT);
7106    }
7107 
7108    if (has_color_resolve) {
7109       /* We are about to do some MSAA resolves.  We need to flush so that the
7110        * results of writes to the MSAA color attachments show up in the sampler
7111        * when we blit to the single-sampled resolve target.
7112        */
7113       anv_add_pending_pipe_bits(cmd_buffer,
7114                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
7115                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
7116                                 "MSAA resolve");
7117    }
7118 
7119    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
7120        gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
7121       /* We are about to do some MSAA resolves.  We need to flush so that the
7122        * results of writes to the MSAA depth attachments show up in the sampler
7123        * when we blit to the single-sampled resolve target.
7124        */
7125       anv_add_pending_pipe_bits(cmd_buffer,
7126                               ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
7127                               ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
7128                               "MSAA resolve");
7129    }
7130 
7131    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
7132       const struct anv_attachment *att = &gfx->color_att[i];
7133       if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
7134           (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
7135          continue;
7136 
7137       cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
7138                                          VK_IMAGE_ASPECT_COLOR_BIT);
7139    }
7140 
7141    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
7142        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
7143       const struct anv_image_view *src_iview = gfx->depth_att.iview;
7144 
7145       /* MSAA resolves sample from the source attachment.  Transition the
7146        * depth attachment first to get rid of any HiZ that we may not be
7147        * able to handle.
7148        */
7149       transition_depth_buffer(cmd_buffer, src_iview->image,
7150                               src_iview->planes[0].isl.base_array_layer,
7151                               layers,
7152                               gfx->depth_att.layout,
7153                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7154                               false /* will_full_fast_clear */);
7155 
7156       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
7157                                          VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7158                                          VK_IMAGE_ASPECT_DEPTH_BIT);
7159 
7160       /* Transition the source back to the original layout.  This seems a bit
7161        * inefficient but, since HiZ resolves aren't destructive, going from
7162        * less HiZ to more is generally a no-op.
7163        */
7164       transition_depth_buffer(cmd_buffer, src_iview->image,
7165                               src_iview->planes[0].isl.base_array_layer,
7166                               layers,
7167                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7168                               gfx->depth_att.layout,
7169                               false /* will_full_fast_clear */);
7170    }
7171 
7172    if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
7173        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
7174       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
7175                                          gfx->stencil_att.layout,
7176                                          VK_IMAGE_ASPECT_STENCIL_BIT);
7177    }
7178 
7179 #if GFX_VER == 7
7180    /* On gfx7, we have to store a texturable version of the stencil buffer in
7181     * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
7182     * forth at strategic points. Stencil writes are only allowed in the
7183     * following layouts:
7184     *
7185     *  - VK_IMAGE_LAYOUT_GENERAL
7186     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
7187     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
7188     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
7189     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
7190     *  - VK_IMAGE_LAYOUT_SUBPASS_SELF_DEPENDENCY_MESA
7191     *
7192     * For general, we have no nice opportunity to transition so we do the copy
7193     * to the shadow unconditionally at the end of the subpass. For transfer
7194     * destinations, we can update it as part of the transfer op. For the other
7195     * layouts, we delay the copy until a transition into some other layout.
7196     */
7197    if (gfx->stencil_att.iview != NULL) {
7198       const struct anv_image_view *iview = gfx->stencil_att.iview;
7199       const struct anv_image *image = iview->image;
7200       const uint32_t plane =
7201          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
7202 
7203       if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
7204           (gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL ||
7205            gfx->stencil_att.layout == VK_IMAGE_LAYOUT_SUBPASS_SELF_DEPENDENCY_MESA)) {
7206          anv_image_copy_to_shadow(cmd_buffer, image,
7207                                   VK_IMAGE_ASPECT_STENCIL_BIT,
7208                                   iview->planes[plane].isl.base_level, 1,
7209                                   iview->planes[plane].isl.base_array_layer,
7210                                   layers);
7211       }
7212    }
7213 #endif
7214 
7215    trace_intel_end_render_pass(&cmd_buffer->trace,
7216                                gfx->render_area.extent.width,
7217                                gfx->render_area.extent.height,
7218                                gfx->color_att_count,
7219                                gfx->samples);
7220 
7221    anv_cmd_buffer_reset_rendering(cmd_buffer);
7222 }
7223 
7224 void
7225 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
7226 {
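   /* Descriptive note: with COMPARE_SRCS_EQUAL and LOAD_LOADINV below, the
    * MI_PREDICATE ends up enabled exactly when ANV_PREDICATE_RESULT_REG is
    * non-zero, i.e. when the value precomputed in
    * CmdBeginConditionalRenderingEXT says rendering should proceed.
    */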
7227 #if GFX_VERx10 >= 75
7228    struct mi_builder b;
7229    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7230 
7231    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
7232                 mi_reg32(ANV_PREDICATE_RESULT_REG));
7233    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
7234 
7235    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
7236       mip.LoadOperation    = LOAD_LOADINV;
7237       mip.CombineOperation = COMBINE_SET;
7238       mip.CompareOperation = COMPARE_SRCS_EQUAL;
7239    }
7240 #endif
7241 }
7242 
7243 #if GFX_VERx10 >= 75
7244 void genX(CmdBeginConditionalRenderingEXT)(
7245    VkCommandBuffer                             commandBuffer,
7246    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
7247 {
7248    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7249    ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
7250    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
7251    struct anv_address value_address =
7252       anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
7253 
7254    const bool isInverted = pConditionalRenderingBegin->flags &
7255                            VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
7256 
7257    cmd_state->conditional_render_enabled = true;
7258 
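   /* Note: apply any pending pipe flushes before the MI commands below,
    * presumably so the predicate value read from the buffer reflects prior
    * writes.
    */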
7259    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7260 
7261    struct mi_builder b;
7262    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7263 
7264    /* Section 19.4 of the Vulkan 1.1.85 spec says:
7265     *
7266     *    If the value of the predicate in buffer memory changes
7267     *    while conditional rendering is active, the rendering commands
7268     *    may be discarded in an implementation-dependent way.
7269     *    Some implementations may latch the value of the predicate
7270     *    upon beginning conditional rendering while others
7271     *    may read it before every rendering command.
7272     *
7273     * So it's perfectly fine to read a value from the buffer once.
7274     */
7275    struct mi_value value = mi_mem32(value_address);
7276 
7277    /* Precompute the predicate result; this is necessary to support secondary
7278     * command buffers, since it is unknown whether conditional rendering is
7279     * inverted when they are recorded.
7280     */
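   /* In effect (assuming mi_ult/mi_uge compute src0 < src1 / src0 >= src1):
    *    ANV_PREDICATE_RESULT_REG = isInverted ? (value == 0) : (value != 0)
    */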
7281    mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
7282                 isInverted ? mi_uge(&b, mi_imm(0), value) :
7283                              mi_ult(&b, mi_imm(0), value));
7284 }
7285 
7286 void genX(CmdEndConditionalRenderingEXT)(
7287     VkCommandBuffer                             commandBuffer)
7288 {
7289    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7290    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
7291 
7292    cmd_state->conditional_render_enabled = false;
7293 }
7294 #endif
7295 
7296 /* Set of stage bits that are pipelined, i.e. whose work gets queued
7297  * by the command streamer for later execution.
7298  */
7299 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
7300    ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
7301      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
7302      VK_PIPELINE_STAGE_2_HOST_BIT | \
7303      VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
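/* For example, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT and
 * VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT are not in the exclusion list above,
 * so they count as pipelined and make CmdSetEvent2/CmdResetEvent2 emit a CS
 * stall and pixel scoreboard stall before the post-sync write.
 */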
7304 
7305 void genX(CmdSetEvent2)(
7306     VkCommandBuffer                             commandBuffer,
7307     VkEvent                                     _event,
7308     const VkDependencyInfo*                     pDependencyInfo)
7309 {
7310    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7311    ANV_FROM_HANDLE(anv_event, event, _event);
7312 
7313    VkPipelineStageFlags2 src_stages = 0;
7314 
7315    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
7316       src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7317    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
7318       src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7319    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
7320       src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7321 
7322    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
7323    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7324 
7325    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
7326       if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
7327          pc.StallAtPixelScoreboard = true;
7328          pc.CommandStreamerStallEnable = true;
7329       }
7330 
7331       pc.DestinationAddressType  = DAT_PPGTT;
7332       pc.PostSyncOperation       = WriteImmediateData;
7333       pc.Address = (struct anv_address) {
7334          cmd_buffer->device->dynamic_state_pool.block_pool.bo,
7335          event->state.offset
7336       };
7337       pc.ImmediateData           = VK_EVENT_SET;
7338       anv_debug_dump_pc(pc);
7339    }
7340 }
7341 
7342 void genX(CmdResetEvent2)(
7343     VkCommandBuffer                             commandBuffer,
7344     VkEvent                                     _event,
7345     VkPipelineStageFlags2                       stageMask)
7346 {
7347    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7348    ANV_FROM_HANDLE(anv_event, event, _event);
7349 
7350    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
7351    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7352 
7353    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
7354       if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
7355          pc.StallAtPixelScoreboard = true;
7356          pc.CommandStreamerStallEnable = true;
7357       }
7358 
7359       pc.DestinationAddressType  = DAT_PPGTT;
7360       pc.PostSyncOperation       = WriteImmediateData;
7361       pc.Address = (struct anv_address) {
7362          cmd_buffer->device->dynamic_state_pool.block_pool.bo,
7363          event->state.offset
7364       };
7365       pc.ImmediateData           = VK_EVENT_RESET;
7366       anv_debug_dump_pc(pc);
7367    }
7368 }
7369 
7370 void genX(CmdWaitEvents2)(
7371     VkCommandBuffer                             commandBuffer,
7372     uint32_t                                    eventCount,
7373     const VkEvent*                              pEvents,
7374     const VkDependencyInfo*                     pDependencyInfos)
7375 {
7376    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7377 
7378 #if GFX_VER >= 8
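   /* Each anv_event is backed by a dword in the dynamic state pool.
    * CmdSetEvent2 writes VK_EVENT_SET to it with a post-sync PIPE_CONTROL, so
    * waiting amounts to polling that dword until it reads back as
    * VK_EVENT_SET.
    */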
7379    for (uint32_t i = 0; i < eventCount; i++) {
7380       ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
7381 
7382       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
7383          sem.WaitMode            = PollingMode;
7384          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
7385          sem.SemaphoreDataDword  = VK_EVENT_SET;
7386          sem.SemaphoreAddress = (struct anv_address) {
7387             cmd_buffer->device->dynamic_state_pool.block_pool.bo,
7388             event->state.offset
7389          };
7390       }
7391    }
7392 #else
7393    anv_finishme("Implement events on gfx7");
7394 #endif
7395 
7396    cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
7397 }
7398 
7399 static uint32_t vk_to_intel_index_type(VkIndexType type)
7400 {
7401    switch (type) {
7402    case VK_INDEX_TYPE_UINT8_EXT:
7403       return INDEX_BYTE;
7404    case VK_INDEX_TYPE_UINT16:
7405       return INDEX_WORD;
7406    case VK_INDEX_TYPE_UINT32:
7407       return INDEX_DWORD;
7408    default:
7409       unreachable("invalid index type");
7410    }
7411 }
7412 
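/* Primitive restart index for the bound index type: all ones for the index
 * width (0xff / 0xffff / 0xffffffff), used when primitive restart is enabled.
 */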
7413 static uint32_t restart_index_for_type(VkIndexType type)
7414 {
7415    switch (type) {
7416    case VK_INDEX_TYPE_UINT8_EXT:
7417       return UINT8_MAX;
7418    case VK_INDEX_TYPE_UINT16:
7419       return UINT16_MAX;
7420    case VK_INDEX_TYPE_UINT32:
7421       return UINT32_MAX;
7422    default:
7423       unreachable("invalid index type");
7424    }
7425 }
7426 
7427 void genX(CmdBindIndexBuffer)(
7428     VkCommandBuffer                             commandBuffer,
7429     VkBuffer                                    _buffer,
7430     VkDeviceSize                                offset,
7431     VkIndexType                                 indexType)
7432 {
7433    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7434    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
7435 
7436    cmd_buffer->state.gfx.restart_index = restart_index_for_type(indexType);
7437    cmd_buffer->state.gfx.index_buffer = buffer;
7438    cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
7439    cmd_buffer->state.gfx.index_offset = offset;
7440 
7441    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
7442 }
7443 
7444 VkResult genX(CmdSetPerformanceOverrideINTEL)(
7445     VkCommandBuffer                             commandBuffer,
7446     const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
7447 {
7448    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7449 
7450    switch (pOverrideInfo->type) {
7451    case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
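      /* Null-hardware override: set the 3D/media instruction-disable bits.
       * These are masked registers, so the corresponding *Mask fields must
       * also be set for the write to take effect.
       */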
7452 #if GFX_VER >= 9
7453       anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
7454          csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
7455          csdm2.MediaInstructionDisable = pOverrideInfo->enable;
7456          csdm2._3DRenderingInstructionDisableMask = true;
7457          csdm2.MediaInstructionDisableMask = true;
7458       }
7459 #else
7460       anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
7461          instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
7462          instpm.MediaInstructionDisable = pOverrideInfo->enable;
7463          instpm._3DRenderingInstructionDisableMask = true;
7464          instpm.MediaInstructionDisableMask = true;
7465       }
7466 #endif
7467       break;
7468    }
7469 
7470    case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
7471       if (pOverrideInfo->enable) {
7472          /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
7473          anv_add_pending_pipe_bits(cmd_buffer,
7474                                    ANV_PIPE_FLUSH_BITS |
7475                                    ANV_PIPE_INVALIDATE_BITS,
7476                                    "perf counter isolation");
7477          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7478       }
7479       break;
7480 
7481    default:
7482       unreachable("Invalid override");
7483    }
7484 
7485    return VK_SUCCESS;
7486 }
7487 
7488 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
7489     VkCommandBuffer                             commandBuffer,
7490     const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
7491 {
7492    /* TODO: Wait for the register write to complete; how to do so might depend on the generation. */
7493 
7494    return VK_SUCCESS;
7495 }
7496 
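/* MMIO offset of the command streamer TIMESTAMP register. */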
7497 #define TIMESTAMP 0x2358
7498 
7499 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
7500                               struct anv_device *device,
7501                               struct anv_address addr,
7502                               bool end_of_pipe) {
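   /* Two flavors: an end-of-pipe timestamp is requested with a PIPE_CONTROL
    * post-sync WriteTimestamp, while the other path copies the TIMESTAMP
    * register to memory directly from the command streamer via the MI
    * builder.
    */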
7503    if (end_of_pipe) {
7504       anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
7505          pc.PostSyncOperation   = WriteTimestamp;
7506          pc.Address             = addr;
7507          anv_debug_dump_pc(pc);
7508       }
7509    } else {
7510       struct mi_builder b;
7511       mi_builder_init(&b, &device->info, batch);
7512       mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
7513    }
7514 }
7515