1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_format.h"
30 #include "vk_render_pass.h"
31 #include "vk_util.h"
32 #include "util/fast_idiv_by_const.h"
33
34 #include "common/intel_aux_map.h"
35 #include "common/intel_l3_config.h"
36 #include "genxml/gen_macros.h"
37 #include "genxml/genX_pack.h"
38 #include "genxml/gen_rt_pack.h"
39 #include "common/intel_guardband.h"
40 #include "compiler/brw_prim.h"
41
42 #include "nir/nir_xfb_info.h"
43
44 #include "ds/intel_tracepoints.h"
45
46 /* We reserve:
47 * - GPR 14 for secondary command buffer returns
48 * - GPR 15 for conditional rendering
49 */
50 #define MI_BUILDER_NUM_ALLOC_GPRS 14
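/* The command streamer exposes 16 general-purpose registers (GPR 0-15);
 * with GPR 14/15 reserved as noted above, the MI builder may allocate
 * GPR 0-13 for its own temporaries.
 */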
51 #define __gen_get_batch_dwords anv_batch_emit_dwords
52 #define __gen_address_offset anv_address_add
53 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
54 #include "common/mi_builder.h"
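/* The three __gen_* macros above hook the generic MI builder up to this
 * file's batch helpers (dword emission, address arithmetic and batch
 * address queries) before mi_builder.h is included.
 */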
55
56 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
57 uint32_t pipeline);
58
59 static enum anv_pipe_bits
60 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
61 enum anv_pipe_bits bits = 0;
62 bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
63 bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
64 #if GFX_VERx10 >= 125
65 bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
66 #endif
67 #if GFX_VER >= 12
68 bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
69 bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
70 #endif
71 bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
72 bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
73 bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
74 bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
75 bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
76 bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
77 bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
78 bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
79 bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
80 return bits;
81 }
82
83 #define anv_debug_dump_pc(pc) \
84 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
85 fputs("pc: emit PC=( ", stderr); \
86 anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
87 fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
88 }
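/* Illustrative use, assuming PIPE_CONTROL debug output is enabled through
 * the usual INTEL_DEBUG environment variable (e.g. INTEL_DEBUG=pc):
 *
 *    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 *       pc.CommandStreamerStallEnable = true;
 *       anv_debug_dump_pc(pc);
 *    }
 *
 * This prints the decoded flush/invalidate bits plus the calling function.
 */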
89
90 static bool
91 is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
92 {
93 struct anv_queue_family *queue_family = cmd_buffer->queue_family;
94 return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
95 }
96
97 void
98 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
99 {
100 struct anv_device *device = cmd_buffer->device;
101 uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
102
103 /* If we are emitting a new state base address we probably need to re-emit
104 * binding tables.
105 */
106 cmd_buffer->state.descriptors_dirty |= ~0;
107
108 #if GFX_VERx10 >= 125
109 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
110 pc.CommandStreamerStallEnable = true;
111 anv_debug_dump_pc(pc);
112 }
113 anv_batch_emit(
114 &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
115 btpa.BindingTablePoolBaseAddress =
116 anv_cmd_buffer_surface_base_address(cmd_buffer);
117 btpa.BindingTablePoolBufferSize = BINDING_TABLE_POOL_BLOCK_SIZE / 4096;
118 btpa.MOCS = mocs;
119 }
120 #else /* GFX_VERx10 < 125 */
121 /* Emit a render target cache flush.
122 *
123 * This isn't documented anywhere in the PRM. However, it seems to be
124 * necessary prior to changing the surface state base address. Without
125 * this, we get GPU hangs when using multi-level command buffers which
126 * clear depth, reset state base address, and then go render stuff.
127 */
128 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
129 #if GFX_VER >= 12
130 pc.HDCPipelineFlushEnable = true;
131 #else
132 pc.DCFlushEnable = true;
133 #endif
134 pc.RenderTargetCacheFlushEnable = true;
135 pc.CommandStreamerStallEnable = true;
136 anv_debug_dump_pc(pc);
137 }
138
139 #if GFX_VERx10 == 120
140 /* Wa_1607854226:
141 *
142 * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
143 * mode by putting the pipeline temporarily in 3D mode.
144 */
145 uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
146 genX(flush_pipeline_select_3d)(cmd_buffer);
147 #endif
148
149 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
150 sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
151 sba.GeneralStateMOCS = mocs;
152 sba.GeneralStateBaseAddressModifyEnable = true;
153
154 sba.StatelessDataPortAccessMOCS = mocs;
155
156 sba.SurfaceStateBaseAddress =
157 anv_cmd_buffer_surface_base_address(cmd_buffer);
158 sba.SurfaceStateMOCS = mocs;
159 sba.SurfaceStateBaseAddressModifyEnable = true;
160
161 sba.DynamicStateBaseAddress =
162 (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
163 sba.DynamicStateMOCS = mocs;
164 sba.DynamicStateBaseAddressModifyEnable = true;
165
166 sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
167 sba.IndirectObjectMOCS = mocs;
168 sba.IndirectObjectBaseAddressModifyEnable = true;
169
170 sba.InstructionBaseAddress =
171 (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
172 sba.InstructionMOCS = mocs;
173 sba.InstructionBaseAddressModifyEnable = true;
174
175 # if (GFX_VER >= 8)
176 /* Broadwell requires that we specify a buffer size for a bunch of
177 * these fields. However, since we will be growing the BOs live, we
178 * just set them all to the maximum.
179 */
180 sba.GeneralStateBufferSize = 0xfffff;
181 sba.IndirectObjectBufferSize = 0xfffff;
182 if (anv_use_relocations(device->physical)) {
183 sba.DynamicStateBufferSize = 0xfffff;
184 sba.InstructionBufferSize = 0xfffff;
185 } else {
186 /* With softpin, we use fixed addresses so we actually know how big
187 * our base addresses are.
188 */
189 sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
190 sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
191 }
192 sba.GeneralStateBufferSizeModifyEnable = true;
193 sba.IndirectObjectBufferSizeModifyEnable = true;
194 sba.DynamicStateBufferSizeModifyEnable = true;
195 sba.InstructionBuffersizeModifyEnable = true;
196 # else
197 /* On gfx7, we have upper bounds instead. According to the docs,
198 * setting an upper bound of zero means that no bounds checking is
199 * performed so, in theory, we should be able to leave them zero.
200 * However, border color is broken and the GPU bounds-checks anyway.
201 * To avoid this and other potential problems, we may as well set it
202 * for everything.
203 */
204 sba.GeneralStateAccessUpperBound =
205 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
206 sba.GeneralStateAccessUpperBoundModifyEnable = true;
207 sba.DynamicStateAccessUpperBound =
208 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
209 sba.DynamicStateAccessUpperBoundModifyEnable = true;
210 sba.InstructionAccessUpperBound =
211 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
212 sba.InstructionAccessUpperBoundModifyEnable = true;
213 # endif
214 # if (GFX_VER >= 9)
215 sba.BindlessSurfaceStateBaseAddress =
216 (struct anv_address) { device->surface_state_pool.block_pool.bo, 0 };
217 sba.BindlessSurfaceStateSize = (1 << 20) - 1;
218 sba.BindlessSurfaceStateMOCS = mocs;
219 sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
220 # endif
221 # if (GFX_VER >= 10)
222 sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
223 sba.BindlessSamplerStateMOCS = mocs;
224 sba.BindlessSamplerStateBaseAddressModifyEnable = true;
225 sba.BindlessSamplerStateBufferSize = 0;
226 # endif
227 }
228
229 #if GFX_VERx10 == 120
230 /* Wa_1607854226:
231 *
232 * Put the pipeline back into its current mode.
233 */
234 if (gfx12_wa_pipeline != UINT32_MAX)
235 genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
236 #endif
237
238 #endif /* GFX_VERx10 < 125 */
239
240 /* After re-setting the surface state base address, we have to do some
241 * cache flushing so that the sampler engine will pick up the new
242 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
243 * Shared Function > 3D Sampler > State > State Caching (page 96):
244 *
245 * Coherency with system memory in the state cache, like the texture
246 * cache is handled partially by software. It is expected that the
247 * command stream or shader will issue Cache Flush operation or
248 * Cache_Flush sampler message to ensure that the L1 cache remains
249 * coherent with system memory.
250 *
251 * [...]
252 *
253 * Whenever the value of the Dynamic_State_Base_Addr,
254 * Surface_State_Base_Addr are altered, the L1 state cache must be
255 * invalidated to ensure the new surface or sampler state is fetched
256 * from system memory.
257 *
258 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
259 * which, according to the PIPE_CONTROL instruction documentation in the
260 * Broadwell PRM:
261 *
262 * Setting this bit is independent of any other bit in this packet.
263 * This bit controls the invalidation of the L1 and L2 state caches
264 * at the top of the pipe i.e. at the parsing time.
265 *
266 * Unfortunately, experimentation seems to indicate that state cache
267 * invalidation through a PIPE_CONTROL does nothing whatsoever in
268 * regard to surface state and binding tables. Instead, it seems that
269 * invalidating the texture cache is what is actually needed.
270 *
271 * XXX: As far as we have been able to determine through
272 * experimentation, flushing the texture cache appears to be
273 * sufficient. The theory here is that all of the sampling/rendering
274 * units cache the binding table in the texture cache. However, we have
275 * yet to be able to actually confirm this.
276 *
277 * Wa_14013910100:
278 *
279 * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
280 * or program pipe control with Instruction cache invalidate post
281 * STATE_BASE_ADDRESS command"
282 */
283 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
284 pc.TextureCacheInvalidationEnable = true;
285 pc.ConstantCacheInvalidationEnable = true;
286 pc.StateCacheInvalidationEnable = true;
287 #if GFX_VERx10 == 125
288 pc.InstructionCacheInvalidateEnable = true;
289 #endif
290 #if GFX_VER >= 9 && GFX_VER <= 11
291 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
292 *
293 * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
294 * always set for GPGPU workloads when “Texture Cache Invalidation
295 * Enable” bit is set".
296 *
297 * Workaround stopped appearing in TGL PRMs.
298 */
299 pc.CommandStreamerStallEnable =
300 cmd_buffer->state.current_pipeline == GPGPU;
301 #endif
302 anv_debug_dump_pc(pc);
303 }
304 }
305
306 static void
307 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
308 struct anv_state state, struct anv_address addr)
309 {
310 VkResult result;
311
312 if (anv_use_relocations(cmd_buffer->device->physical)) {
313 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
314 result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
315 &cmd_buffer->vk.pool->alloc,
316 state.offset + isl_dev->ss.addr_offset,
317 addr.bo, addr.offset, NULL);
318 } else {
319 result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
320 &cmd_buffer->vk.pool->alloc,
321 addr.bo);
322 }
323
324 if (unlikely(result != VK_SUCCESS))
325 anv_batch_set_error(&cmd_buffer->batch, result);
326 }
327
328 static void
329 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
330 struct anv_surface_state state)
331 {
332 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
333
334 assert(!anv_address_is_null(state.address));
335 add_surface_reloc(cmd_buffer, state.state, state.address);
336
337 if (!anv_address_is_null(state.aux_address)) {
338 VkResult result =
339 anv_reloc_list_add(&cmd_buffer->surface_relocs,
340 &cmd_buffer->vk.pool->alloc,
341 state.state.offset + isl_dev->ss.aux_addr_offset,
342 state.aux_address.bo,
343 state.aux_address.offset,
344 NULL);
345 if (result != VK_SUCCESS)
346 anv_batch_set_error(&cmd_buffer->batch, result);
347 }
348
349 if (!anv_address_is_null(state.clear_address)) {
350 VkResult result =
351 anv_reloc_list_add(&cmd_buffer->surface_relocs,
352 &cmd_buffer->vk.pool->alloc,
353 state.state.offset +
354 isl_dev->ss.clear_color_state_offset,
355 state.clear_address.bo,
356 state.clear_address.offset,
357 NULL);
358 if (result != VK_SUCCESS)
359 anv_batch_set_error(&cmd_buffer->batch, result);
360 }
361 }
362
363 static bool
364 isl_color_value_requires_conversion(union isl_color_value color,
365 const struct isl_surf *surf,
366 const struct isl_view *view)
367 {
368 if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
369 return false;
370
371 uint32_t surf_pack[4] = { 0, 0, 0, 0 };
372 isl_color_value_pack(&color, surf->format, surf_pack);
373
374 uint32_t view_pack[4] = { 0, 0, 0, 0 };
375 union isl_color_value swiz_color =
376 isl_color_value_swizzle_inv(color, view->swizzle);
377 isl_color_value_pack(&swiz_color, view->format, view_pack);
378
379 return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
380 }
381
382 static bool
383 anv_can_fast_clear_color_view(struct anv_device * device,
384 struct anv_image_view *iview,
385 VkImageLayout layout,
386 union isl_color_value clear_color,
387 uint32_t num_layers,
388 VkRect2D render_area)
389 {
390 if (iview->planes[0].isl.base_array_layer >=
391 anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
392 iview->planes[0].isl.base_level))
393 return false;
394
395 /* Start by getting the fast clear type. We use the first subpass
396 * layout here because we don't want to fast-clear if the first subpass
397 * to use the attachment can't handle fast-clears.
398 */
399 enum anv_fast_clear_type fast_clear_type =
400 anv_layout_to_fast_clear_type(&device->info, iview->image,
401 VK_IMAGE_ASPECT_COLOR_BIT,
402 layout);
403 switch (fast_clear_type) {
404 case ANV_FAST_CLEAR_NONE:
405 return false;
406 case ANV_FAST_CLEAR_DEFAULT_VALUE:
407 if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
408 return false;
409 break;
410 case ANV_FAST_CLEAR_ANY:
411 break;
412 }
413
414 /* Potentially, we could do partial fast-clears but doing so has crazy
415 * alignment restrictions. It's easier to just restrict to full size
416 * fast clears for now.
417 */
418 if (render_area.offset.x != 0 ||
419 render_area.offset.y != 0 ||
420 render_area.extent.width != iview->vk.extent.width ||
421 render_area.extent.height != iview->vk.extent.height)
422 return false;
423
424 /* On Broadwell and earlier, we can only handle 0/1 clear colors */
425 if (GFX_VER <= 8 &&
426 !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
427 return false;
428
429 /* If the clear color is one that would require non-trivial format
430 * conversion on resolve, we don't bother with the fast clear. This
431 * shouldn't be common as most clear colors are 0/1 and the most common
432 * format re-interpretation is for sRGB.
433 */
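   /* Illustrative example: a clear color of (0.5, 0.5, 0.5, 1.0) generally
    * packs to different bits in a UNORM surface format than in an sRGB view
    * format, so it would fail this check, whereas 0.0 and 1.0 pack
    * identically in both and remain eligible for fast clear.
    */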
434 if (isl_color_value_requires_conversion(clear_color,
435 &iview->image->planes[0].primary_surface.isl,
436 &iview->planes[0].isl)) {
437 anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
438 "Cannot fast-clear to colors which would require "
439 "format conversion on resolve");
440 return false;
441 }
442
443 /* We only allow fast clears to the first slice of an image (level 0,
444 * layer 0) and only for the entire slice. This guarantees that, at any
445 * given time, there is only one clear color on any given image. At the
446 * time of our testing (Jan 17, 2018), there were no known applications
447 * which would benefit from fast-clearing
448 * more than just the first slice.
449 */
450 if (iview->planes[0].isl.base_level > 0 ||
451 iview->planes[0].isl.base_array_layer > 0) {
452 anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
453 "Rendering with multi-lod or multi-layer framebuffer "
454 "with LOAD_OP_LOAD and baseMipLevel > 0 or "
455 "baseArrayLayer > 0. Not fast clearing.");
456 return false;
457 }
458
459 if (num_layers > 1) {
460 anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
461 "Rendering to a multi-layer framebuffer with "
462 "LOAD_OP_CLEAR. Only fast-clearing the first slice");
463 }
464
465 return true;
466 }
467
468 static bool
469 anv_can_hiz_clear_ds_view(struct anv_device *device,
470 const struct anv_image_view *iview,
471 VkImageLayout layout,
472 VkImageAspectFlags clear_aspects,
473 float depth_clear_value,
474 VkRect2D render_area)
475 {
476 /* We don't do any HiZ or depth fast-clears on gfx7 yet */
477 if (GFX_VER == 7)
478 return false;
479
480 /* If we're just clearing stencil, we can always HiZ clear */
481 if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
482 return true;
483
484 /* We must have depth in order to have HiZ */
485 if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
486 return false;
487
488 const enum isl_aux_usage clear_aux_usage =
489 anv_layout_to_aux_usage(&device->info, iview->image,
490 VK_IMAGE_ASPECT_DEPTH_BIT,
491 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
492 layout);
493 if (!blorp_can_hiz_clear_depth(&device->info,
494 &iview->image->planes[0].primary_surface.isl,
495 clear_aux_usage,
496 iview->planes[0].isl.base_level,
497 iview->planes[0].isl.base_array_layer,
498 render_area.offset.x,
499 render_area.offset.y,
500 render_area.offset.x +
501 render_area.extent.width,
502 render_area.offset.y +
503 render_area.extent.height))
504 return false;
505
506 if (depth_clear_value != ANV_HZ_FC_VAL)
507 return false;
508
509 /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
510 * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
511 * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
512 */
513 if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
514 return false;
515
516 /* If we got here, then we can fast clear */
517 return true;
518 }
519
520 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
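/* The macro above forces a volatile load so the compiler re-reads the
 * CPU-mapped aux-map entry each time instead of reusing a cached value.
 */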
521
522 #if GFX_VER == 12
523 static void
524 anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
525 const struct anv_image *image,
526 VkImageAspectFlagBits aspect,
527 uint32_t base_level, uint32_t level_count,
528 uint32_t base_layer, uint32_t layer_count)
529 {
530 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
531
532 const struct anv_surface *surface = &image->planes[plane].primary_surface;
533 uint64_t base_address =
534 anv_address_physical(anv_image_address(image, &surface->memory_range));
535
536 const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
537 uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);
538
539 /* We're about to live-update the AUX-TT. We really don't want anyone else
540 * trying to read it while we're doing this. We could probably get away
541 * with not having this stall in some cases if we were really careful but
542 * it's better to play it safe. Full stall the GPU.
543 */
544 anv_add_pending_pipe_bits(cmd_buffer,
545 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
546 "before update AUX-TT");
547 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
548
549 struct mi_builder b;
550 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
551
552 for (uint32_t a = 0; a < layer_count; a++) {
553 const uint32_t layer = base_layer + a;
554
555 uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
556 for (uint32_t l = 0; l < level_count; l++) {
557 const uint32_t level = base_level + l;
558
559 uint32_t logical_array_layer, logical_z_offset_px;
560 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
561 logical_array_layer = 0;
562
563 /* If the given miplevel does not have this layer, then any higher
564 * miplevels won't either because miplevels only get smaller the
565 * higher the LOD.
566 */
567 assert(layer < image->vk.extent.depth);
568 if (layer >= anv_minify(image->vk.extent.depth, level))
569 break;
570 logical_z_offset_px = layer;
571 } else {
572 assert(layer < image->vk.array_layers);
573 logical_array_layer = layer;
574 logical_z_offset_px = 0;
575 }
576
577 uint64_t slice_start_offset_B, slice_end_offset_B;
578 isl_surf_get_image_range_B_tile(isl_surf, level,
579 logical_array_layer,
580 logical_z_offset_px,
581 &slice_start_offset_B,
582 &slice_end_offset_B);
583
584 start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
585 end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
586 }
587
588 /* Aux operates 64K at a time */
589 start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
590 end_offset_B = align_u64(end_offset_B, 64 * 1024);
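      /* Illustrative numbers: a slice spanning bytes [0x18000, 0x24000) is
       * widened to [0x10000, 0x30000) so that only whole 64KiB granules,
       * each covered by a single aux-map entry, are rewritten below.
       */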
591
592 for (uint64_t offset = start_offset_B;
593 offset < end_offset_B; offset += 64 * 1024) {
594 uint64_t address = base_address + offset;
595
596 uint64_t aux_entry_addr64, *aux_entry_map;
597 aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
598 address, &aux_entry_addr64);
599
600 assert(!anv_use_relocations(cmd_buffer->device->physical));
601 struct anv_address aux_entry_address = {
602 .bo = NULL,
603 .offset = aux_entry_addr64,
604 };
605
606 const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
607 uint64_t new_aux_entry =
608 (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;
609
610 if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
611 new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;
612
613 mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
614 }
615 }
616
617 anv_add_pending_pipe_bits(cmd_buffer,
618 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
619 "after update AUX-TT");
620 }
621 #endif /* GFX_VER == 12 */
622
623 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
624 * the initial layout is undefined, the HiZ buffer and depth buffer will
625 * represent the same data at the end of this operation.
626 */
627 static void
628 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
629 const struct anv_image *image,
630 uint32_t base_layer, uint32_t layer_count,
631 VkImageLayout initial_layout,
632 VkImageLayout final_layout,
633 bool will_full_fast_clear)
634 {
635 const uint32_t depth_plane =
636 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
637 if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
638 return;
639
640 #if GFX_VER == 12
641 if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
642 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
643 cmd_buffer->device->physical->has_implicit_ccs &&
644 cmd_buffer->device->info.has_aux_map) {
645 anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
646 0, 1, base_layer, layer_count);
647 }
648 #endif
649
650 /* If will_full_fast_clear is set, the caller promises to fast-clear the
651 * largest portion of the specified range as it can. For depth images,
652 * that means the entire image because we don't support multi-LOD HiZ.
653 */
654 assert(image->planes[0].primary_surface.isl.levels == 1);
655 if (will_full_fast_clear)
656 return;
657
658 const enum isl_aux_state initial_state =
659 anv_layout_to_aux_state(&cmd_buffer->device->info, image,
660 VK_IMAGE_ASPECT_DEPTH_BIT,
661 initial_layout);
662 const enum isl_aux_state final_state =
663 anv_layout_to_aux_state(&cmd_buffer->device->info, image,
664 VK_IMAGE_ASPECT_DEPTH_BIT,
665 final_layout);
666
667 const bool initial_depth_valid =
668 isl_aux_state_has_valid_primary(initial_state);
669 const bool initial_hiz_valid =
670 isl_aux_state_has_valid_aux(initial_state);
671 const bool final_needs_depth =
672 isl_aux_state_has_valid_primary(final_state);
673 const bool final_needs_hiz =
674 isl_aux_state_has_valid_aux(final_state);
675
676 /* Getting into the pass-through state for Depth is tricky and involves
677 * both a resolve and an ambiguate. We don't handle that state right now
678 * as anv_layout_to_aux_state never returns it.
679 */
680 assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
681
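   /* A FULL_RESOLVE writes the HiZ-compressed depth back into the depth
    * surface when the target layout needs valid depth data; an AMBIGUATE
    * re-initializes HiZ so that it simply defers to whatever is already in
    * the depth surface.
    */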
682 if (final_needs_depth && !initial_depth_valid) {
683 assert(initial_hiz_valid);
684 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
685 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
686 } else if (final_needs_hiz && !initial_hiz_valid) {
687 assert(initial_depth_valid);
688 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
689 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
690 }
691 }
692
693 #if GFX_VER == 7
694 static inline bool
695 vk_image_layout_stencil_write_optimal(VkImageLayout layout)
696 {
697 return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
698 layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
699 layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL;
700 }
701 #endif
702
703 /* Transitions a stencil buffer from one layout to another. On gfx7 this may
704 * involve copying to the shadow surface; on gfx12 it may initialize the
705 * stencil compression (aux) state.
706 */
707 static void
708 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
709 const struct anv_image *image,
710 uint32_t base_level, uint32_t level_count,
711 uint32_t base_layer, uint32_t layer_count,
712 VkImageLayout initial_layout,
713 VkImageLayout final_layout,
714 bool will_full_fast_clear)
715 {
716 #if GFX_VER == 7
717 const uint32_t plane =
718 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
719
720 /* On gfx7, we have to store a texturable version of the stencil buffer in
721 * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
722 * forth at strategic points. Stencil writes are only allowed in following
723 * layouts:
724 *
725 * - VK_IMAGE_LAYOUT_GENERAL
726 * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
727 * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
728 * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
729 * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
730 *
731 * For general, we have no nice opportunity to transition so we do the copy
732 * to the shadow unconditionally at the end of the subpass. For transfer
733 * destinations, we can update it as part of the transfer op. For the other
734 * layouts, we delay the copy until a transition into some other layout.
735 */
736 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
737 vk_image_layout_stencil_write_optimal(initial_layout) &&
738 !vk_image_layout_stencil_write_optimal(final_layout)) {
739 anv_image_copy_to_shadow(cmd_buffer, image,
740 VK_IMAGE_ASPECT_STENCIL_BIT,
741 base_level, level_count,
742 base_layer, layer_count);
743 }
744 #elif GFX_VER == 12
745 const uint32_t plane =
746 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
747 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
748 return;
749
750 if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
751 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
752 cmd_buffer->device->physical->has_implicit_ccs &&
753 cmd_buffer->device->info.has_aux_map) {
754 anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
755 base_level, level_count, base_layer, layer_count);
756
757 /* If will_full_fast_clear is set, the caller promises to fast-clear the
758 * largest portion of the specified range as it can.
759 */
760 if (will_full_fast_clear)
761 return;
762
763 for (uint32_t l = 0; l < level_count; l++) {
764 const uint32_t level = base_level + l;
765 const VkRect2D clear_rect = {
766 .offset.x = 0,
767 .offset.y = 0,
768 .extent.width = anv_minify(image->vk.extent.width, level),
769 .extent.height = anv_minify(image->vk.extent.height, level),
770 };
771
772 uint32_t aux_layers =
773 anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
774 uint32_t level_layer_count =
775 MIN2(layer_count, aux_layers - base_layer);
776
777 /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
778 * Enable:
779 *
780 * "When enabled, Stencil Buffer needs to be initialized via
781 * stencil clear (HZ_OP) before any renderpass."
782 */
783 anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
784 level, base_layer, level_layer_count,
785 clear_rect, 0 /* Stencil clear value */);
786 }
787 }
788 #endif
789 }
790
791 #define MI_PREDICATE_SRC0 0x2400
792 #define MI_PREDICATE_SRC1 0x2408
793 #define MI_PREDICATE_RESULT 0x2418
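/* The defines above are the MMIO offsets of the command streamer's
 * predicate source/result registers, written through mi_store(mi_reg64(...))
 * and consumed by the MI_PREDICATE packets emitted below.
 */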
794
795 static void
796 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
797 const struct anv_image *image,
798 VkImageAspectFlagBits aspect,
799 uint32_t level,
800 uint32_t base_layer, uint32_t layer_count,
801 bool compressed)
802 {
803 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
804
805 /* We only have compression tracking for CCS_E */
806 if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
807 return;
808
809 for (uint32_t a = 0; a < layer_count; a++) {
810 uint32_t layer = base_layer + a;
811 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
812 sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
813 image, aspect,
814 level, layer);
815 sdi.ImmediateData = compressed ? UINT32_MAX : 0;
816 }
817 }
818 }
819
820 static void
821 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
822 const struct anv_image *image,
823 VkImageAspectFlagBits aspect,
824 enum anv_fast_clear_type fast_clear)
825 {
826 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
827 sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
828 image, aspect);
829 sdi.ImmediateData = fast_clear;
830 }
831
832 /* Whenever we have fast-clear, we consider that slice to be compressed.
833 * This makes building predicates much easier.
834 */
835 if (fast_clear != ANV_FAST_CLEAR_NONE)
836 set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
837 }
838
839 /* This is only really practical on haswell and above because it requires
840 * MI math in order to get it correct.
841 */
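/* For CCS resolves, gfx8 and earlier fall back to the simpler
 * anv_cmd_simple_resolve_predicate() defined further below; this MI-math
 * variant is still required for MCS resolves on Haswell and Broadwell.
 */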
842 #if GFX_VERx10 >= 75
843 static void
844 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
845 const struct anv_image *image,
846 VkImageAspectFlagBits aspect,
847 uint32_t level, uint32_t array_layer,
848 enum isl_aux_op resolve_op,
849 enum anv_fast_clear_type fast_clear_supported)
850 {
851 struct mi_builder b;
852 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
853
854 const struct mi_value fast_clear_type =
855 mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
856 image, aspect));
857
858 if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
859 /* In this case, we're doing a full resolve which means we want the
860 * resolve to happen if any compression (including fast-clears) is
861 * present.
862 *
863 * In order to simplify the logic a bit, we make the assumption that,
864 * if the first slice has been fast-cleared, it is also marked as
865 * compressed. See also set_image_fast_clear_state.
866 */
867 const struct mi_value compression_state =
868 mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
869 image, aspect,
870 level, array_layer));
871 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
872 mi_store(&b, compression_state, mi_imm(0));
873
874 if (level == 0 && array_layer == 0) {
875 /* If the predicate is true, we want to write 0 to the fast clear type
876 * and, if it's false, leave it alone. We can do this by writing
877 *
878 * clear_type = clear_type & ~predicate;
879 */
880 struct mi_value new_fast_clear_type =
881 mi_iand(&b, fast_clear_type,
882 mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
883 mi_store(&b, fast_clear_type, new_fast_clear_type);
884 }
885 } else if (level == 0 && array_layer == 0) {
886 /* In this case, we are doing a partial resolve to get rid of fast-clear
887 * colors. We don't care about the compression state but we do care
888 * about how much fast clear is allowed by the final layout.
889 */
890 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
891 assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
892
893 /* We need to compute (fast_clear_supported < image->fast_clear) */
894 struct mi_value pred =
895 mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
896 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
897
898 /* If the predicate is true, we want to write 0 to the fast clear type
899 * and, if it's false, leave it alone. We can do this by writing
900 *
901 * clear_type = clear_type & ~predicate;
902 */
903 struct mi_value new_fast_clear_type =
904 mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
905 mi_store(&b, fast_clear_type, new_fast_clear_type);
906 } else {
907 /* In this case, we're trying to do a partial resolve on a slice that
908 * doesn't have clear color. There's nothing to do.
909 */
910 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
911 return;
912 }
913
914 /* Set src1 to 0 and use a != condition */
915 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
916
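   /* With SRC1 = 0, COMPARE_SRCS_EQUAL evaluates (SRC0 == 0) and
    * LOAD_LOADINV inverts it, so the predicate ends up set exactly when
    * SRC0 != 0, i.e. when the resolve actually needs to run.
    */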
917 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
918 mip.LoadOperation = LOAD_LOADINV;
919 mip.CombineOperation = COMBINE_SET;
920 mip.CompareOperation = COMPARE_SRCS_EQUAL;
921 }
922 }
923 #endif /* GFX_VERx10 >= 75 */
924
925 #if GFX_VER <= 8
926 static void
927 anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
928 const struct anv_image *image,
929 VkImageAspectFlagBits aspect,
930 uint32_t level, uint32_t array_layer,
931 enum isl_aux_op resolve_op,
932 enum anv_fast_clear_type fast_clear_supported)
933 {
934 struct mi_builder b;
935 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
936
937 struct mi_value fast_clear_type_mem =
938 mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
939 image, aspect));
940
941 /* This only works for partial resolves and only when the clear color is
942 * all or nothing. On the upside, this emits less command streamer code
943 * and works on Ivybridge and Bay Trail.
944 */
945 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
946 assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
947
948 /* We don't support fast clears on anything other than the first slice. */
949 if (level > 0 || array_layer > 0)
950 return;
951
952 /* On gfx8, we don't have a concept of default clear colors because we
953 * can't sample from CCS surfaces. It's enough to just load the fast clear
954 * state into the predicate register.
955 */
956 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
957 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
958 mi_store(&b, fast_clear_type_mem, mi_imm(0));
959
960 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
961 mip.LoadOperation = LOAD_LOADINV;
962 mip.CombineOperation = COMBINE_SET;
963 mip.CompareOperation = COMPARE_SRCS_EQUAL;
964 }
965 }
966 #endif /* GFX_VER <= 8 */
967
968 static void
969 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
970 const struct anv_image *image,
971 enum isl_format format,
972 struct isl_swizzle swizzle,
973 VkImageAspectFlagBits aspect,
974 uint32_t level, uint32_t array_layer,
975 enum isl_aux_op resolve_op,
976 enum anv_fast_clear_type fast_clear_supported)
977 {
978 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
979
980 #if GFX_VER >= 9
981 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
982 aspect, level, array_layer,
983 resolve_op, fast_clear_supported);
984 #else /* GFX_VER <= 8 */
985 anv_cmd_simple_resolve_predicate(cmd_buffer, image,
986 aspect, level, array_layer,
987 resolve_op, fast_clear_supported);
988 #endif
989
990 /* CCS_D only supports full resolves and BLORP will assert on us if we try
991 * to do a partial resolve on a CCS_D surface.
992 */
993 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
994 image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
995 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
996
997 anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
998 level, array_layer, 1, resolve_op, NULL, true);
999 }
1000
1001 static void
1002 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
1003 const struct anv_image *image,
1004 enum isl_format format,
1005 struct isl_swizzle swizzle,
1006 VkImageAspectFlagBits aspect,
1007 uint32_t array_layer,
1008 enum isl_aux_op resolve_op,
1009 enum anv_fast_clear_type fast_clear_supported)
1010 {
1011 assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
1012 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
1013
1014 #if GFX_VERx10 >= 75
1015 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
1016 aspect, 0, array_layer,
1017 resolve_op, fast_clear_supported);
1018
1019 anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
1020 array_layer, 1, resolve_op, NULL, true);
1021 #else
1022 unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
1023 #endif
1024 }
1025
1026 void
1027 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
1028 const struct anv_image *image,
1029 VkImageAspectFlagBits aspect,
1030 enum isl_aux_usage aux_usage,
1031 uint32_t level,
1032 uint32_t base_layer,
1033 uint32_t layer_count)
1034 {
1035 /* The aspect must be exactly one of the image aspects. */
1036 assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
1037
1038 /* The only compression types with more than just fast-clears are MCS,
1039 * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually
1040 * track the current fast-clear and compression state. This leaves us
1041 * with just MCS and CCS_E.
1042 */
1043 if (aux_usage != ISL_AUX_USAGE_CCS_E &&
1044 aux_usage != ISL_AUX_USAGE_MCS)
1045 return;
1046
1047 set_image_compressed_bit(cmd_buffer, image, aspect,
1048 level, base_layer, layer_count, true);
1049 }
1050
1051 static void
1052 init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
1053 const struct anv_image *image,
1054 VkImageAspectFlagBits aspect)
1055 {
1056 assert(cmd_buffer && image);
1057 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1058
1059 set_image_fast_clear_state(cmd_buffer, image, aspect,
1060 ANV_FAST_CLEAR_NONE);
1061
1062 /* Initialize the struct fields that are accessed for fast-clears so that
1063 * the HW restrictions on the field values are satisfied.
1064 */
1065 struct anv_address addr =
1066 anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
1067
1068 if (GFX_VER >= 9) {
1069 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1070 const unsigned num_dwords = GFX_VER >= 10 ?
1071 isl_dev->ss.clear_color_state_size / 4 :
1072 isl_dev->ss.clear_value_size / 4;
1073 for (unsigned i = 0; i < num_dwords; i++) {
1074 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
1075 sdi.Address = addr;
1076 sdi.Address.offset += i * 4;
1077 sdi.ImmediateData = 0;
1078 }
1079 }
1080 } else {
1081 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
1082 sdi.Address = addr;
1083 if (GFX_VERx10 >= 75) {
1084 /* Pre-SKL, the dword containing the clear values also contains
1085 * other fields, so we need to initialize those fields to match the
1086 * values that would be in a color attachment.
1087 */
1088 sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 |
1089 ISL_CHANNEL_SELECT_GREEN << 22 |
1090 ISL_CHANNEL_SELECT_BLUE << 19 |
1091 ISL_CHANNEL_SELECT_ALPHA << 16;
1092 } else if (GFX_VER == 7) {
1093 /* On IVB, the dword containing the clear values also contains
1094 * other fields that must be zero or can be zero.
1095 */
1096 sdi.ImmediateData = 0;
1097 }
1098 }
1099 }
1100 }
1101
1102 /* Copy the fast-clear value dword(s) between a surface state object and an
1103 * image's fast clear state buffer.
1104 */
1105 static void
1106 genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
1107 struct anv_state surface_state,
1108 const struct anv_image *image,
1109 VkImageAspectFlagBits aspect,
1110 bool copy_from_surface_state)
1111 {
1112 assert(cmd_buffer && image);
1113 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1114
1115 struct anv_address ss_clear_addr = {
1116 .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
1117 .offset = surface_state.offset +
1118 cmd_buffer->device->isl_dev.ss.clear_value_offset,
1119 };
1120 const struct anv_address entry_addr =
1121 anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
1122 unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
1123
1124 #if GFX_VER == 7
1125 /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
1126 * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
1127 * in-flight when they are issued even if the memory touched is not
1128 * currently active for rendering. The weird bit is that it is not the
1129 * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
1130 * rendering hangs such that the next stalling command after the
1131 * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
1132 *
1133 * It is unclear exactly why this hang occurs. Both MI commands come with
1134 * warnings about the 3D pipeline but that doesn't seem to fully explain
1135 * it. My (Jason's) best theory is that it has something to do with the
1136 * fact that we're using a GPU state register as our temporary and that
1137 * something with reading/writing it is causing problems.
1138 *
1139 * In order to work around this issue, we emit a PIPE_CONTROL with the
1140 * command streamer stall bit set.
1141 */
1142 anv_add_pending_pipe_bits(cmd_buffer,
1143 ANV_PIPE_CS_STALL_BIT,
1144 "after copy_fast_clear_dwords. Avoid potential hang");
1145 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1146 #endif
1147
1148 struct mi_builder b;
1149 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1150
1151 if (copy_from_surface_state) {
1152 mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
1153 } else {
1154 mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
1155
1156 /* Updating a surface state object may require that the state cache be
1157 * invalidated. From the SKL PRM, Shared Functions -> State -> State
1158 * Caching:
1159 *
1160 * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
1161 * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
1162 * modified [...], the L1 state cache must be invalidated to ensure
1163 * the new surface or sampler state is fetched from system memory.
1164 *
1165 * In testing, SKL doesn't actually seem to need this, but HSW does.
1166 */
1167 anv_add_pending_pipe_bits(cmd_buffer,
1168 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
1169 "after copy_fast_clear_dwords surface state update");
1170 }
1171 }
1172
1173 /**
1174 * @brief Transitions a color buffer from one layout to another.
1175 *
1176 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
1177 * more information.
1178 *
1179 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
1180 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
1181 * this represents the maximum layers to transition at each
1182 * specified miplevel.
1183 */
1184 static void
1185 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
1186 const struct anv_image *image,
1187 VkImageAspectFlagBits aspect,
1188 const uint32_t base_level, uint32_t level_count,
1189 uint32_t base_layer, uint32_t layer_count,
1190 VkImageLayout initial_layout,
1191 VkImageLayout final_layout,
1192 uint64_t src_queue_family,
1193 uint64_t dst_queue_family,
1194 bool will_full_fast_clear)
1195 {
1196 struct anv_device *device = cmd_buffer->device;
1197 const struct intel_device_info *devinfo = &device->info;
1198 /* Validate the inputs. */
1199 assert(cmd_buffer);
1200 assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1201 /* These values aren't supported for simplicity's sake. */
1202 assert(level_count != VK_REMAINING_MIP_LEVELS &&
1203 layer_count != VK_REMAINING_ARRAY_LAYERS);
1204 /* Ensure the subresource range is valid. */
1205 UNUSED uint64_t last_level_num = base_level + level_count;
1206 const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level);
1207 UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
1208 assert((uint64_t)base_layer + layer_count <= image_layers);
1209 assert(last_level_num <= image->vk.mip_levels);
1210 /* If there is a layout transition, the final layout cannot be undefined or
1211 * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
1212 */
1213 assert(initial_layout == final_layout ||
1214 (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
1215 final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
1216 const struct isl_drm_modifier_info *isl_mod_info =
1217 image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
1218 ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
1219 : NULL;
1220
1221 const bool src_queue_external =
1222 src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1223 src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1224
1225 const bool dst_queue_external =
1226 dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1227 dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1228
1229 /* Simultaneous acquire and release on external queues is illegal. */
1230 assert(!src_queue_external || !dst_queue_external);
1231
1232 /* Ownership transition on an external queue requires special action if the
1233 * image has a DRM format modifier because we store image data in
1234 * a driver-private bo which is inaccessible to the external queue.
1235 */
1236 const bool private_binding_acquire =
1237 src_queue_external &&
1238 anv_image_is_externally_shared(image) &&
1239 anv_image_has_private_binding(image);
1240
1241 const bool private_binding_release =
1242 dst_queue_external &&
1243 anv_image_is_externally_shared(image) &&
1244 anv_image_has_private_binding(image);
1245
1246 if (initial_layout == final_layout &&
1247 !private_binding_acquire && !private_binding_release) {
1248 /* No work is needed. */
1249 return;
1250 }
1251
1252 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
1253
1254 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
1255 final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
1256 /* This surface is a linear compressed image with a tiled shadow surface
1257 * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so
1258 * we need to ensure the shadow copy is up-to-date.
1259 */
1260 assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
1261 assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
1262 assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
1263 assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
1264 assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
1265 assert(plane == 0);
1266 anv_image_copy_to_shadow(cmd_buffer, image,
1267 VK_IMAGE_ASPECT_COLOR_BIT,
1268 base_level, level_count,
1269 base_layer, layer_count);
1270 }
1271
1272 if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
1273 return;
1274
1275 assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
1276
1277 /* The following layouts are equivalent for non-linear images. */
1278 const bool initial_layout_undefined =
1279 initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
1280 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
1281
1282 bool must_init_fast_clear_state = false;
1283 bool must_init_aux_surface = false;
1284
1285 if (initial_layout_undefined) {
1286 /* The subresource may have been aliased and populated with arbitrary
1287 * data.
1288 */
1289 must_init_fast_clear_state = true;
1290 must_init_aux_surface = true;
1291 } else if (private_binding_acquire) {
1292 /* The fast clear state lives in a driver-private bo, and therefore the
1293 * external/foreign queue is unaware of it.
1294 *
1295 * If this is the first time we are accessing the image, then the fast
1296 * clear state is uninitialized.
1297 *
1298 * If this is NOT the first time we are accessing the image, then the fast
1299 * clear state may still be valid and correct due to the resolve during
1300 * our most recent ownership release. However, we do not track the aux
1301 * state with MI stores, and therefore must assume the worst-case: that
1302 * this is the first time we are accessing the image.
1303 */
1304 assert(image->planes[plane].fast_clear_memory_range.binding ==
1305 ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1306 must_init_fast_clear_state = true;
1307
1308 if (image->planes[plane].aux_surface.memory_range.binding ==
1309 ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1310 assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);
1311
1312 /* The aux surface, like the fast clear state, lives in
1313 * a driver-private bo. We must initialize the aux surface for the
1314 * same reasons we must initialize the fast clear state.
1315 */
1316 must_init_aux_surface = true;
1317 } else {
1318 assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);
1319
1320 /* The aux surface, unlike the fast clear state, lives in
1321 * application-visible VkDeviceMemory and is shared with the
1322 * external/foreign queue. Therefore, when we acquire ownership of the
1323 * image with a defined VkImageLayout, the aux surface is valid and has
1324 * the aux state required by the modifier.
1325 */
1326 must_init_aux_surface = false;
1327 }
1328 }
1329
1330 #if GFX_VER == 12
1331 if (initial_layout_undefined) {
1332 if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
1333 anv_image_init_aux_tt(cmd_buffer, image, aspect,
1334 base_level, level_count,
1335 base_layer, layer_count);
1336 }
1337 }
1338 #else
1339 assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
1340 #endif
1341
1342 if (must_init_fast_clear_state) {
1343 if (base_level == 0 && base_layer == 0)
1344 init_fast_clear_color(cmd_buffer, image, aspect);
1345 }
1346
1347 if (must_init_aux_surface) {
1348 assert(must_init_fast_clear_state);
1349
1350 /* Initialize the aux buffers to enable correct rendering. In order to
1351 * ensure that things such as storage images work correctly, aux buffers
1352 * need to be initialized to valid data.
1353 *
1354 * Having an aux buffer with invalid data is a problem for two reasons:
1355 *
1356 * 1) Having an invalid value in the buffer can confuse the hardware.
1357 * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1358 * invalid and leads to the hardware doing strange things. It
1359 * doesn't hang as far as we can tell but rendering corruption can
1360 * occur.
1361 *
1362 * 2) If this transition is into the GENERAL layout and we then use the
1363 * image as a storage image, then we must have the aux buffer in the
1364 * pass-through state so that, if we then go to texture from the
1365 * image, we get the results of our storage image writes and not the
1366 * fast clear color or other random data.
1367 *
1368 * For CCS both of the problems above are real demonstrable issues. In
1369 * that case, the only thing we can do is to perform an ambiguate to
1370 * transition the aux surface into the pass-through state.
1371 *
1372 * For MCS, (2) is never an issue because we don't support multisampled
1373 * storage images. In theory, issue (1) is a problem with MCS but we've
1374 * never seen it in the wild. For 4x and 16x, all bit patterns could, in
1375 * theory, be interpreted as something but we don't know that all bit
1376 * patterns are actually valid. For 2x and 8x, you could easily end up
1377 * with the MCS referring to an invalid plane because not all bits of
1378 * the MCS value are actually used. Even though we've never seen issues
1379 * in the wild, it's best to play it safe and initialize the MCS. We
1380 * can use a fast-clear for MCS because we only ever touch from render
1381 * and texture (no image load store).
1382 */
1383 if (image->vk.samples == 1) {
1384 for (uint32_t l = 0; l < level_count; l++) {
1385 const uint32_t level = base_level + l;
1386
1387 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1388 if (base_layer >= aux_layers)
1389 break; /* We will only get fewer layers as level increases */
1390 uint32_t level_layer_count =
1391 MIN2(layer_count, aux_layers - base_layer);
1392
1393 /* If will_full_fast_clear is set, the caller promises to
1394 * fast-clear the largest portion of the specified range that it can.
1395 * For color images, that means only the first LOD and array slice.
1396 */
1397 if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1398 base_layer++;
1399 level_layer_count--;
1400 if (level_layer_count == 0)
1401 continue;
1402 }
1403
1404 anv_image_ccs_op(cmd_buffer, image,
1405 image->planes[plane].primary_surface.isl.format,
1406 ISL_SWIZZLE_IDENTITY,
1407 aspect, level, base_layer, level_layer_count,
1408 ISL_AUX_OP_AMBIGUATE, NULL, false);
1409
1410 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
1411 set_image_compressed_bit(cmd_buffer, image, aspect,
1412 level, base_layer, level_layer_count,
1413 false);
1414 }
1415 }
1416 } else {
1417 if (image->vk.samples == 4 || image->vk.samples == 16) {
1418 anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
1419 "Doing a potentially unnecessary fast-clear to "
1420 "define an MCS buffer.");
1421 }
1422
1423 /* If will_full_fast_clear is set, the caller promises to fast-clear
1424 * the largest portion of the specified range that it can.
1425 */
1426 if (will_full_fast_clear)
1427 return;
1428
1429 assert(base_level == 0 && level_count == 1);
1430 anv_image_mcs_op(cmd_buffer, image,
1431 image->planes[plane].primary_surface.isl.format,
1432 ISL_SWIZZLE_IDENTITY,
1433 aspect, base_layer, layer_count,
1434 ISL_AUX_OP_FAST_CLEAR, NULL, false);
1435 }
1436 return;
1437 }
1438
1439 enum isl_aux_usage initial_aux_usage =
1440 anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
1441 enum isl_aux_usage final_aux_usage =
1442 anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
1443 enum anv_fast_clear_type initial_fast_clear =
1444 anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
1445 enum anv_fast_clear_type final_fast_clear =
1446 anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
1447
1448 /* We must override the anv_layout_to_* functions because they are unaware of
1449 * acquire/release direction.
1450 */
1451 if (private_binding_acquire) {
1452 initial_aux_usage = isl_mod_info->aux_usage;
1453 initial_fast_clear = isl_mod_info->supports_clear_color ?
1454 initial_fast_clear : ANV_FAST_CLEAR_NONE;
1455 } else if (private_binding_release) {
1456 final_aux_usage = isl_mod_info->aux_usage;
1457 final_fast_clear = isl_mod_info->supports_clear_color ?
1458 final_fast_clear : ANV_FAST_CLEAR_NONE;
1459 }
1460
1461 /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1462 * We can handle transitions between CCS_D/E to and from NONE. What we
1463 * don't yet handle is switching between CCS_E and CCS_D within a given
1464 * image. Doing so in a performant way requires more detailed aux state
1465 * tracking such as what is done in i965. For now, just assume that we
1466 * only have one type of compression.
1467 */
1468 assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1469 final_aux_usage == ISL_AUX_USAGE_NONE ||
1470 initial_aux_usage == final_aux_usage);
1471
1472 /* If initial aux usage is NONE, there is nothing to resolve */
1473 if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1474 return;
1475
1476 enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1477
1478 /* If the initial layout supports more fast clear than the final layout
1479 * then we need at least a partial resolve.
1480 */
1481 if (final_fast_clear < initial_fast_clear)
1482 resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1483
1484 if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
1485 final_aux_usage != ISL_AUX_USAGE_CCS_E)
1486 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
1487
1488 if (resolve_op == ISL_AUX_OP_NONE)
1489 return;
1490
1491 /* Perform a resolve to synchronize data between the main and aux buffer.
1492 * Before we begin, we must satisfy the cache flushing requirement specified
1493 * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1494 *
1495 * Any transition from any value in {Clear, Render, Resolve} to a
1496 * different value in {Clear, Render, Resolve} requires end of pipe
1497 * synchronization.
1498 *
1499 * We perform a flush of the write cache before and after the clear and
1500 * resolve operations to meet this requirement.
1501 *
1502 * Unlike other drawing, fast clear operations are not properly
1503 * synchronized. The first PIPE_CONTROL here likely ensures that the
1504 * contents of the previous render or clear hit the render target before we
1505 * resolve and the second likely ensures that the resolve is complete before
1506 * we do any more rendering or clearing.
1507 */
1508 anv_add_pending_pipe_bits(cmd_buffer,
1509 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1510 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1511 "after transition RT");
1512
1513 for (uint32_t l = 0; l < level_count; l++) {
1514 uint32_t level = base_level + l;
1515
1516 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1517 if (base_layer >= aux_layers)
1518 break; /* We will only get fewer layers as level increases */
1519 uint32_t level_layer_count =
1520 MIN2(layer_count, aux_layers - base_layer);
1521
1522 for (uint32_t a = 0; a < level_layer_count; a++) {
1523 uint32_t array_layer = base_layer + a;
1524
1525 /* If will_full_fast_clear is set, the caller promises to fast-clear
1526 * the largest portion of the specified range that it can. For color
1527 * images, that means only the first LOD and array slice.
1528 */
1529 if (level == 0 && array_layer == 0 && will_full_fast_clear)
1530 continue;
1531
1532 if (image->vk.samples == 1) {
1533 anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1534 image->planes[plane].primary_surface.isl.format,
1535 ISL_SWIZZLE_IDENTITY,
1536 aspect, level, array_layer, resolve_op,
1537 final_fast_clear);
1538 } else {
1539 /* We only support fast-clear on the first layer so partial
1540 * resolves should not be used on other layers as they will use
1541 * the clear color stored in memory that is only valid for layer0.
1542 */
1543 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1544 array_layer != 0)
1545 continue;
1546
1547 anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1548 image->planes[plane].primary_surface.isl.format,
1549 ISL_SWIZZLE_IDENTITY,
1550 aspect, array_layer, resolve_op,
1551 final_fast_clear);
1552 }
1553 }
1554 }
1555
1556 anv_add_pending_pipe_bits(cmd_buffer,
1557 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1558 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1559 "after transition RT");
1560 }
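
/* Illustrative summary of the resolve selection above (a sketch of the
 * logic already implemented here, not additional behavior): when the image
 * is currently CCS_E compressed but the final layout cannot use CCS_E, a
 * full resolve is emitted for every level/layer in the range; when only the
 * supported fast-clear type shrinks between the two layouts, a partial
 * resolve is enough. Either way, the resolves are bracketed by the two
 * render-target-flush + end-of-pipe syncs added above, per the SKL PRM
 * requirement quoted earlier.
 */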
1561
1562 static MUST_CHECK VkResult
1563 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1564 uint32_t color_att_count)
1565 {
1566 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1567
1568 /* Reserve one for the NULL state. */
1569 unsigned num_states = 1 + color_att_count;
1570 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1571 const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
1572 gfx->att_states =
1573 anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
1574 num_states * ss_stride, isl_dev->ss.align);
1575 if (gfx->att_states.map == NULL) {
1576 return anv_batch_set_error(&cmd_buffer->batch,
1577 VK_ERROR_OUT_OF_DEVICE_MEMORY);
1578 }
1579
1580 struct anv_state next_state = gfx->att_states;
1581 next_state.alloc_size = isl_dev->ss.size;
1582
1583 gfx->null_surface_state = next_state;
1584 next_state.offset += ss_stride;
1585 next_state.map += ss_stride;
1586
1587 gfx->color_att_count = color_att_count;
1588 for (uint32_t i = 0; i < color_att_count; i++) {
1589 gfx->color_att[i] = (struct anv_attachment) {
1590 .surface_state.state = next_state,
1591 };
1592 next_state.offset += ss_stride;
1593 next_state.map += ss_stride;
1594 }
1595 gfx->depth_att = (struct anv_attachment) { };
1596 gfx->stencil_att = (struct anv_attachment) { };
1597
1598 return VK_SUCCESS;
1599 }
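
/* Resulting layout of the surface-state stream allocation made above
 * (illustrative; the real stride is align(isl_dev->ss.size,
 * isl_dev->ss.align), 64 bytes is used here only as an example value):
 *
 *    att_states + 0 * 64 : null surface state
 *    att_states + 1 * 64 : color attachment 0
 *    att_states + 2 * 64 : color attachment 1
 *    ...
 *    att_states + N * 64 : color attachment N - 1
 */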
1600
1601 static void
1602 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1603 {
1604 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1605
1606 gfx->render_area = (VkRect2D) { };
1607 gfx->layer_count = 0;
1608 gfx->samples = 0;
1609
1610 gfx->color_att_count = 0;
1611 gfx->depth_att = (struct anv_attachment) { };
1612 gfx->stencil_att = (struct anv_attachment) { };
1613 gfx->null_surface_state = ANV_STATE_NULL;
1614 }
1615
1616 VkResult
1617 genX(BeginCommandBuffer)(
1618 VkCommandBuffer commandBuffer,
1619 const VkCommandBufferBeginInfo* pBeginInfo)
1620 {
1621 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1622 VkResult result;
1623
1624 /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1625 * command buffer's state. Otherwise, we must *reset* its state. In both
1626 * cases we reset it.
1627 *
1628 * From the Vulkan 1.0 spec:
1629 *
1630 * If a command buffer is in the executable state and the command buffer
1631 * was allocated from a command pool with the
1632 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1633 * vkBeginCommandBuffer implicitly resets the command buffer, behaving
1634 * as if vkResetCommandBuffer had been called with
1635 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1636 * the command buffer in the recording state.
1637 */
1638 anv_cmd_buffer_reset(cmd_buffer);
1639 anv_cmd_buffer_reset_rendering(cmd_buffer);
1640
1641 cmd_buffer->usage_flags = pBeginInfo->flags;
1642
1643 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1644 * primary level command buffers.
1645 *
1646 * From the Vulkan 1.0 spec:
1647 *
1648 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1649 * secondary command buffer is considered to be entirely inside a render
1650 * pass. If this is a primary command buffer, then this bit is ignored.
1651 */
1652 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1653 cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1654
1655 trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
1656
1657 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1658
1659 /* We sometimes store vertex data in the dynamic state buffer for blorp
1660 * operations and our dynamic state stream may re-use data from previous
1661 * command buffers. In order to prevent stale cache data, we flush the VF
1662 * cache. We could do this on every blorp call but that's not really
1663 * needed as all of the data will get written by the CPU prior to the GPU
1664 * executing anything. The chances are fairly high that they will use
1665 * blorp at least once per primary command buffer so it shouldn't be
1666 * wasted.
1667 *
1668 * There is also a workaround on gfx8 which requires us to invalidate the
1669 * VF cache occasionally. It's easier if we can assume we start with a
1670 * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1671 */
1672 anv_add_pending_pipe_bits(cmd_buffer,
1673 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1674 "new cmd buffer");
1675
1676 /* Re-emit the aux table register in every command buffer. This way we're
1677 * ensured that we have the table even if this command buffer doesn't
1678 * initialize any images.
1679 */
1680 if (cmd_buffer->device->info.has_aux_map) {
1681 anv_add_pending_pipe_bits(cmd_buffer,
1682 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
1683 "new cmd buffer with aux-tt");
1684 }
1685
1686 /* We send an "Indirect State Pointers Disable" packet at
1687 * EndCommandBuffer, so all push constant packets are ignored during a
1688 * context restore. Documentation says after that command, we need to
1689 * emit push constants again before any rendering operation. So we
1690 * flag them dirty here to make sure they get emitted.
1691 */
1692 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1693
1694 if (cmd_buffer->usage_flags &
1695 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1696 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1697
1698 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
1699 const VkRenderingInfo *resume_info =
1700 vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
1701 pBeginInfo,
1702 gcbiar_data);
1703 if (resume_info != NULL) {
1704 genX(CmdBeginRendering)(commandBuffer, resume_info);
1705 } else {
1706 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
1707 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
1708 pBeginInfo);
1709 assert(inheritance_info);
1710
1711 gfx->rendering_flags = inheritance_info->flags;
1712 gfx->render_area = (VkRect2D) { };
1713 gfx->layer_count = 0;
1714 gfx->samples = inheritance_info->rasterizationSamples;
1715 gfx->view_mask = inheritance_info->viewMask;
1716
1717 uint32_t color_att_count = inheritance_info->colorAttachmentCount;
1718 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
1719 if (result != VK_SUCCESS)
1720 return result;
1721
1722 for (uint32_t i = 0; i < color_att_count; i++) {
1723 gfx->color_att[i].vk_format =
1724 inheritance_info->pColorAttachmentFormats[i];
1725 }
1726 gfx->depth_att.vk_format =
1727 inheritance_info->depthAttachmentFormat;
1728 gfx->stencil_att.vk_format =
1729 inheritance_info->stencilAttachmentFormat;
1730
1731 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1732 }
1733 }
1734
1735 #if GFX_VER >= 8
1736 /* Emit the sample pattern at the beginning of the batch because the
1737 * default locations emitted at the device initialization might have been
1738 * changed by a previous command buffer.
1739 *
1740 * Do not change that when we're continuing a previous renderpass.
1741 */
1742 if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
1743 !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1744 genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
1745 #endif
1746
1747 #if GFX_VERx10 >= 75
1748 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1749 const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1750 vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1751
1752 /* If secondary buffer supports conditional rendering
1753 * we should emit commands as if conditional rendering is enabled.
1754 */
1755 cmd_buffer->state.conditional_render_enabled =
1756 conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1757 }
1758 #endif
1759
1760 return VK_SUCCESS;
1761 }
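
/* For reference, a minimal client-side sequence that reaches the
 * RENDER_PASS_CONTINUE_BIT path above with dynamic rendering. This is an
 * illustrative sketch of standard Vulkan usage, not driver code;
 * "color_format" and "secondary" are assumed to be provided by the
 * application:
 *
 *    VkCommandBufferInheritanceRenderingInfo inherit_rendering = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDERING_INFO,
 *       .colorAttachmentCount = 1,
 *       .pColorAttachmentFormats = &color_format,
 *       .depthAttachmentFormat = VK_FORMAT_UNDEFINED,
 *       .stencilAttachmentFormat = VK_FORMAT_UNDEFINED,
 *       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
 *    };
 *    VkCommandBufferInheritanceInfo inherit = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
 *       .pNext = &inherit_rendering,
 *    };
 *    VkCommandBufferBeginInfo begin = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
 *       .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
 *       .pInheritanceInfo = &inherit,
 *    };
 *    vkBeginCommandBuffer(secondary, &begin);
 *
 * With these flags on a secondary command buffer, the code above either
 * resumes rendering or records the inherited formats, sample count and view
 * mask into cmd_buffer->state.gfx, as shown in the branch above.
 */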
1762
1763 /* From the PRM, Volume 2a:
1764 *
1765 * "Indirect State Pointers Disable
1766 *
1767 * At the completion of the post-sync operation associated with this pipe
1768 * control packet, the indirect state pointers in the hardware are
1769 * considered invalid; the indirect pointers are not saved in the context.
1770 * If any new indirect state commands are executed in the command stream
1771 * while the pipe control is pending, the new indirect state commands are
1772 * preserved.
1773 *
1774 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1775 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1776 * commands are only considered as Indirect State Pointers. Once ISP is
1777 * issued in a context, SW must initialize by programming push constant
1778 * commands for all the shaders (at least to zero length) before attempting
1779 * any rendering operation for the same context."
1780 *
1781 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1782 * even though they point to a BO that has been already unreferenced at
1783 * the end of the previous batch buffer. This has been fine so far since
1784 * we are protected by the scratch page (every address not covered by
1785 * a BO should be pointing to the scratch page). But on CNL, it is
1786 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1787 * instruction.
1788 *
1789 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1790 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1791 * context restore, so the mentioned hang doesn't happen. However,
1792 * software must program push constant commands for all stages prior to
1793 * rendering anything. So we flag them dirty in BeginCommandBuffer.
1794 *
1795 * Finally, we also make sure to stall at pixel scoreboard to make sure the
1796 * constants have been loaded into the EUs prior to disabling the push constants
1797 * so that it doesn't hang a previous 3DPRIMITIVE.
1798 */
1799 static void
1800 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1801 {
1802 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1803 pc.StallAtPixelScoreboard = true;
1804 pc.CommandStreamerStallEnable = true;
1805 anv_debug_dump_pc(pc);
1806 }
1807 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1808 pc.IndirectStatePointersDisable = true;
1809 pc.CommandStreamerStallEnable = true;
1810 anv_debug_dump_pc(pc);
1811 }
1812 }
1813
1814 VkResult
1815 genX(EndCommandBuffer)(
1816 VkCommandBuffer commandBuffer)
1817 {
1818 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1819
1820 if (anv_batch_has_error(&cmd_buffer->batch))
1821 return cmd_buffer->batch.status;
1822
1823 anv_measure_endcommandbuffer(cmd_buffer);
1824
1825 /* We want every command buffer to start with the PMA fix in a known state,
1826 * so we disable it at the end of the command buffer.
1827 */
1828 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1829
1830 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1831
1832 emit_isp_disable(cmd_buffer);
1833
1834 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
1835
1836 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1837
1838 return VK_SUCCESS;
1839 }
1840
1841 void
1842 genX(CmdExecuteCommands)(
1843 VkCommandBuffer commandBuffer,
1844 uint32_t commandBufferCount,
1845 const VkCommandBuffer* pCmdBuffers)
1846 {
1847 ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1848
1849 assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1850
1851 if (anv_batch_has_error(&primary->batch))
1852 return;
1853
1854 /* The secondary command buffers will assume that the PMA fix is disabled
1855 * when they begin executing. Make sure this is true.
1856 */
1857 genX(cmd_buffer_enable_pma_fix)(primary, false);
1858
1859 /* The secondary command buffer doesn't know which textures etc. have been
1860 * flushed prior to their execution. Apply those flushes now.
1861 */
1862 genX(cmd_buffer_apply_pipe_flushes)(primary);
1863
1864 for (uint32_t i = 0; i < commandBufferCount; i++) {
1865 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1866
1867 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1868 assert(!anv_batch_has_error(&secondary->batch));
1869
1870 #if GFX_VERx10 >= 75
1871 if (secondary->state.conditional_render_enabled) {
1872 if (!primary->state.conditional_render_enabled) {
1873 /* Secondary buffer is constructed as if it will be executed
1874 * with conditional rendering, we should satisfy this dependency
1875 * regardless of conditional rendering being enabled in primary.
1876 */
1877 struct mi_builder b;
1878 mi_builder_init(&b, &primary->device->info, &primary->batch);
1879 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1880 mi_imm(UINT64_MAX));
1881 }
1882 }
1883 #endif
1884
1885 if (secondary->usage_flags &
1886 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1887 /* If we're continuing a render pass from the primary, we need to
1888 * copy the surface states for the current subpass into the storage
1889 * we allocated for them in BeginCommandBuffer.
1890 */
1891 struct anv_bo *ss_bo =
1892 primary->device->surface_state_pool.block_pool.bo;
1893 struct anv_state src_state = primary->state.gfx.att_states;
1894 struct anv_state dst_state = secondary->state.gfx.att_states;
1895 assert(src_state.alloc_size == dst_state.alloc_size);
1896
1897 genX(cmd_buffer_so_memcpy)(primary,
1898 (struct anv_address) {
1899 .bo = ss_bo,
1900 .offset = dst_state.offset,
1901 },
1902 (struct anv_address) {
1903 .bo = ss_bo,
1904 .offset = src_state.offset,
1905 },
1906 src_state.alloc_size);
1907 }
1908
1909 anv_cmd_buffer_add_secondary(primary, secondary);
1910
1911 assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
1912 secondary->perf_query_pool == primary->perf_query_pool);
1913 if (secondary->perf_query_pool)
1914 primary->perf_query_pool = secondary->perf_query_pool;
1915
1916 #if GFX_VERx10 == 120
1917 if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
1918 primary->state.depth_reg_mode = secondary->state.depth_reg_mode;
1919 #endif
1920 }
1921
1922 /* The secondary isn't counted in our VF cache tracking so we need to
1923 * invalidate the whole thing.
1924 */
1925 if (GFX_VER >= 8 && GFX_VER <= 9) {
1926 anv_add_pending_pipe_bits(primary,
1927 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1928 "Secondary cmd buffer not tracked in VF cache");
1929 }
1930
1931 /* The secondary may have selected a different pipeline (3D or compute) and
1932 * may have changed the current L3$ configuration. Reset our tracking
1933 * variables to invalid values to ensure that we re-emit these in the case
1934 * where we do any draws or compute dispatches from the primary after the
1935 * secondary has returned.
1936 */
1937 primary->state.current_pipeline = UINT32_MAX;
1938 primary->state.current_l3_config = NULL;
1939 primary->state.current_hash_scale = 0;
1940 primary->state.gfx.push_constant_stages = 0;
1941 vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
1942
1943 /* Each of the secondary command buffers will use its own state base
1944 * address. We need to re-emit state base address for the primary after
1945 * all of the secondaries are done.
1946 *
1947 * TODO: Maybe we want to make this a dirty bit to avoid extra state base
1948 * address calls?
1949 */
1950 genX(cmd_buffer_emit_state_base_address)(primary);
1951 }
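
/* Client-side usage of the entry point above is simply (illustrative;
 * "primary" and "secondary" are application-owned handles):
 *
 *    vkCmdExecuteCommands(primary, 1, &secondary);
 *
 * After it returns, the primary intentionally forgets its current pipeline,
 * L3 configuration, hash scale and dynamic state, and re-emits
 * STATE_BASE_ADDRESS, so the next draw or dispatch recorded into the
 * primary starts from known state.
 */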
1952
1953 /**
1954 * Program the hardware to use the specified L3 configuration.
1955 */
1956 void
1957 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1958 const struct intel_l3_config *cfg)
1959 {
1960 assert(cfg || GFX_VER >= 12);
1961 if (cfg == cmd_buffer->state.current_l3_config)
1962 return;
1963
1964 #if GFX_VER >= 11
1965 /* On Gfx11+ we use only one config, so verify it remains the same and skip
1966 * the stalling programming entirely.
1967 */
1968 assert(cfg == cmd_buffer->device->l3_config);
1969 #else
1970 if (INTEL_DEBUG(DEBUG_L3)) {
1971 mesa_logd("L3 config transition: ");
1972 intel_dump_l3_config(cfg, stderr);
1973 }
1974
1975 /* According to the hardware docs, the L3 partitioning can only be changed
1976 * while the pipeline is completely drained and the caches are flushed,
1977 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1978 */
1979 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1980 pc.DCFlushEnable = true;
1981 pc.PostSyncOperation = NoWrite;
1982 pc.CommandStreamerStallEnable = true;
1983 anv_debug_dump_pc(pc);
1984 }
1985
1986 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1987 * invalidation of the relevant caches. Note that because RO invalidation
1988 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1989 * command is processed by the CS) we cannot combine it with the previous
1990 * stalling flush as the hardware documentation suggests, because that
1991 * would cause the CS to stall on previous rendering *after* RO
1992 * invalidation and wouldn't prevent the RO caches from being polluted by
1993 * concurrent rendering before the stall completes. This intentionally
1994 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1995 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1996 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1997 * already guarantee that there is no concurrent GPGPU kernel execution
1998 * (see SKL HSD 2132585).
1999 */
2000 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2001 pc.TextureCacheInvalidationEnable = true;
2002 pc.ConstantCacheInvalidationEnable = true;
2003 pc.InstructionCacheInvalidateEnable = true;
2004 pc.StateCacheInvalidationEnable = true;
2005 pc.PostSyncOperation = NoWrite;
2006 anv_debug_dump_pc(pc);
2007 }
2008
2009 /* Now send a third stalling flush to make sure that invalidation is
2010 * complete when the L3 configuration registers are modified.
2011 */
2012 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2013 pc.DCFlushEnable = true;
2014 pc.PostSyncOperation = NoWrite;
2015 pc.CommandStreamerStallEnable = true;
2016 anv_debug_dump_pc(pc);
2017 }
2018
2019 genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
2020 #endif /* GFX_VER >= 11 */
2021 cmd_buffer->state.current_l3_config = cfg;
2022 }
2023
2024 enum anv_pipe_bits
2025 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
2026 struct anv_device *device,
2027 uint32_t current_pipeline,
2028 enum anv_pipe_bits bits)
2029 {
2030 /*
2031 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
2032 *
2033 * Write synchronization is a special case of end-of-pipe
2034 * synchronization that requires that the render cache and/or depth
2035 * related caches are flushed to memory, where the data will become
2036 * globally visible. This type of synchronization is required prior to
2037 * SW (CPU) actually reading the result data from memory, or initiating
2038 * an operation that will use as a read surface (such as a texture
2039 * surface) a previous render target and/or depth/stencil buffer
2040 *
2041 *
2042 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2043 *
2044 * Exercising the write cache flush bits (Render Target Cache Flush
2045 * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
2046 * ensures the write caches are flushed and doesn't guarantee the data
2047 * is globally visible.
2048 *
2049 * SW can track the completion of the end-of-pipe-synchronization by
2050 * using "Notify Enable" and "PostSync Operation - Write Immediate
2051 * Data" in the PIPE_CONTROL command.
2052 *
2053 * In other words, flushes are pipelined while invalidations are handled
2054 * immediately. Therefore, if we're flushing anything then we need to
2055 * schedule an end-of-pipe sync before any invalidations can happen.
2056 */
2057 if (bits & ANV_PIPE_FLUSH_BITS)
2058 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2059
2060
2061 /* HSD 1209978178: docs say that before programming the aux table:
2062 *
2063 * "Driver must ensure that the engine is IDLE but ensure it doesn't
2064 * add extra flushes in the case it knows that the engine is already
2065 * IDLE."
2066 */
2067 if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))
2068 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2069
2070 /* If we're going to do an invalidate and we have a pending end-of-pipe
2071 * sync that has yet to be resolved, we do the end-of-pipe sync now.
2072 */
2073 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
2074 (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
2075 bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
2076 bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
2077 }
2078
2079 /* Project: SKL / Argument: LRI Post Sync Operation [23]
2080 *
2081 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
2082 * programmed prior to programming a PIPECONTROL command with "LRI
2083 * Post Sync Operation" in GPGPU mode of operation (i.e when
2084 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
2085 *
2086 * The same text exists a few rows below for Post Sync Op.
2087 */
2088 if (bits & ANV_PIPE_POST_SYNC_BIT) {
2089 if (GFX_VER == 9 && current_pipeline == GPGPU)
2090 bits |= ANV_PIPE_CS_STALL_BIT;
2091 bits &= ~ANV_PIPE_POST_SYNC_BIT;
2092 }
2093
2094 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2095 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
2096 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2097 #if GFX_VER >= 12
2098 pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2099 pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2100 #else
2101 /* Flushing HDC pipeline requires DC Flush on earlier HW. */
2102 pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2103 #endif
2104 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2105 pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2106 pipe.RenderTargetCacheFlushEnable =
2107 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2108
2109 /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2110 * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
2111 */
2112 #if GFX_VER >= 12
2113 pipe.DepthStallEnable =
2114 pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);
2115 #else
2116 pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2117 #endif
2118
2119 #if GFX_VERx10 >= 125
2120 pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2121 #endif
2122
2123 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2124 #if GFX_VER == 8
2125 /* From Broadwell PRM, volume 2a:
2126 * PIPE_CONTROL: Command Streamer Stall Enable:
2127 *
2128 * "This bit must be always set when PIPE_CONTROL command is
2129 * programmed by GPGPU and MEDIA workloads, except for the cases
2130 * when only Read Only Cache Invalidation bits are set (State
2131 * Cache Invalidation Enable, Instruction cache Invalidation
2132 * Enable, Texture Cache Invalidation Enable, Constant Cache
2133 * Invalidation Enable). This is to WA FFDOP CG issue, this WA
2134 * need not implemented when FF_DOP_CG is disabled."
2135 *
2136 * Since we do all the invalidation in the following PIPE_CONTROL,
2137 * if we got here, we need a stall.
2138 */
2139 pipe.CommandStreamerStallEnable |= current_pipeline == GPGPU;
2140 #endif
2141
2142 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2143
2144 /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
2145 *
2146 * "The most common action to perform upon reaching a
2147 * synchronization point is to write a value out to memory. An
2148 * immediate value (included with the synchronization command) may
2149 * be written."
2150 *
2151 *
2152 * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
2153 *
2154 * "In case the data flushed out by the render engine is to be
2155 * read back in to the render engine in coherent manner, then the
2156 * render engine has to wait for the fence completion before
2157 * accessing the flushed data. This can be achieved by following
2158 * means on various products: PIPE_CONTROL command with CS Stall
2159 * and the required write caches flushed with Post-Sync-Operation
2160 * as Write Immediate Data.
2161 *
2162 * Example:
2163 * - Workload-1 (3D/GPGPU/MEDIA)
2164 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2165 * Immediate Data, Required Write Cache Flush bits set)
2166 * - Workload-2 (Can use the data produce or output by
2167 * Workload-1)
2168 */
2169 if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
2170 pipe.CommandStreamerStallEnable = true;
2171 pipe.PostSyncOperation = WriteImmediateData;
2172 pipe.Address = device->workaround_address;
2173 }
2174
2175 /*
2176 * According to the Broadwell documentation, any PIPE_CONTROL with the
2177 * "Command Streamer Stall" bit set must also have another bit set,
2178 * with six different options:
2179 *
2180 * - Render Target Cache Flush
2181 * - Depth Cache Flush
2182 * - Stall at Pixel Scoreboard
2183 * - Post-Sync Operation
2184 * - Depth Stall
2185 * - DC Flush Enable
2186 *
2187 * I chose "Stall at Pixel Scoreboard" since that's what we use in
2188 * mesa and it seems to work fine. The choice is fairly arbitrary.
2189 */
2190 if (pipe.CommandStreamerStallEnable &&
2191 !pipe.RenderTargetCacheFlushEnable &&
2192 !pipe.DepthCacheFlushEnable &&
2193 !pipe.StallAtPixelScoreboard &&
2194 !pipe.PostSyncOperation &&
2195 !pipe.DepthStallEnable &&
2196 !pipe.DCFlushEnable)
2197 pipe.StallAtPixelScoreboard = true;
2198 anv_debug_dump_pc(pipe);
2199 }
2200
2201 /* If a render target flush was emitted, then we can toggle off the bit
2202 * saying that render target writes are ongoing.
2203 */
2204 if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
2205 bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
2206
2207 if (GFX_VERx10 == 75) {
2208 /* Haswell needs additional workarounds:
2209 *
2210 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
2211 *
2212 * Option 1:
2213 * PIPE_CONTROL command with the CS Stall and the required write
2214 * caches flushed with Post-SyncOperation as Write Immediate Data
2215 * followed by eight dummy MI_STORE_DATA_IMM (write to scratch
2216 * space) commands.
2217 *
2218 * Example:
2219 * - Workload-1
2220 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
2221 * Immediate Data, Required Write Cache Flush bits set)
2222 * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
2223 * - Workload-2 (Can use the data produce or output by
2224 * Workload-1)
2225 *
2226 * Unfortunately, both the PRMs and the internal docs are a bit
2227 * out-of-date in this regard. What the windows driver does (and
2228 * this appears to actually work) is to emit a register read from the
2229 * memory address written by the pipe control above.
2230 *
2231 * What register we load into doesn't matter. We choose an indirect
2232 * rendering register because we know it always exists and it's one
2233 * of the first registers the command parser allows us to write. If
2234 * you don't have command parser support in your kernel (pre-4.2),
2235 * this will get turned into MI_NOOP and you won't get the
2236 * workaround. Unfortunately, there's just not much we can do in
2237 * that case. This register is perfectly safe to write since we
2238 * always re-load all of the indirect draw registers right before
2239 * 3DPRIMITIVE when needed anyway.
2240 */
2241 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2242 lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
2243 lrm.MemoryAddress = device->workaround_address;
2244 }
2245 }
2246
2247 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
2248 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2249 }
2250
2251 if (bits & ANV_PIPE_INVALIDATE_BITS) {
2252 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2253 *
2254 * "If the VF Cache Invalidation Enable is set to a 1 in a
2255 * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
2256 * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent
2257 * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
2258 * a 1."
2259 *
2260 * This appears to hang Broadwell, so we restrict it to just gfx9.
2261 */
2262 if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
2263 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
2264
2265 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2266 pipe.StateCacheInvalidationEnable =
2267 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2268 pipe.ConstantCacheInvalidationEnable =
2269 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2270 #if GFX_VER >= 12
2271 /* Invalidates the L3 cache part in which index & vertex data is loaded
2272 * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2273 */
2274 pipe.L3ReadOnlyCacheInvalidationEnable =
2275 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2276 #endif
2277 pipe.VFCacheInvalidationEnable =
2278 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2279 pipe.TextureCacheInvalidationEnable =
2280 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2281 pipe.InstructionCacheInvalidateEnable =
2282 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2283
2284 #if GFX_VER >= 9 && GFX_VER <= 11
2285 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2286 *
2287 * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
2288 * always set for GPGPU workloads when “Texture Cache
2289 * Invalidation Enable” bit is set".
2290 *
2291 * Workaround stopped appearing in TGL PRMs.
2292 */
2293 if (current_pipeline == GPGPU && pipe.TextureCacheInvalidationEnable)
2294 pipe.CommandStreamerStallEnable = true;
2295 #endif
2296
2297 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2298 *
2299 * "When VF Cache Invalidate is set “Post Sync Operation” must be
2300 * enabled to “Write Immediate Data” or “Write PS Depth Count” or
2301 * “Write Timestamp”.
2302 */
2303 if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {
2304 pipe.PostSyncOperation = WriteImmediateData;
2305 pipe.Address = device->workaround_address;
2306 }
2307 anv_debug_dump_pc(pipe);
2308 }
2309
2310 #if GFX_VER == 12
2311 if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info.has_aux_map) {
2312 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2313 lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
2314 lri.DataDWord = 1;
2315 }
2316 }
2317 #endif
2318
2319 bits &= ~ANV_PIPE_INVALIDATE_BITS;
2320 }
2321
2322 return bits;
2323 }
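
/* Usage sketch for the helper above, mirroring the call made from
 * genX(cmd_buffer_apply_pipe_flushes) below (the particular bit combination
 * is only an example):
 *
 *    enum anv_pipe_bits pending =
 *       genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
 *                                     cmd_buffer->device,
 *                                     cmd_buffer->state.current_pipeline,
 *                                     ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
 *                                     ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT);
 *
 * Because a flush bit is present, an end-of-pipe sync is scheduled and
 * emitted before the texture cache invalidation, matching the "flushes are
 * pipelined, invalidations happen immediately" rule quoted at the top of
 * the function. Any bits that were not emitted are returned and must stay
 * pending on the caller's side.
 */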
2324
2325 void
2326 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
2327 {
2328 #if GFX_VERx10 == 120
2329 /* If we're changing the state of the RHWO optimization, we need to have
2330 * sb_stall+cs_stall.
2331 */
2332 const bool rhwo_opt_change =
2333 cmd_buffer->state.rhwo_optimization_enabled !=
2334 cmd_buffer->state.pending_rhwo_optimization_enabled;
2335 if (rhwo_opt_change) {
2336 anv_add_pending_pipe_bits(cmd_buffer,
2337 ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
2338 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
2339 "change RHWO optimization");
2340 }
2341 #endif
2342
2343 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
2344
2345 if (unlikely(cmd_buffer->device->physical->always_flush_cache))
2346 bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
2347 else if (bits == 0)
2348 return;
2349
2350 bool trace_flush =
2351 (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0;
2352 if (trace_flush)
2353 trace_intel_begin_stall(&cmd_buffer->trace);
2354
2355 if ((GFX_VER >= 8 && GFX_VER <= 9) &&
2356 (bits & ANV_PIPE_CS_STALL_BIT) &&
2357 (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
2358 /* If we are doing a VF cache invalidate AND a CS stall (it must be
2359 * both) then we can reset our vertex cache tracking.
2360 */
2361 memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
2362 sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
2363 memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
2364 sizeof(cmd_buffer->state.gfx.ib_dirty_range));
2365 }
2366
2367 cmd_buffer->state.pending_pipe_bits =
2368 genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
2369 cmd_buffer->device,
2370 cmd_buffer->state.current_pipeline,
2371 bits);
2372
2373 #if GFX_VERx10 == 120
2374 /* Wa_1508744258 handling */
2375 if (rhwo_opt_change) {
2376 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
2377 c1.RCCRHWOOptimizationDisable =
2378 !cmd_buffer->state.pending_rhwo_optimization_enabled;
2379 c1.RCCRHWOOptimizationDisableMask = true;
2380 }
2381 cmd_buffer->state.rhwo_optimization_enabled =
2382 cmd_buffer->state.pending_rhwo_optimization_enabled;
2383 }
2384 #endif
2385
2386 if (trace_flush) {
2387 trace_intel_end_stall(&cmd_buffer->trace, bits,
2388 anv_pipe_flush_bit_to_ds_stall_flag, NULL);
2389 }
2390 }
2391
2392 static void
2393 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
2394 const VkDependencyInfo *dep_info,
2395 const char *reason)
2396 {
2397 /* XXX: Right now, we're really dumb and just flush whatever categories
2398 * the app asks for. One of these days we may make this a bit better
2399 * but right now that's all the hardware allows for in most areas.
2400 */
2401 VkAccessFlags2 src_flags = 0;
2402 VkAccessFlags2 dst_flags = 0;
2403
2404 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
2405 src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
2406 dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
2407 }
2408
2409 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
2410 src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
2411 dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
2412 }
2413
2414 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
2415 const VkImageMemoryBarrier2 *img_barrier =
2416 &dep_info->pImageMemoryBarriers[i];
2417
2418 src_flags |= img_barrier->srcAccessMask;
2419 dst_flags |= img_barrier->dstAccessMask;
2420
2421 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
2422 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
2423
2424 uint32_t base_layer, layer_count;
2425 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
2426 base_layer = 0;
2427 layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel);
2428 } else {
2429 base_layer = range->baseArrayLayer;
2430 layer_count = vk_image_subresource_layer_count(&image->vk, range);
2431 }
2432 const uint32_t level_count =
2433 vk_image_subresource_level_count(&image->vk, range);
2434
2435 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2436 transition_depth_buffer(cmd_buffer, image,
2437 base_layer, layer_count,
2438 img_barrier->oldLayout,
2439 img_barrier->newLayout,
2440 false /* will_full_fast_clear */);
2441 }
2442
2443 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2444 transition_stencil_buffer(cmd_buffer, image,
2445 range->baseMipLevel, level_count,
2446 base_layer, layer_count,
2447 img_barrier->oldLayout,
2448 img_barrier->newLayout,
2449 false /* will_full_fast_clear */);
2450 }
2451
2452 if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
2453 VkImageAspectFlags color_aspects =
2454 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
2455 anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
2456 transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
2457 range->baseMipLevel, level_count,
2458 base_layer, layer_count,
2459 img_barrier->oldLayout,
2460 img_barrier->newLayout,
2461 img_barrier->srcQueueFamilyIndex,
2462 img_barrier->dstQueueFamilyIndex,
2463 false /* will_full_fast_clear */);
2464 }
2465 }
2466 }
2467
2468 enum anv_pipe_bits bits =
2469 anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
2470 anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
2471
2472 anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2473 }
2474
2475 void genX(CmdPipelineBarrier2)(
2476 VkCommandBuffer commandBuffer,
2477 const VkDependencyInfo* pDependencyInfo)
2478 {
2479 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2480
2481 cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2482 }
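
/* Example of an application barrier that ends up in cmd_buffer_barrier()
 * above (illustrative client code, not part of the driver; "cmd" is an
 * application-owned command buffer handle):
 *
 *    const VkMemoryBarrier2 barrier = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
 *       .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
 *    };
 *    const VkDependencyInfo dep = {
 *       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
 *       .memoryBarrierCount = 1,
 *       .pMemoryBarriers = &barrier,
 *    };
 *    vkCmdPipelineBarrier2(cmd, &dep);
 *
 * The source access mask is translated into flush bits and the destination
 * access mask into invalidate bits, which are queued with
 * anv_add_pending_pipe_bits() and emitted by the next
 * genX(cmd_buffer_apply_pipe_flushes)().
 */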
2483
2484 static void
2485 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2486 {
2487 VkShaderStageFlags stages =
2488 cmd_buffer->state.gfx.pipeline->active_stages;
2489
2490 /* In order to avoid thrashing, we assume that vertex and fragment stages
2491 * always exist. In the rare case where one is missing *and* the other
2492 * uses push constants, this may be suboptimal. However, avoiding stalls
2493 * seems more important.
2494 */
2495 stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2496 if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline))
2497 stages |= VK_SHADER_STAGE_VERTEX_BIT;
2498
2499 if (stages == cmd_buffer->state.gfx.push_constant_stages)
2500 return;
2501
2502 const unsigned push_constant_kb =
2503 cmd_buffer->device->info.max_constant_urb_size_kb;
2504
2505 const unsigned num_stages =
2506 util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2507 unsigned size_per_stage = push_constant_kb / num_stages;
2508
2509 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2510 * units of 2KB. Incidentally, these are the same platforms that have
2511 * 32KB worth of push constant space.
2512 */
2513 if (push_constant_kb == 32)
2514 size_per_stage &= ~1u;
2515
2516 uint32_t kb_used = 0;
2517 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2518 unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2519 anv_batch_emit(&cmd_buffer->batch,
2520 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
2521 alloc._3DCommandSubOpcode = 18 + i;
2522 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2523 alloc.ConstantBufferSize = push_size;
2524 }
2525 kb_used += push_size;
2526 }
2527
2528 anv_batch_emit(&cmd_buffer->batch,
2529 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2530 alloc.ConstantBufferOffset = kb_used;
2531 alloc.ConstantBufferSize = push_constant_kb - kb_used;
2532 }
2533
2534 #if GFX_VERx10 == 125
2535 /* Wa_22011440098
2536 *
2537 * In 3D mode, after programming push constant alloc command immediately
2538 * program push constant command(ZERO length) without any commit between
2539 * them.
2540 */
2541 if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
2542 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
2543 /* Update empty push constants for all stages (bitmask = 11111b) */
2544 c.ShaderUpdateEnable = 0x1f;
2545 c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
2546 }
2547 }
2548 #endif
2549
2550 cmd_buffer->state.gfx.push_constant_stages = stages;
2551
2552 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2553 *
2554 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2555 * the next 3DPRIMITIVE command after programming the
2556 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2557 *
2558 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2559 * pipeline setup, we need to dirty push constants.
2560 */
2561 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2562 }
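
/* Worked example for the partitioning above (illustrative): on a 32KB push
 * constant platform with all five graphics stages enabled, num_stages == 5
 * and size_per_stage == 32 / 5 == 6KB, which is already a multiple of 2KB.
 * The loop then programs the VS/HS/DS/GS allocations with 6KB each at
 * offsets 0/6/12/18KB, and the final PS allocation receives the remaining
 * 32 - 24 = 8KB at offset 24KB.
 */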
2563
2564 static VkResult
2565 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2566 struct anv_cmd_pipeline_state *pipe_state,
2567 struct anv_shader_bin *shader,
2568 struct anv_state *bt_state)
2569 {
2570 uint32_t state_offset;
2571
2572 struct anv_pipeline_bind_map *map = &shader->bind_map;
2573 if (map->surface_count == 0) {
2574 *bt_state = (struct anv_state) { 0, };
2575 return VK_SUCCESS;
2576 }
2577
2578 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2579 map->surface_count,
2580 &state_offset);
2581 uint32_t *bt_map = bt_state->map;
2582
2583 if (bt_state->map == NULL)
2584 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2585
2586 /* We only need to emit relocs if we're not using softpin. If we are using
2587 * softpin then we always keep all user-allocated memory objects resident.
2588 */
2589 const bool need_client_mem_relocs =
2590 anv_use_relocations(cmd_buffer->device->physical);
2591 struct anv_push_constants *push = &pipe_state->push_constants;
2592
2593 for (uint32_t s = 0; s < map->surface_count; s++) {
2594 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2595
2596 struct anv_state surface_state;
2597
2598 switch (binding->set) {
2599 case ANV_DESCRIPTOR_SET_NULL:
2600 bt_map[s] = 0;
2601 break;
2602
2603 case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2604 /* Color attachment binding */
2605 assert(shader->stage == MESA_SHADER_FRAGMENT);
2606 if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2607 const struct anv_attachment *att =
2608 &cmd_buffer->state.gfx.color_att[binding->index];
2609 surface_state = att->surface_state.state;
2610 } else {
2611 surface_state = cmd_buffer->state.gfx.null_surface_state;
2612 }
2613 assert(surface_state.map);
2614 bt_map[s] = surface_state.offset + state_offset;
2615 break;
2616
2617 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2618 struct anv_state surface_state =
2619 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2620
2621 struct anv_address constant_data = {
2622 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2623 .offset = shader->kernel.offset +
2624 shader->prog_data->const_data_offset,
2625 };
2626 unsigned constant_data_size = shader->prog_data->const_data_size;
2627
2628 const enum isl_format format =
2629 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2630 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2631 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2632 format, ISL_SWIZZLE_IDENTITY,
2633 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2634 constant_data, constant_data_size, 1);
2635
2636 assert(surface_state.map);
2637 bt_map[s] = surface_state.offset + state_offset;
2638 add_surface_reloc(cmd_buffer, surface_state, constant_data);
2639 break;
2640 }
2641
2642 case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2643 /* This is always the first binding for compute shaders */
2644 assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2645
2646 struct anv_state surface_state =
2647 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2648
2649 const enum isl_format format =
2650 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2651 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2652 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2653 format, ISL_SWIZZLE_IDENTITY,
2654 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2655 cmd_buffer->state.compute.num_workgroups,
2656 12, 1);
2657
2658 assert(surface_state.map);
2659 bt_map[s] = surface_state.offset + state_offset;
2660 if (need_client_mem_relocs) {
2661 add_surface_reloc(cmd_buffer, surface_state,
2662 cmd_buffer->state.compute.num_workgroups);
2663 }
2664 break;
2665 }
2666
2667 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2668 /* This is a descriptor set buffer so the set index is actually
2669 * given by binding->binding. (Yes, that's confusing.)
2670 */
2671 struct anv_descriptor_set *set =
2672 pipe_state->descriptors[binding->index];
2673 assert(set->desc_mem.alloc_size);
2674 assert(set->desc_surface_state.alloc_size);
2675 bt_map[s] = set->desc_surface_state.offset + state_offset;
2676 add_surface_reloc(cmd_buffer, set->desc_surface_state,
2677 anv_descriptor_set_address(set));
2678 break;
2679 }
2680
2681 default: {
2682 assert(binding->set < MAX_SETS);
2683 const struct anv_descriptor_set *set =
2684 pipe_state->descriptors[binding->set];
2685 if (binding->index >= set->descriptor_count) {
2686 /* From the Vulkan spec section entitled "DescriptorSet and
2687 * Binding Assignment":
2688 *
2689 * "If the array is runtime-sized, then array elements greater
2690 * than or equal to the size of that binding in the bound
2691 * descriptor set must not be used."
2692 *
2693 * Unfortunately, the compiler isn't smart enough to figure out
2694 * when a dynamic binding isn't used so it may grab the whole
2695 * array and stick it in the binding table. In this case, it's
2696 * safe to just skip those bindings that are OOB.
2697 */
2698 assert(binding->index < set->layout->descriptor_count);
2699 continue;
2700 }
2701 const struct anv_descriptor *desc = &set->descriptors[binding->index];
2702
2703 switch (desc->type) {
2704 case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
2705 case VK_DESCRIPTOR_TYPE_SAMPLER:
2706 /* Nothing for us to do here */
2707 continue;
2708
2709 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2710 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2711 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
2712 if (desc->image_view) {
2713 struct anv_surface_state sstate =
2714 (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2715 desc->image_view->planes[binding->plane].general_sampler_surface_state :
2716 desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2717 surface_state = sstate.state;
2718 assert(surface_state.alloc_size);
2719 if (need_client_mem_relocs)
2720 add_surface_state_relocs(cmd_buffer, sstate);
2721 } else {
2722 surface_state = cmd_buffer->device->null_surface_state;
2723 }
2724 break;
2725 }
2726
2727 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2728 if (desc->image_view) {
2729 struct anv_surface_state sstate =
2730 binding->lowered_storage_surface
2731 ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2732 : desc->image_view->planes[binding->plane].storage_surface_state;
2733 surface_state = sstate.state;
2734 assert(surface_state.alloc_size);
2735 if (surface_state.offset == 0) {
2736 mesa_loge("Bound an image to a descriptor where the "
2737 "descriptor does not have NonReadable "
2738 "set and the image does not have a "
2739 "corresponding SPIR-V format enum.");
2740 vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2741 VK_DEBUG_REPORT_ERROR_BIT_EXT,
2742 &desc->image_view->vk.base,
2743 __LINE__, 0, "anv",
2744 "Bound an image to a descriptor where the "
2745 "descriptor does not have NonReadable "
2746 "set and the image does not have a "
2747 "corresponding SPIR-V format enum.");
2748 }
2749 if (surface_state.offset && need_client_mem_relocs)
2750 add_surface_state_relocs(cmd_buffer, sstate);
2751 } else {
2752 surface_state = cmd_buffer->device->null_surface_state;
2753 }
2754 break;
2755 }
2756
2757 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2758 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2759 if (desc->set_buffer_view) {
2760 surface_state = desc->set_buffer_view->surface_state;
2761 assert(surface_state.alloc_size);
2762 if (need_client_mem_relocs) {
2763 add_surface_reloc(cmd_buffer, surface_state,
2764 desc->set_buffer_view->address);
2765 }
2766 } else {
2767 surface_state = cmd_buffer->device->null_surface_state;
2768 }
2769 break;
2770
2771 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2772 if (desc->buffer_view) {
2773 surface_state = desc->buffer_view->surface_state;
2774 assert(surface_state.alloc_size);
2775 if (need_client_mem_relocs) {
2776 add_surface_reloc(cmd_buffer, surface_state,
2777 desc->buffer_view->address);
2778 }
2779 } else {
2780 surface_state = cmd_buffer->device->null_surface_state;
2781 }
2782 break;
2783
2784 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2785 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2786 if (desc->buffer) {
2787 /* Compute the offset within the buffer */
2788 uint32_t dynamic_offset =
2789 push->dynamic_offsets[binding->dynamic_offset_index];
2790 uint64_t offset = desc->offset + dynamic_offset;
2791 /* Clamp to the buffer size */
2792 offset = MIN2(offset, desc->buffer->vk.size);
2793 /* Clamp the range to the buffer size */
2794 uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
2795
2796 /* Align the range for consistency */
2797 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2798 range = align_u32(range, ANV_UBO_ALIGNMENT);
2799
2800 struct anv_address address =
2801 anv_address_add(desc->buffer->address, offset);
2802
2803 surface_state =
2804 anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2805 enum isl_format format =
2806 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2807 desc->type);
2808
2809 isl_surf_usage_flags_t usage =
2810 desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2811 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2812 ISL_SURF_USAGE_STORAGE_BIT;
2813
2814 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2815 format, ISL_SWIZZLE_IDENTITY,
2816 usage, address, range, 1);
2817 if (need_client_mem_relocs)
2818 add_surface_reloc(cmd_buffer, surface_state, address);
2819 } else {
2820 surface_state = cmd_buffer->device->null_surface_state;
2821 }
2822 break;
2823 }
2824
2825 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2826 if (desc->buffer_view) {
2827 surface_state = binding->lowered_storage_surface
2828 ? desc->buffer_view->lowered_storage_surface_state
2829 : desc->buffer_view->storage_surface_state;
2830 assert(surface_state.alloc_size);
2831 if (need_client_mem_relocs) {
2832 add_surface_reloc(cmd_buffer, surface_state,
2833 desc->buffer_view->address);
2834 }
2835 } else {
2836 surface_state = cmd_buffer->device->null_surface_state;
2837 }
2838 break;
2839
2840 default:
2841 assert(!"Invalid descriptor type");
2842 continue;
2843 }
2844 assert(surface_state.map);
2845 bt_map[s] = surface_state.offset + state_offset;
2846 break;
2847 }
2848 }
2849 }
2850
2851 return VK_SUCCESS;
2852 }
2853
2854 static VkResult
2855 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2856 struct anv_cmd_pipeline_state *pipe_state,
2857 struct anv_shader_bin *shader,
2858 struct anv_state *state)
2859 {
2860 struct anv_pipeline_bind_map *map = &shader->bind_map;
2861 if (map->sampler_count == 0) {
2862 *state = (struct anv_state) { 0, };
2863 return VK_SUCCESS;
2864 }
2865
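   /* Each hardware SAMPLER_STATE entry is 4 dwords (16 bytes), so the table
    * is sampler_count * 16 bytes, allocated with 32-byte alignment below.
    */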
2866 uint32_t size = map->sampler_count * 16;
2867 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2868
2869 if (state->map == NULL)
2870 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2871
2872 for (uint32_t s = 0; s < map->sampler_count; s++) {
2873 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2874 const struct anv_descriptor *desc =
2875 &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2876
2877 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2878 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2879 continue;
2880
2881 struct anv_sampler *sampler = desc->sampler;
2882
2883 /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2884 * happens to be zero.
2885 */
2886 if (sampler == NULL)
2887 continue;
2888
2889 memcpy(state->map + (s * 16),
2890 sampler->state[binding->plane], sizeof(sampler->state[0]));
2891 }
2892
2893 return VK_SUCCESS;
2894 }
2895
2896 static uint32_t
2897 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2898 struct anv_cmd_pipeline_state *pipe_state,
2899 const VkShaderStageFlags dirty,
2900 struct anv_shader_bin **shaders,
2901 uint32_t num_shaders)
2902 {
2903 VkShaderStageFlags flushed = 0;
2904
2905 VkResult result = VK_SUCCESS;
2906 for (uint32_t i = 0; i < num_shaders; i++) {
2907 if (!shaders[i])
2908 continue;
2909
2910 gl_shader_stage stage = shaders[i]->stage;
2911 VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2912 if ((vk_stage & dirty) == 0)
2913 continue;
2914
2915 assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2916 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2917 &cmd_buffer->state.samplers[stage]);
2918 if (result != VK_SUCCESS)
2919 break;
2920
2921 assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2922 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2923 &cmd_buffer->state.binding_tables[stage]);
2924 if (result != VK_SUCCESS)
2925 break;
2926
2927 flushed |= vk_stage;
2928 }
2929
2930 if (result != VK_SUCCESS) {
2931 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2932
2933 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2934 if (result != VK_SUCCESS)
2935 return 0;
2936
2937 /* Re-emit state base addresses so we get the new surface state base
2938 * address before we start emitting binding tables etc.
2939 */
2940 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2941
2942 /* Re-emit all active binding tables */
2943 flushed = 0;
2944
2945 for (uint32_t i = 0; i < num_shaders; i++) {
2946 if (!shaders[i])
2947 continue;
2948
2949 gl_shader_stage stage = shaders[i]->stage;
2950
2951 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2952 &cmd_buffer->state.samplers[stage]);
2953 if (result != VK_SUCCESS) {
2954 anv_batch_set_error(&cmd_buffer->batch, result);
2955 return 0;
2956 }
2957 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2958 &cmd_buffer->state.binding_tables[stage]);
2959 if (result != VK_SUCCESS) {
2960 anv_batch_set_error(&cmd_buffer->batch, result);
2961 return 0;
2962 }
2963
2964 flushed |= mesa_to_vk_shader_stage(stage);
2965 }
2966 }
2967
2968 return flushed;
2969 }
2970
2971 static void
2972 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2973 uint32_t stages)
2974 {
2975 static const uint32_t sampler_state_opcodes[] = {
2976 [MESA_SHADER_VERTEX] = 43,
2977 [MESA_SHADER_TESS_CTRL] = 44, /* HS */
2978 [MESA_SHADER_TESS_EVAL] = 45, /* DS */
2979 [MESA_SHADER_GEOMETRY] = 46,
2980 [MESA_SHADER_FRAGMENT] = 47,
2981 };
2982
2983 static const uint32_t binding_table_opcodes[] = {
2984 [MESA_SHADER_VERTEX] = 38,
2985 [MESA_SHADER_TESS_CTRL] = 39,
2986 [MESA_SHADER_TESS_EVAL] = 40,
2987 [MESA_SHADER_GEOMETRY] = 41,
2988 [MESA_SHADER_FRAGMENT] = 42,
2989 };
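   /* These tables hold the _3DCommandSubOpcode values for the per-stage
    * 3DSTATE_SAMPLER_STATE_POINTERS_* and 3DSTATE_BINDING_TABLE_POINTERS_*
    * packets; the loop below emits the VS variant and overrides the
    * sub-opcode to retarget it at the right stage.
    */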
2990
2991 anv_foreach_stage(s, stages) {
2992 assert(s < ARRAY_SIZE(binding_table_opcodes));
2993
2994 if (cmd_buffer->state.samplers[s].alloc_size > 0) {
2995 anv_batch_emit(&cmd_buffer->batch,
2996 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
2997 ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
2998 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
2999 }
3000 }
3001
3002 /* Always emit binding table pointers if we're asked to, since on SKL
3003 * this is what flushes push constants. */
3004 anv_batch_emit(&cmd_buffer->batch,
3005 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
3006 btp._3DCommandSubOpcode = binding_table_opcodes[s];
3007 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
3008 }
3009 }
3010 }
3011
3012 static struct anv_address
3013 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
3014 const struct anv_shader_bin *shader,
3015 const struct anv_push_range *range)
3016 {
3017 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3018 switch (range->set) {
3019 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3020 /* This is a descriptor set buffer so the set index is
3021 * actually given by binding->binding. (Yes, that's
3022 * confusing.)
3023 */
3024 struct anv_descriptor_set *set =
3025 gfx_state->base.descriptors[range->index];
3026 return anv_descriptor_set_address(set);
3027 }
3028
3029 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
3030 if (gfx_state->base.push_constants_state.alloc_size == 0) {
3031 gfx_state->base.push_constants_state =
3032 anv_cmd_buffer_gfx_push_constants(cmd_buffer);
3033 }
3034 return (struct anv_address) {
3035 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3036 .offset = gfx_state->base.push_constants_state.offset,
3037 };
3038 }
3039
3040 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3041 return (struct anv_address) {
3042 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
3043 .offset = shader->kernel.offset +
3044 shader->prog_data->const_data_offset,
3045 };
3046
3047 default: {
3048 assert(range->set < MAX_SETS);
3049 struct anv_descriptor_set *set =
3050 gfx_state->base.descriptors[range->set];
3051 const struct anv_descriptor *desc =
3052 &set->descriptors[range->index];
3053
3054 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3055 if (desc->buffer_view)
3056 return desc->buffer_view->address;
3057 } else {
3058 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3059 if (desc->buffer) {
3060 const struct anv_push_constants *push =
3061 &gfx_state->base.push_constants;
3062 uint32_t dynamic_offset =
3063 push->dynamic_offsets[range->dynamic_offset_index];
3064 return anv_address_add(desc->buffer->address,
3065 desc->offset + dynamic_offset);
3066 }
3067 }
3068
3069 /* For NULL UBOs, we just return an address in the workaround BO. We do
3070 * writes to it for workarounds but always at the bottom. The higher
3071 * bytes should be all zeros.
3072 */
3073 assert(range->length * 32 <= 2048);
3074 return (struct anv_address) {
3075 .bo = cmd_buffer->device->workaround_bo,
3076 .offset = 1024,
3077 };
3078 }
3079 }
3080 }
3081
3082
3083 /** Returns the size in bytes of the bound buffer
3084 *
3085 * The range is relative to the start of the buffer, not the start of the
3086 * range. The returned range may be smaller than
3087 *
3088 * (range->start + range->length) * 32;
3089 */
3090 static uint32_t
3091 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
3092 const struct anv_shader_bin *shader,
3093 const struct anv_push_range *range)
3094 {
3095 assert(shader->stage != MESA_SHADER_COMPUTE);
3096 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3097 switch (range->set) {
3098 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
3099 struct anv_descriptor_set *set =
3100 gfx_state->base.descriptors[range->index];
3101 assert(range->start * 32 < set->desc_mem.alloc_size);
3102 assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
3103 return set->desc_mem.alloc_size;
3104 }
3105
3106 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
3107 return (range->start + range->length) * 32;
3108
3109 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
3110 return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
3111
3112 default: {
3113 assert(range->set < MAX_SETS);
3114 struct anv_descriptor_set *set =
3115 gfx_state->base.descriptors[range->set];
3116 const struct anv_descriptor *desc =
3117 &set->descriptors[range->index];
3118
3119 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
3120 /* Here we promote a UBO to a binding table entry so that we can avoid a
3121  * layer of indirection, using the descriptor set's internally allocated
3122  * surface state to fill the binding table entry. */
3123 if (!desc->set_buffer_view)
3124 return 0;
3125
3126 if (range->start * 32 > desc->set_buffer_view->range)
3127 return 0;
3128
3129 return desc->set_buffer_view->range;
3130 } else {
3131 if (!desc->buffer)
3132 return 0;
3133
3134 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
3135 /* Compute the offset within the buffer */
3136 const struct anv_push_constants *push =
3137 &gfx_state->base.push_constants;
3138 uint32_t dynamic_offset =
3139 push->dynamic_offsets[range->dynamic_offset_index];
3140 uint64_t offset = desc->offset + dynamic_offset;
3141 /* Clamp to the buffer size */
3142 offset = MIN2(offset, desc->buffer->vk.size);
3143 /* Clamp the range to the buffer size */
3144 uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
3145
3146 /* Align the range for consistency */
3147 bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);
3148
3149 return bound_range;
3150 }
3151 }
3152 }
3153 }
3154
3155 static void
3156 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
3157 gl_shader_stage stage,
3158 struct anv_address *buffers,
3159 unsigned buffer_count)
3160 {
3161 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3162 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3163
3164 static const uint32_t push_constant_opcodes[] = {
3165 [MESA_SHADER_VERTEX] = 21,
3166 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
3167 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
3168 [MESA_SHADER_GEOMETRY] = 22,
3169 [MESA_SHADER_FRAGMENT] = 23,
3170 };
3171
3172 assert(stage < ARRAY_SIZE(push_constant_opcodes));
3173
3174 UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
3175
3176 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
3177 c._3DCommandSubOpcode = push_constant_opcodes[stage];
3178
3179 /* Set MOCS, except on Gfx8, because the Broadwell PRM says:
3180 *
3181 * "Constant Buffer Object Control State must be always
3182 * programmed to zero."
3183 *
3184 * This restriction does not exist on any newer platforms.
3185 *
3186 * We only have one MOCS field for the whole packet, not one per
3187 * buffer. We could go out of our way here to walk over all of
3188 * the buffers and see if any of them are used externally and use
3189 * the external MOCS. However, the notion that someone would use
3190 * the same bit of memory for both scanout and a UBO is nuts.
3191 *
3192 * Let's not bother and assume it's all internal.
3193 */
3194 #if GFX_VER >= 9
3195 c.MOCS = mocs;
3196 #elif GFX_VER < 8
3197 c.ConstantBody.MOCS = mocs;
3198 #endif
3199
3200 if (anv_pipeline_has_stage(pipeline, stage)) {
3201 const struct anv_pipeline_bind_map *bind_map =
3202 &pipeline->shaders[stage]->bind_map;
3203
3204 #if GFX_VERx10 >= 75
3205 /* The Skylake PRM contains the following restriction:
3206 *
3207 * "The driver must ensure The following case does not occur
3208 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3209 * buffer 3 read length equal to zero committed followed by a
3210 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3211 * zero committed."
3212 *
3213 * To avoid this, we program the buffers in the highest slots.
3214 * This way, slot 0 is only used if slot 3 is also used.
3215 */
3216 assert(buffer_count <= 4);
3217 const unsigned shift = 4 - buffer_count;
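            /* Illustrative example: with buffer_count == 2 the shift is 2, so
             * ranges 0 and 1 land in constant buffer slots 2 and 3 and slots
             * 0-1 remain unused, satisfying the restriction quoted above.
             */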
3218 for (unsigned i = 0; i < buffer_count; i++) {
3219 const struct anv_push_range *range = &bind_map->push_ranges[i];
3220
3221 /* At this point we only have non-empty ranges */
3222 assert(range->length > 0);
3223
3224 /* For Ivy Bridge, make sure we only set the first range (actual
3225 * push constants)
3226 */
3227 assert((GFX_VERx10 >= 75) || i == 0);
3228
3229 c.ConstantBody.ReadLength[i + shift] = range->length;
3230 c.ConstantBody.Buffer[i + shift] =
3231 anv_address_add(buffers[i], range->start * 32);
3232 }
3233 #else
3234 /* For Ivy Bridge, push constants are relative to dynamic state
3235 * base address and we only ever push actual push constants.
3236 */
3237 if (bind_map->push_ranges[0].length > 0) {
3238 assert(buffer_count == 1);
3239 assert(bind_map->push_ranges[0].set ==
3240 ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
3241 assert(buffers[0].bo ==
3242 cmd_buffer->device->dynamic_state_pool.block_pool.bo);
3243 c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
3244 c.ConstantBody.Buffer[0].bo = NULL;
3245 c.ConstantBody.Buffer[0].offset = buffers[0].offset;
3246 }
3247 assert(bind_map->push_ranges[1].length == 0);
3248 assert(bind_map->push_ranges[2].length == 0);
3249 assert(bind_map->push_ranges[3].length == 0);
3250 #endif
3251 }
3252 }
3253 }
3254
3255 #if GFX_VER >= 12
3256 static void
3257 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
3258 uint32_t shader_mask,
3259 struct anv_address *buffers,
3260 uint32_t buffer_count)
3261 {
3262 if (buffer_count == 0) {
3263 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
3264 c.ShaderUpdateEnable = shader_mask;
3265 c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
3266 }
3267 return;
3268 }
3269
3270 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3271 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3272
3273 static const UNUSED uint32_t push_constant_opcodes[] = {
3274 [MESA_SHADER_VERTEX] = 21,
3275 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
3276 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
3277 [MESA_SHADER_GEOMETRY] = 22,
3278 [MESA_SHADER_FRAGMENT] = 23,
3279 };
3280
3281 gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
3282 assert(stage < ARRAY_SIZE(push_constant_opcodes));
3283
3284 const struct anv_pipeline_bind_map *bind_map =
3285 &pipeline->shaders[stage]->bind_map;
3286
3287 uint32_t *dw;
3288 const uint32_t buffer_mask = (1 << buffer_count) - 1;
3289 const uint32_t num_dwords = 2 + 2 * buffer_count;
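   /* Packet layout: two header dwords followed by one 2-dword
    * 3DSTATE_CONSTANT_ALL_DATA entry (pointer + read length) per buffer,
    * packed at dw + 2 + i * 2 below.
    */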
3290
3291 dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3292 GENX(3DSTATE_CONSTANT_ALL),
3293 .ShaderUpdateEnable = shader_mask,
3294 .PointerBufferMask = buffer_mask,
3295 .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
3296
3297 for (int i = 0; i < buffer_count; i++) {
3298 const struct anv_push_range *range = &bind_map->push_ranges[i];
3299 GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
3300 &cmd_buffer->batch, dw + 2 + i * 2,
3301 &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
3302 .PointerToConstantBuffer =
3303 anv_address_add(buffers[i], range->start * 32),
3304 .ConstantBufferReadLength = range->length,
3305 });
3306 }
3307 }
3308 #endif
3309
3310 static void
3311 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
3312 VkShaderStageFlags dirty_stages)
3313 {
3314 VkShaderStageFlags flushed = 0;
3315 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3316 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3317
3318 #if GFX_VER >= 12
3319 uint32_t nobuffer_stages = 0;
3320 #endif
3321
3322 /* Compute robust pushed register access mask for each stage. */
3323 if (cmd_buffer->device->robust_buffer_access) {
3324 anv_foreach_stage(stage, dirty_stages) {
3325 if (!anv_pipeline_has_stage(pipeline, stage))
3326 continue;
3327
3328 const struct anv_shader_bin *shader = pipeline->shaders[stage];
3329 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3330 struct anv_push_constants *push = &gfx_state->base.push_constants;
3331
3332 push->push_reg_mask[stage] = 0;
3333 /* Start of the current range in the shader, relative to the start of
3334 * push constants in the shader.
3335 */
3336 unsigned range_start_reg = 0;
3337 for (unsigned i = 0; i < 4; i++) {
3338 const struct anv_push_range *range = &bind_map->push_ranges[i];
3339 if (range->length == 0)
3340 continue;
3341
3342 unsigned bound_size =
3343 get_push_range_bound_size(cmd_buffer, shader, range);
3344 if (bound_size >= range->start * 32) {
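            /* bound_regs is the number of 32-byte push registers of this
             * range that are actually backed by the bound buffer; only those
             * registers are marked readable in push_reg_mask for the robust
             * buffer access path.
             */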
3345 unsigned bound_regs =
3346 MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
3347 range->length);
3348 assert(range_start_reg + bound_regs <= 64);
3349 push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
3350 bound_regs);
3351 }
3352
3353 cmd_buffer->state.push_constants_dirty |=
3354 mesa_to_vk_shader_stage(stage);
3355
3356 range_start_reg += range->length;
3357 }
3358 }
3359 }
3360
3361 /* Resets the push constant state so that we allocate a new one if
3362 * needed.
3363 */
3364 gfx_state->base.push_constants_state = ANV_STATE_NULL;
3365
3366 anv_foreach_stage(stage, dirty_stages) {
3367 unsigned buffer_count = 0;
3368 flushed |= mesa_to_vk_shader_stage(stage);
3369 UNUSED uint32_t max_push_range = 0;
3370
3371 struct anv_address buffers[4] = {};
3372 if (anv_pipeline_has_stage(pipeline, stage)) {
3373 const struct anv_shader_bin *shader = pipeline->shaders[stage];
3374 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3375
3376 /* We have to gather buffer addresses as a second step because the
3377 * loop above puts data into the push constant area and the call to
3378 * get_push_range_address is what locks our push constants and copies
3379 * them into the actual GPU buffer. If we did the two loops at the
3380 * same time, we'd risk only having some of the sizes in the push
3381 * constant buffer when we did the copy.
3382 */
3383 for (unsigned i = 0; i < 4; i++) {
3384 const struct anv_push_range *range = &bind_map->push_ranges[i];
3385 if (range->length == 0)
3386 break;
3387
3388 buffers[i] = get_push_range_address(cmd_buffer, shader, range);
3389 max_push_range = MAX2(max_push_range, range->length);
3390 buffer_count++;
3391 }
3392
3393 /* We have at most 4 buffers but they should be tightly packed */
3394 for (unsigned i = buffer_count; i < 4; i++)
3395 assert(bind_map->push_ranges[i].length == 0);
3396 }
3397
3398 #if GFX_VER >= 12
3399 /* If this stage doesn't have any push constants, emit it later in a
3400 * single CONSTANT_ALL packet.
3401 */
3402 if (buffer_count == 0) {
3403 nobuffer_stages |= 1 << stage;
3404 continue;
3405 }
3406
3407 /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
3408 * contains only 5 bits, so we can only use it for buffers smaller than
3409 * 32 registers.
3410 */
3411 if (max_push_range < 32) {
3412 cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
3413 buffers, buffer_count);
3414 continue;
3415 }
3416 #endif
3417
3418 cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
3419 }
3420
3421 #if GFX_VER >= 12
3422 if (nobuffer_stages)
3423 cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
3424 #endif
3425
3426 cmd_buffer->state.push_constants_dirty &= ~flushed;
3427 }
3428
3429 #if GFX_VERx10 >= 125
3430 static void
3431 cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
3432 VkShaderStageFlags dirty_stages)
3433 {
3434 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
3435 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
3436
3437 if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_NV &&
3438 anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
3439
3440 const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_TASK];
3441 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3442
3443 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
3444 const struct anv_push_range *range = &bind_map->push_ranges[0];
3445 if (range->length > 0) {
3446 struct anv_address buffer =
3447 get_push_range_address(cmd_buffer, shader, range);
3448
3449 uint64_t addr = anv_address_physical(buffer);
3450 data.InlineData[0] = addr & 0xffffffff;
3451 data.InlineData[1] = addr >> 32;
3452
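            /* InlineData[0..1] above hold the 64-bit push range address; the
             * raw client push constants are copied in right after it.
             */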
3453 memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
3454 cmd_buffer->state.gfx.base.push_constants.client_data,
3455 BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
3456 }
3457 }
3458 }
3459
3460 if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_NV &&
3461 anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
3462
3463 const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_MESH];
3464 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
3465
3466 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
3467 const struct anv_push_range *range = &bind_map->push_ranges[0];
3468 if (range->length > 0) {
3469 struct anv_address buffer =
3470 get_push_range_address(cmd_buffer, shader, range);
3471
3472 uint64_t addr = anv_address_physical(buffer);
3473 data.InlineData[0] = addr & 0xffffffff;
3474 data.InlineData[1] = addr >> 32;
3475
3476 memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
3477 cmd_buffer->state.gfx.base.push_constants.client_data,
3478 BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
3479 }
3480 }
3481 }
3482
3483 cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
3484 }
3485 #endif
3486
3487 static void
3488 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
3489 {
3490 const struct vk_dynamic_graphics_state *dyn =
3491 &cmd_buffer->vk.dynamic_graphics_state;
3492
3493 if (!(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) &&
3494 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) &&
3495 #if GFX_VER <= 7
3496 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) &&
3497 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) &&
3498 #endif
3499 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
3500 return;
3501
3502 /* Take dynamic primitive topology into account with
3503 * 3DSTATE_CLIP::ViewportXYClipTestEnable
3504 */
3505 VkPolygonMode dynamic_raster_mode =
3506 genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
3507 dyn->ia.primitive_topology);
3508 bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
3509
3510 struct GENX(3DSTATE_CLIP) clip = {
3511 GENX(3DSTATE_CLIP_header),
3512 #if GFX_VER <= 7
3513 .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face],
3514 .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode],
3515 #endif
3516 .ViewportXYClipTestEnable = xy_clip_test_enable,
3517 };
3518 uint32_t dwords[GENX(3DSTATE_CLIP_length)];
3519
3520 /* TODO(mesh): Multiview. */
3521 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3522 if (anv_pipeline_is_primitive(pipeline)) {
3523 const struct brw_vue_prog_data *last =
3524 anv_pipeline_get_last_vue_prog_data(pipeline);
3525 if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3526 clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ?
3527 dyn->vp.viewport_count - 1 : 0;
3528 }
3529 } else if (anv_pipeline_is_mesh(pipeline)) {
3530 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
3531 if (mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
3532 clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ?
3533 dyn->vp.viewport_count - 1 : 0;
3534 }
3535 }
3536
3537 GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
3538 anv_batch_emit_merge(&cmd_buffer->batch, dwords,
3539 pipeline->gfx7.clip);
3540 }
3541
3542 static void
3543 cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
3544 {
3545 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3546 const struct vk_dynamic_graphics_state *dyn =
3547 &cmd_buffer->vk.dynamic_graphics_state;
3548 uint32_t count = dyn->vp.viewport_count;
3549 const VkViewport *viewports = dyn->vp.viewports;
3550 struct anv_state sf_clip_state =
3551 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
3552
3553 bool negative_one_to_one =
3554 cmd_buffer->state.gfx.pipeline->negative_one_to_one;
3555
3556 float scale = negative_one_to_one ? 0.5f : 1.0f;
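   /* With VK_EXT_depth_clip_control's negativeOneToOne mode, NDC z spans
    * [-1, 1] instead of [0, 1], so the depth transform below uses half the
    * scale and centers the range on (minDepth + maxDepth) / 2.
    */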
3557
3558 for (uint32_t i = 0; i < count; i++) {
3559 const VkViewport *vp = &viewports[i];
3560
3561 /* The gfx7 state struct has just the matrix and guardband fields, the
3562 * gfx8 struct adds the min/max viewport fields. */
3563 struct GENX(SF_CLIP_VIEWPORT) sfv = {
3564 .ViewportMatrixElementm00 = vp->width / 2,
3565 .ViewportMatrixElementm11 = vp->height / 2,
3566 .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
3567 .ViewportMatrixElementm30 = vp->x + vp->width / 2,
3568 .ViewportMatrixElementm31 = vp->y + vp->height / 2,
3569 .ViewportMatrixElementm32 = negative_one_to_one ?
3570 (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
3571 .XMinClipGuardband = -1.0f,
3572 .XMaxClipGuardband = 1.0f,
3573 .YMinClipGuardband = -1.0f,
3574 .YMaxClipGuardband = 1.0f,
3575 #if GFX_VER >= 8
3576 .XMinViewPort = vp->x,
3577 .XMaxViewPort = vp->x + vp->width - 1,
3578 .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
3579 .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
3580 #endif
3581 };
3582
3583 const uint32_t fb_size_max = 1 << 14;
3584 uint32_t x_min = 0, x_max = fb_size_max;
3585 uint32_t y_min = 0, y_max = fb_size_max;
3586
3587 /* If we have a valid renderArea, include that */
3588 if (gfx->render_area.extent.width > 0 &&
3589 gfx->render_area.extent.height > 0) {
3590 x_min = MAX2(x_min, gfx->render_area.offset.x);
3591 x_max = MIN2(x_max, gfx->render_area.offset.x +
3592 gfx->render_area.extent.width);
3593 y_min = MAX2(y_min, gfx->render_area.offset.y);
3594 y_max = MIN2(y_max, gfx->render_area.offset.y +
3595 gfx->render_area.extent.height);
3596 }
3597
3598 /* The client is required to have enough scissors for whatever it sets
3599 * as ViewportIndex but it's possible that they've got more viewports
3600 * set from a previous command. Also, from the Vulkan 1.3.207 spec:
3601 *
3602 * "The application must ensure (using scissor if necessary) that
3603 * all rendering is contained within the render area."
3604 *
3605 * If the client doesn't set a scissor, that basically means it
3606 * guarantees everything is in-bounds already. If we end up using a
3607 * guardband of [-1, 1] in that case, there shouldn't be much loss.
3608 * It's theoretically possible that they could do all their clipping
3609 * with clip planes but that'd be a bit odd.
3610 */
3611 if (i < dyn->vp.scissor_count) {
3612 const VkRect2D *scissor = &dyn->vp.scissors[i];
3613 x_min = MAX2(x_min, scissor->offset.x);
3614 x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
3615 y_min = MAX2(y_min, scissor->offset.y);
3616 y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
3617 }
3618
3619 /* Only bother calculating the guardband if our known render area is
3620 * less than the maximum size. Otherwise, it will calculate [-1, 1]
3621 * anyway but possibly with precision loss.
3622 */
3623 if (x_min > 0 || x_max < fb_size_max ||
3624 y_min > 0 || y_max < fb_size_max) {
3625 intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
3626 sfv.ViewportMatrixElementm00,
3627 sfv.ViewportMatrixElementm11,
3628 sfv.ViewportMatrixElementm30,
3629 sfv.ViewportMatrixElementm31,
3630 &sfv.XMinClipGuardband,
3631 &sfv.XMaxClipGuardband,
3632 &sfv.YMinClipGuardband,
3633 &sfv.YMaxClipGuardband);
3634 }
3635
3636 GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
3637 }
3638
3639 anv_batch_emit(&cmd_buffer->batch,
3640 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
3641 clip.SFClipViewportPointer = sf_clip_state.offset;
3642 }
3643 }
3644
3645 static void
3646 cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
3647 bool depth_clamp_enable)
3648 {
3649 const struct vk_dynamic_graphics_state *dyn =
3650 &cmd_buffer->vk.dynamic_graphics_state;
3651 uint32_t count = dyn->vp.viewport_count;
3652 const VkViewport *viewports = dyn->vp.viewports;
3653 struct anv_state cc_state =
3654 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
3655
3656 for (uint32_t i = 0; i < count; i++) {
3657 const VkViewport *vp = &viewports[i];
3658
3659 /* From the Vulkan spec:
3660 *
3661 * "It is valid for minDepth to be greater than or equal to
3662 * maxDepth."
3663 */
3664 float min_depth = MIN2(vp->minDepth, vp->maxDepth);
3665 float max_depth = MAX2(vp->minDepth, vp->maxDepth);
3666
3667 struct GENX(CC_VIEWPORT) cc_viewport = {
3668 .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
3669 .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
3670 };
3671
3672 GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
3673 }
3674
3675 anv_batch_emit(&cmd_buffer->batch,
3676 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
3677 cc.CCViewportPointer = cc_state.offset;
3678 }
3679 }
3680
3681 static int64_t
3682 clamp_int64(int64_t x, int64_t min, int64_t max)
3683 {
3684 if (x < min)
3685 return min;
3686 else if (x < max)
3687 return x;
3688 else
3689 return max;
3690 }
3691
3692 static void
3693 cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
3694 {
3695 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3696 const struct vk_dynamic_graphics_state *dyn =
3697 &cmd_buffer->vk.dynamic_graphics_state;
3698 uint32_t count = dyn->vp.scissor_count;
3699 const VkRect2D *scissors = dyn->vp.scissors;
3700 const VkViewport *viewports = dyn->vp.viewports;
3701
3702 /* Wa_1409725701:
3703 * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
3704 * stored as an array of up to 16 elements. The location of first
3705 * element of the array, as specified by Pointer to SCISSOR_RECT, should
3706 * be aligned to a 64-byte boundary.
3707 */
3708 uint32_t alignment = 64;
3709 struct anv_state scissor_state =
3710 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
3711
3712 for (uint32_t i = 0; i < count; i++) {
3713 const VkRect2D *s = &scissors[i];
3714 const VkViewport *vp = &viewports[i];
3715
3716 /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
3717 * ymax < ymin for empty clips. In case clip x, y, width height are all
3718 * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
3719 * what we want. Just special case empty clips and produce a canonical
3720 * empty clip. */
3721 static const struct GENX(SCISSOR_RECT) empty_scissor = {
3722 .ScissorRectangleYMin = 1,
3723 .ScissorRectangleXMin = 1,
3724 .ScissorRectangleYMax = 0,
3725 .ScissorRectangleXMax = 0
3726 };
3727
3728 const int max = 0xffff;
3729
3730 uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
3731 uint32_t x_min = MAX2(s->offset.x, vp->x);
3732 uint32_t y_max = MIN2(s->offset.y + s->extent.height - 1,
3733 MAX2(vp->y, vp->y + vp->height) - 1);
3734 uint32_t x_max = MIN2(s->offset.x + s->extent.width - 1,
3735 vp->x + vp->width - 1);
3736
3737 /* Do this math using int64_t so overflow gets clamped correctly. */
3738 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3739 y_min = clamp_int64((uint64_t) y_min, gfx->render_area.offset.y, max);
3740 x_min = clamp_int64((uint64_t) x_min, gfx->render_area.offset.x, max);
3741 y_max = clamp_int64((uint64_t) y_max, 0,
3742 gfx->render_area.offset.y +
3743 gfx->render_area.extent.height - 1);
3744 x_max = clamp_int64((uint64_t) x_max, 0,
3745 gfx->render_area.offset.x +
3746 gfx->render_area.extent.width - 1);
3747 }
3748
3749 struct GENX(SCISSOR_RECT) scissor = {
3750 .ScissorRectangleYMin = y_min,
3751 .ScissorRectangleXMin = x_min,
3752 .ScissorRectangleYMax = y_max,
3753 .ScissorRectangleXMax = x_max
3754 };
3755
3756 if (s->extent.width <= 0 || s->extent.height <= 0) {
3757 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8,
3758 &empty_scissor);
3759 } else {
3760 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
3761 }
3762 }
3763
3764 anv_batch_emit(&cmd_buffer->batch,
3765 GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
3766 ssp.ScissorRectPointer = scissor_state.offset;
3767 }
3768 }
3769
3770 static void
3771 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3772 {
3773 const struct vk_dynamic_graphics_state *dyn =
3774 &cmd_buffer->vk.dynamic_graphics_state;
3775 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3776
3777 #if GFX_VER == 7
3778 # define streamout_state_dw pipeline->gfx7.streamout_state
3779 #else
3780 # define streamout_state_dw pipeline->gfx8.streamout_state
3781 #endif
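   /* Only the dynamic RenderingDisable bit is packed here; it gets OR-merged
    * with the pipeline's pre-packed 3DSTATE_STREAMOUT dwords below.
    */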
3782
3783 uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3784
3785 struct GENX(3DSTATE_STREAMOUT) so = {
3786 GENX(3DSTATE_STREAMOUT_header),
3787 .RenderingDisable = dyn->rs.rasterizer_discard_enable,
3788 };
3789 GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3790 anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3791 }
3792
3793 void
3794 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
3795 {
3796 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3797 const struct vk_dynamic_graphics_state *dyn =
3798 &cmd_buffer->vk.dynamic_graphics_state;
3799 uint32_t *p;
3800
3801 assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3802
3803 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3804
3805 genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
3806
3807 genX(flush_pipeline_select_3d)(cmd_buffer);
3808
3809 /* Apply any pending pipeline flushes we may have. We want to apply them
3810 * now because, if any of those flushes are for things like push constants,
3811 * the GPU will read the state at weird times.
3812 */
3813 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3814
3815 uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3816 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3817 vb_emit |= pipeline->vb_used;
3818
3819 if (vb_emit) {
3820 const uint32_t num_buffers = __builtin_popcount(vb_emit);
3821 const uint32_t num_dwords = 1 + num_buffers * 4;
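      /* One dword of 3DSTATE_VERTEX_BUFFERS header plus a 4-dword
       * VERTEX_BUFFER_STATE entry per bound buffer, packed at p[1 + i * 4]
       * below.
       */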
3822
3823 p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3824 GENX(3DSTATE_VERTEX_BUFFERS));
3825 uint32_t i = 0;
3826 u_foreach_bit(vb, vb_emit) {
3827 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3828 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3829
3830 struct GENX(VERTEX_BUFFER_STATE) state;
3831 if (buffer) {
3832 uint32_t stride = dyn->vi_binding_strides[vb];
3833 UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
3834
3835 #if GFX_VER <= 7
3836 bool per_instance = pipeline->vb[vb].instanced;
3837 uint32_t divisor = pipeline->vb[vb].instance_divisor *
3838 pipeline->instance_multiplier;
3839 #endif
3840
3841 state = (struct GENX(VERTEX_BUFFER_STATE)) {
3842 .VertexBufferIndex = vb,
3843
3844 .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3845 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3846 #if GFX_VER <= 7
3847 .BufferAccessType = per_instance ? INSTANCEDATA : VERTEXDATA,
3848 .InstanceDataStepRate = per_instance ? divisor : 1,
3849 #endif
3850 .AddressModifyEnable = true,
3851 .BufferPitch = stride,
3852 .BufferStartingAddress = anv_address_add(buffer->address, offset),
3853 .NullVertexBuffer = offset >= buffer->vk.size,
3854 #if GFX_VER >= 12
3855 .L3BypassDisable = true,
3856 #endif
3857
3858 #if GFX_VER >= 8
3859 .BufferSize = size,
3860 #else
3861 /* XXX: to handle dynamic offset for older gens we might want
3862 * to modify Endaddress, but there are issues when doing so:
3863 *
3864 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3865 */
3866 .EndAddress = anv_address_add(buffer->address, buffer->vk.size - 1),
3867 #endif
3868 };
3869 } else {
3870 state = (struct GENX(VERTEX_BUFFER_STATE)) {
3871 .VertexBufferIndex = vb,
3872 .NullVertexBuffer = true,
3873 .MOCS = anv_mocs(cmd_buffer->device, NULL,
3874 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3875 };
3876 }
3877
3878 #if GFX_VER >= 8 && GFX_VER <= 9
3879 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3880 state.BufferStartingAddress,
3881 state.BufferSize);
3882 #endif
3883
3884 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3885 i++;
3886 }
3887 }
3888
3889 cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3890
3891 uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3892 pipeline->active_stages;
3893 if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3894 !vk_dynamic_graphics_state_any_dirty(dyn) &&
3895 !cmd_buffer->state.push_constants_dirty)
3896 return;
3897
3898 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3899 (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3900 ANV_CMD_DIRTY_PIPELINE))) {
3901 /* Wa_16011411144:
3902 *
3903 * SW must insert a PIPE_CONTROL cmd before and after the
3904 * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
3905 * state is not combined with other state changes.
3906 */
3907 if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
3908 anv_add_pending_pipe_bits(cmd_buffer,
3909 ANV_PIPE_CS_STALL_BIT,
3910 "before SO_BUFFER change WA");
3911 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3912 }
3913
3914 /* We don't need any per-buffer dirty tracking because you're not
3915 * allowed to bind different XFB buffers while XFB is enabled.
3916 */
3917 for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3918 struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3919 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3920 #if GFX_VER < 12
3921 sob.SOBufferIndex = idx;
3922 #else
3923 sob._3DCommandOpcode = 0;
3924 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
3925 #endif
3926
3927 if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3928 sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0);
3929 sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3930 xfb->offset);
3931 #if GFX_VER >= 8
3932 sob.SOBufferEnable = true;
3933 sob.StreamOffsetWriteEnable = false;
3934 /* Size is in DWords - 1 */
3935 sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
3936 #else
3937 /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
3938 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3939 * default for an empty SO_BUFFER packet) to disable them.
3940 */
3941 sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3942 sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3943 xfb->offset + xfb->size);
3944 #endif
3945 } else {
3946 sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
3947 }
3948 }
3949 }
3950
3951 if (intel_device_info_is_dg2(&cmd_buffer->device->info)) {
3952 /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
3953 anv_add_pending_pipe_bits(cmd_buffer,
3954 ANV_PIPE_CS_STALL_BIT,
3955 "after SO_BUFFER change WA");
3956 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3957 } else if (GFX_VER >= 10) {
3958 /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
3959 anv_add_pending_pipe_bits(cmd_buffer,
3960 ANV_PIPE_CS_STALL_BIT,
3961 "after 3DSTATE_SO_BUFFER call");
3962 }
3963 }
3964
3965 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3966 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3967
3968 /* If the pipeline changed, we may need to re-allocate push constant
3969 * space in the URB.
3970 */
3971 cmd_buffer_alloc_push_constants(cmd_buffer);
3972 }
3973
3974 #if GFX_VER <= 7
3975 if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3976 cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3977 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3978 *
3979 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3980 * stall needs to be sent just prior to any 3DSTATE_VS,
3981 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3982 * 3DSTATE_BINDING_TABLE_POINTER_VS,
3983 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one
3984 * PIPE_CONTROL needs to be sent before any combination of VS
3985 * associated 3DSTATE."
3986 */
3987 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3988 pc.DepthStallEnable = true;
3989 pc.PostSyncOperation = WriteImmediateData;
3990 pc.Address = cmd_buffer->device->workaround_address;
3991 anv_debug_dump_pc(pc);
3992 }
3993 }
3994 #endif
3995
3996 /* Render targets live in the same binding table as fragment descriptors */
3997 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3998 descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
3999
4000 /* We emit the binding tables and sampler tables first, then emit push
4001 * constants and then finally emit binding table and sampler table
4002 * pointers. It has to happen in this order, since emitting the binding
4003 * tables may change the push constants (in case of storage images). After
4004 * emitting push constants, on SKL+ we have to emit the corresponding
4005 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
4006 */
4007 uint32_t dirty = 0;
4008 if (descriptors_dirty) {
4009 dirty = flush_descriptor_sets(cmd_buffer,
4010 &cmd_buffer->state.gfx.base,
4011 descriptors_dirty,
4012 pipeline->shaders,
4013 ARRAY_SIZE(pipeline->shaders));
4014 cmd_buffer->state.descriptors_dirty &= ~dirty;
4015 }
4016
4017 if (dirty || cmd_buffer->state.push_constants_dirty) {
4018 /* Because we're pushing UBOs, we have to push whenever either
4019 * descriptors or push constants is dirty.
4020 */
4021 dirty |= cmd_buffer->state.push_constants_dirty;
4022 cmd_buffer_flush_push_constants(cmd_buffer,
4023 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
4024 #if GFX_VERx10 >= 125
4025 cmd_buffer_flush_mesh_inline_data(
4026 cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_NV |
4027 VK_SHADER_STAGE_MESH_BIT_NV));
4028 #endif
4029 }
4030
4031 if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
4032 cmd_buffer_emit_descriptor_pointers(cmd_buffer,
4033 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
4034 }
4035
4036 cmd_buffer_emit_clip(cmd_buffer);
4037
4038 if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
4039 ANV_CMD_DIRTY_XFB_ENABLE)) ||
4040 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
4041 cmd_buffer_emit_streamout(cmd_buffer);
4042
4043 if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
4044 ANV_CMD_DIRTY_RENDER_TARGETS)) ||
4045 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
4046 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
4047 cmd_buffer_emit_viewport(cmd_buffer);
4048 cmd_buffer_emit_depth_viewport(cmd_buffer,
4049 pipeline->depth_clamp_enable);
4050 cmd_buffer_emit_scissor(cmd_buffer);
4051 }
4052
4053 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
4054 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
4055 uint32_t topology;
4056 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
4057 topology = _3DPRIM_PATCHLIST(pipeline->patch_control_points);
4058 else
4059 topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
4060
4061 cmd_buffer->state.gfx.primitive_topology = topology;
4062
4063 #if (GFX_VER >= 8)
4064 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
4065 vft.PrimitiveTopologyType = topology;
4066 }
4067 #endif
4068 }
4069
4070 genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
4071 }
4072
4073 static void
4074 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
4075 struct anv_address addr,
4076 uint32_t size, uint32_t index)
4077 {
4078 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
4079 GENX(3DSTATE_VERTEX_BUFFERS));
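   /* 5 dwords: the 3DSTATE_VERTEX_BUFFERS header plus a single 4-dword
    * VERTEX_BUFFER_STATE entry packed at p + 1.
    */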
4080
4081 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
4082 &(struct GENX(VERTEX_BUFFER_STATE)) {
4083 .VertexBufferIndex = index,
4084 .AddressModifyEnable = true,
4085 .BufferPitch = 0,
4086 .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
4087 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
4088 .NullVertexBuffer = size == 0,
4089 #if GFX_VER >= 12
4090 .L3BypassDisable = true,
4091 #endif
4092 #if (GFX_VER >= 8)
4093 .BufferStartingAddress = addr,
4094 .BufferSize = size
4095 #else
4096 .BufferStartingAddress = addr,
4097 .EndAddress = anv_address_add(addr, size),
4098 #endif
4099 });
4100
4101 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
4102 index, addr, size);
4103 }
4104
4105 static void
4106 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
4107 struct anv_address addr)
4108 {
4109 emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
4110 }
4111
4112 static void
4113 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
4114 uint32_t base_vertex, uint32_t base_instance)
4115 {
4116 if (base_vertex == 0 && base_instance == 0) {
4117 emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
4118 } else {
4119 struct anv_state id_state =
4120 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
4121
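      /* These two dwords are fetched through the ANV_SVGS_VB_INDEX vertex
       * buffer and provide the firstvertex/baseinstance values the vertex
       * shader expects.
       */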
4122 ((uint32_t *)id_state.map)[0] = base_vertex;
4123 ((uint32_t *)id_state.map)[1] = base_instance;
4124
4125 struct anv_address addr = {
4126 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4127 .offset = id_state.offset,
4128 };
4129
4130 emit_base_vertex_instance_bo(cmd_buffer, addr);
4131 }
4132 }
4133
4134 static void
4135 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
4136 {
4137 struct anv_state state =
4138 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
4139
4140 ((uint32_t *)state.map)[0] = draw_index;
4141
4142 struct anv_address addr = {
4143 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4144 .offset = state.offset,
4145 };
4146
4147 emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
4148 }
4149
4150 static void
4151 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
4152 uint32_t access_type)
4153 {
4154 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4155 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4156
4157 uint64_t vb_used = pipeline->vb_used;
4158 if (vs_prog_data->uses_firstvertex ||
4159 vs_prog_data->uses_baseinstance)
4160 vb_used |= 1ull << ANV_SVGS_VB_INDEX;
4161 if (vs_prog_data->uses_drawid)
4162 vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
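   /* Include the internal SVGS and draw-id slots so the gfx8/9 VF cache
    * workaround also tracks the side-band vertex buffers emitted for base
    * vertex/instance and draw index.
    */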
4163
4164 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
4165 access_type == RANDOM,
4166 vb_used);
4167 }
4168
4169 ALWAYS_INLINE static void
4170 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
4171 const struct brw_vs_prog_data *vs_prog_data,
4172 uint32_t base_vertex,
4173 uint32_t base_instance,
4174 uint32_t draw_id,
4175 bool force_flush)
4176 {
4177 bool emitted = false;
4178 if (vs_prog_data->uses_firstvertex ||
4179 vs_prog_data->uses_baseinstance) {
4180 emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
4181 emitted = true;
4182 }
4183 if (vs_prog_data->uses_drawid) {
4184 emit_draw_index(cmd_buffer, draw_id);
4185 emitted = true;
4186 }
4187 /* Emitting draw index or vertex index BOs may result in needing
4188 * additional VF cache flushes.
4189 */
4190 if (emitted || force_flush)
4191 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4192 }
4193
4194 void genX(CmdDraw)(
4195 VkCommandBuffer commandBuffer,
4196 uint32_t vertexCount,
4197 uint32_t instanceCount,
4198 uint32_t firstVertex,
4199 uint32_t firstInstance)
4200 {
4201 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4202 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4203 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4204
4205 if (anv_batch_has_error(&cmd_buffer->batch))
4206 return;
4207
4208 const uint32_t count =
4209 vertexCount * instanceCount * pipeline->instance_multiplier;
4210 anv_measure_snapshot(cmd_buffer,
4211 INTEL_SNAPSHOT_DRAW,
4212 "draw", count);
4213 trace_intel_begin_draw(&cmd_buffer->trace);
4214
4215 genX(cmd_buffer_flush_state)(cmd_buffer);
4216
4217 if (cmd_buffer->state.conditional_render_enabled)
4218 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4219
4220 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4221 firstVertex, firstInstance, 0,
4222 true);
4223
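   /* InstanceCount below is pre-multiplied by instance_multiplier, which is
    * how multiview is handled when it is lowered to instanced drawing.
    */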
4224 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4225 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4226 prim.VertexAccessType = SEQUENTIAL;
4227 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4228 prim.VertexCountPerInstance = vertexCount;
4229 prim.StartVertexLocation = firstVertex;
4230 prim.InstanceCount = instanceCount *
4231 pipeline->instance_multiplier;
4232 prim.StartInstanceLocation = firstInstance;
4233 prim.BaseVertexLocation = 0;
4234 }
4235
4236 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4237
4238 trace_intel_end_draw(&cmd_buffer->trace, count);
4239 }
4240
4241 void genX(CmdDrawMultiEXT)(
4242 VkCommandBuffer commandBuffer,
4243 uint32_t drawCount,
4244 const VkMultiDrawInfoEXT *pVertexInfo,
4245 uint32_t instanceCount,
4246 uint32_t firstInstance,
4247 uint32_t stride)
4248 {
4249 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4250 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4251 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4252
4253 if (anv_batch_has_error(&cmd_buffer->batch))
4254 return;
4255
4256 const uint32_t count =
4257 drawCount * instanceCount * pipeline->instance_multiplier;
4258 anv_measure_snapshot(cmd_buffer,
4259 INTEL_SNAPSHOT_DRAW,
4260 "draw_multi", count);
4261 trace_intel_begin_draw_multi(&cmd_buffer->trace);
4262
4263 genX(cmd_buffer_flush_state)(cmd_buffer);
4264
4265 if (cmd_buffer->state.conditional_render_enabled)
4266 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4267
4268 uint32_t i = 0;
4269 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
4270 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4271 draw->firstVertex,
4272 firstInstance, i, !i);
4273
4274 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4275 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4276 prim.VertexAccessType = SEQUENTIAL;
4277 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4278 prim.VertexCountPerInstance = draw->vertexCount;
4279 prim.StartVertexLocation = draw->firstVertex;
4280 prim.InstanceCount = instanceCount *
4281 pipeline->instance_multiplier;
4282 prim.StartInstanceLocation = firstInstance;
4283 prim.BaseVertexLocation = 0;
4284 }
4285 }
4286
4287 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4288
4289 trace_intel_end_draw_multi(&cmd_buffer->trace, count);
4290 }
4291
4292 void genX(CmdDrawIndexed)(
4293 VkCommandBuffer commandBuffer,
4294 uint32_t indexCount,
4295 uint32_t instanceCount,
4296 uint32_t firstIndex,
4297 int32_t vertexOffset,
4298 uint32_t firstInstance)
4299 {
4300 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4301 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4302 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4303
4304 if (anv_batch_has_error(&cmd_buffer->batch))
4305 return;
4306
4307 const uint32_t count =
4308 indexCount * instanceCount * pipeline->instance_multiplier;
4309 anv_measure_snapshot(cmd_buffer,
4310 INTEL_SNAPSHOT_DRAW,
4311 "draw indexed",
4312 count);
4313 trace_intel_begin_draw_indexed(&cmd_buffer->trace);
4314
4315 genX(cmd_buffer_flush_state)(cmd_buffer);
4316
4317 if (cmd_buffer->state.conditional_render_enabled)
4318 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4319
4320 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);
4321
4322 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4323 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4324 prim.VertexAccessType = RANDOM;
4325 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4326 prim.VertexCountPerInstance = indexCount;
4327 prim.StartVertexLocation = firstIndex;
4328 prim.InstanceCount = instanceCount *
4329 pipeline->instance_multiplier;
4330 prim.StartInstanceLocation = firstInstance;
4331 prim.BaseVertexLocation = vertexOffset;
4332 }
4333
4334 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4335
4336 trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
4337 }
4338
4339 void genX(CmdDrawMultiIndexedEXT)(
4340 VkCommandBuffer commandBuffer,
4341 uint32_t drawCount,
4342 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
4343 uint32_t instanceCount,
4344 uint32_t firstInstance,
4345 uint32_t stride,
4346 const int32_t *pVertexOffset)
4347 {
4348 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4349 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4350 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4351
4352 if (anv_batch_has_error(&cmd_buffer->batch))
4353 return;
4354
4355 const uint32_t count =
4356 drawCount * instanceCount * pipeline->instance_multiplier;
4357 anv_measure_snapshot(cmd_buffer,
4358 INTEL_SNAPSHOT_DRAW,
4359 "draw indexed_multi",
4360 count);
4361 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
4362
4363 genX(cmd_buffer_flush_state)(cmd_buffer);
4364
4365 if (cmd_buffer->state.conditional_render_enabled)
4366 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4367
4368 uint32_t i = 0;
4369 if (pVertexOffset) {
4370 if (vs_prog_data->uses_drawid) {
4371 bool emitted = true;
4372 if (vs_prog_data->uses_firstvertex ||
4373 vs_prog_data->uses_baseinstance) {
4374 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4375 emitted = true;
4376 }
4377 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4378 if (vs_prog_data->uses_drawid) {
4379 emit_draw_index(cmd_buffer, i);
4380 emitted = true;
4381 }
4382 /* Emitting draw index or vertex index BOs may result in needing
4383 * additional VF cache flushes.
4384 */
4385 if (emitted)
4386 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4387
4388 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4389 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4390 prim.VertexAccessType = RANDOM;
4391 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4392 prim.VertexCountPerInstance = draw->indexCount;
4393 prim.StartVertexLocation = draw->firstIndex;
4394 prim.InstanceCount = instanceCount *
4395 pipeline->instance_multiplier;
4396 prim.StartInstanceLocation = firstInstance;
4397 prim.BaseVertexLocation = *pVertexOffset;
4398 }
4399 emitted = false;
4400 }
4401 } else {
4402 if (vs_prog_data->uses_firstvertex ||
4403 vs_prog_data->uses_baseinstance) {
4404 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
4405 /* Emitting draw index or vertex index BOs may result in needing
4406 * additional VF cache flushes.
4407 */
4408 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4409 }
4410 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4411 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4412 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4413 prim.VertexAccessType = RANDOM;
4414 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4415 prim.VertexCountPerInstance = draw->indexCount;
4416 prim.StartVertexLocation = draw->firstIndex;
4417 prim.InstanceCount = instanceCount *
4418 pipeline->instance_multiplier;
4419 prim.StartInstanceLocation = firstInstance;
4420 prim.BaseVertexLocation = *pVertexOffset;
4421 }
4422 }
4423 }
4424 } else {
4425 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
4426 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
4427 draw->vertexOffset,
4428 firstInstance, i, i != 0);
4429
4430 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4431 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4432 prim.VertexAccessType = RANDOM;
4433 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4434 prim.VertexCountPerInstance = draw->indexCount;
4435 prim.StartVertexLocation = draw->firstIndex;
4436 prim.InstanceCount = instanceCount *
4437 pipeline->instance_multiplier;
4438 prim.StartInstanceLocation = firstInstance;
4439 prim.BaseVertexLocation = draw->vertexOffset;
4440 }
4441 }
4442 }
4443
4444 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4445
4446 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
4447 }
4448
4449 /* Auto-Draw / Indirect Registers */
4450 #define GFX7_3DPRIM_END_OFFSET 0x2420
4451 #define GFX7_3DPRIM_START_VERTEX 0x2430
4452 #define GFX7_3DPRIM_VERTEX_COUNT 0x2434
4453 #define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
4454 #define GFX7_3DPRIM_START_INSTANCE 0x243C
4455 #define GFX7_3DPRIM_BASE_VERTEX 0x2440
4456
4457 void genX(CmdDrawIndirectByteCountEXT)(
4458 VkCommandBuffer commandBuffer,
4459 uint32_t instanceCount,
4460 uint32_t firstInstance,
4461 VkBuffer counterBuffer,
4462 VkDeviceSize counterBufferOffset,
4463 uint32_t counterOffset,
4464 uint32_t vertexStride)
4465 {
4466 #if GFX_VERx10 >= 75
4467 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4468 ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
4469 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4470 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4471
4472 /* firstVertex is always zero for this draw function */
4473 const uint32_t firstVertex = 0;
4474
4475 if (anv_batch_has_error(&cmd_buffer->batch))
4476 return;
4477
4478 anv_measure_snapshot(cmd_buffer,
4479 INTEL_SNAPSHOT_DRAW,
4480 "draw indirect byte count",
4481 instanceCount * pipeline->instance_multiplier);
4482 trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
4483
4484 genX(cmd_buffer_flush_state)(cmd_buffer);
4485
4486 if (cmd_buffer->state.conditional_render_enabled)
4487 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4488
4489 if (vs_prog_data->uses_firstvertex ||
4490 vs_prog_data->uses_baseinstance)
4491 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
4492 if (vs_prog_data->uses_drawid)
4493 emit_draw_index(cmd_buffer, 0);
4494
4495 /* Emitting draw index or vertex index BOs may result in needing
4496 * additional VF cache flushes.
4497 */
4498 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4499
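   /* Compute the vertex count on the command streamer: read the transform
    * feedback counter out of the counter buffer, subtract counterOffset,
    * divide by vertexStride, and store the result in the 3DPRIM vertex
    * count register consumed by the indirect 3DPRIMITIVE below.
    */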
4500 struct mi_builder b;
4501 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4502 struct mi_value count =
4503 mi_mem32(anv_address_add(counter_buffer->address,
4504 counterBufferOffset));
4505 if (counterOffset)
4506 count = mi_isub(&b, count, mi_imm(counterOffset));
4507 count = mi_udiv32_imm(&b, count, vertexStride);
4508 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
4509
4510 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
4511 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
4512 mi_imm(instanceCount * pipeline->instance_multiplier));
4513 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
4514 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4515
4516 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4517 prim.IndirectParameterEnable = true;
4518 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4519 prim.VertexAccessType = SEQUENTIAL;
4520 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4521 }
4522
4523 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4524
4525 trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
4526 instanceCount * pipeline->instance_multiplier);
4527 #endif /* GFX_VERx10 >= 75 */
4528 }
4529
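/* Load the fields of a VkDrawIndirectCommand (or VkDrawIndexedIndirectCommand
 * when `indexed` is set) from `addr` into the 3DPRIM auto-draw registers so
 * the following 3DPRIMITIVE can source its parameters from them. The two
 * command layouts place vertexOffset and firstInstance at different offsets,
 * hence the branch at the end.
 */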
4530 static void
4531 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
4532 struct anv_address addr,
4533 bool indexed)
4534 {
4535 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4536
4537 struct mi_builder b;
4538 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4539
4540 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
4541 mi_mem32(anv_address_add(addr, 0)));
4542
4543 struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
4544 if (pipeline->instance_multiplier > 1) {
4545 #if GFX_VERx10 >= 75
4546 instance_count = mi_imul_imm(&b, instance_count,
4547 pipeline->instance_multiplier);
4548 #else
4549 anv_finishme("Multiview + indirect draw requires MI_MATH; "
4550 "MI_MATH is not supported on Ivy Bridge");
4551 #endif
4552 }
4553 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
4554
4555 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
4556 mi_mem32(anv_address_add(addr, 8)));
4557
4558 if (indexed) {
4559 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
4560 mi_mem32(anv_address_add(addr, 12)));
4561 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4562 mi_mem32(anv_address_add(addr, 16)));
4563 } else {
4564 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
4565 mi_mem32(anv_address_add(addr, 12)));
4566 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
4567 }
4568 }
4569
4570 void genX(CmdDrawIndirect)(
4571 VkCommandBuffer commandBuffer,
4572 VkBuffer _buffer,
4573 VkDeviceSize offset,
4574 uint32_t drawCount,
4575 uint32_t stride)
4576 {
4577 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4578 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4579 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4580 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4581
4582 if (anv_batch_has_error(&cmd_buffer->batch))
4583 return;
4584
4585 anv_measure_snapshot(cmd_buffer,
4586 INTEL_SNAPSHOT_DRAW,
4587 "draw indirect",
4588 drawCount);
4589 trace_intel_begin_draw_indirect(&cmd_buffer->trace);
4590
4591 genX(cmd_buffer_flush_state)(cmd_buffer);
4592
4593 if (cmd_buffer->state.conditional_render_enabled)
4594 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4595
4596 for (uint32_t i = 0; i < drawCount; i++) {
4597 struct anv_address draw = anv_address_add(buffer->address, offset);
4598
4599 if (vs_prog_data->uses_firstvertex ||
4600 vs_prog_data->uses_baseinstance)
4601 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4602 if (vs_prog_data->uses_drawid)
4603 emit_draw_index(cmd_buffer, i);
4604
4605 /* Emitting draw index or vertex index BOs may result in needing
4606 * additional VF cache flushes.
4607 */
4608 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4609
4610 load_indirect_parameters(cmd_buffer, draw, false);
4611
4612 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4613 prim.IndirectParameterEnable = true;
4614 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4615 prim.VertexAccessType = SEQUENTIAL;
4616 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4617 }
4618
4619 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4620
4621 offset += stride;
4622 }
4623
4624 trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
4625 }
4626
4627 void genX(CmdDrawIndexedIndirect)(
4628 VkCommandBuffer commandBuffer,
4629 VkBuffer _buffer,
4630 VkDeviceSize offset,
4631 uint32_t drawCount,
4632 uint32_t stride)
4633 {
4634 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4635 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4636 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
4637 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4638
4639 if (anv_batch_has_error(&cmd_buffer->batch))
4640 return;
4641
4642 anv_measure_snapshot(cmd_buffer,
4643 INTEL_SNAPSHOT_DRAW,
4644 "draw indexed indirect",
4645 drawCount);
4646 trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
4647
4648 genX(cmd_buffer_flush_state)(cmd_buffer);
4649
4650 if (cmd_buffer->state.conditional_render_enabled)
4651 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4652
4653 for (uint32_t i = 0; i < drawCount; i++) {
4654 struct anv_address draw = anv_address_add(buffer->address, offset);
4655
4656 /* TODO: We need to stomp base vertex to 0 somehow */
4657 if (vs_prog_data->uses_firstvertex ||
4658 vs_prog_data->uses_baseinstance)
4659 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4660 if (vs_prog_data->uses_drawid)
4661 emit_draw_index(cmd_buffer, i);
4662
4663 /* Emitting draw index or vertex index BOs may result in needing
4664 * additional VF cache flushes.
4665 */
4666 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4667
4668 load_indirect_parameters(cmd_buffer, draw, true);
4669
4670 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4671 prim.IndirectParameterEnable = true;
4672 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4673 prim.VertexAccessType = RANDOM;
4674 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4675 }
4676
4677 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4678
4679 offset += stride;
4680 }
4681
4682 trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
4683 }
4684
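/* Read the draw count out of the count buffer. Without conditional rendering
 * it is loaded into MI_PREDICATE_SRC0 (and mi_imm(0) is returned); with
 * conditional rendering it is copied into a fresh GPR so each draw can
 * combine it with the conditional render result, and that GPR is returned.
 */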
4685 static struct mi_value
4686 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4687 struct mi_builder *b,
4688 struct anv_buffer *count_buffer,
4689 uint64_t countBufferOffset)
4690 {
4691 struct anv_address count_address =
4692 anv_address_add(count_buffer->address, countBufferOffset);
4693
4694 struct mi_value ret = mi_imm(0);
4695
4696 if (cmd_buffer->state.conditional_render_enabled) {
4697 #if GFX_VERx10 >= 75
4698 ret = mi_new_gpr(b);
4699 mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4700 #endif
4701 } else {
4702 /* Upload the current draw count from the draw parameters buffer to
4703 * MI_PREDICATE_SRC0.
4704 */
4705 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
4706 mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
4707 }
4708
4709 return ret;
4710 }
4711
4712 static void
4713 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4714 struct mi_builder *b,
4715 uint32_t draw_index)
4716 {
4717 /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4718 mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4719
4720 if (draw_index == 0) {
4721 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4722 mip.LoadOperation = LOAD_LOADINV;
4723 mip.CombineOperation = COMBINE_SET;
4724 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4725 }
4726 } else {
4727 /* While draw_index < draw_count the predicate's result will be
4728 * (draw_index == draw_count) ^ TRUE = TRUE
4729 * When draw_index == draw_count the result is
4730 * (TRUE) ^ TRUE = FALSE
4731 * After this all results will be:
4732 * (FALSE) ^ FALSE = FALSE
4733 */
4734 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4735 mip.LoadOperation = LOAD_LOAD;
4736 mip.CombineOperation = COMBINE_XOR;
4737 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4738 }
4739 }
4740 }
4741
4742 #if GFX_VERx10 >= 75
4743 static void
4744 emit_draw_count_predicate_with_conditional_render(
4745 struct anv_cmd_buffer *cmd_buffer,
4746 struct mi_builder *b,
4747 uint32_t draw_index,
4748 struct mi_value max)
4749 {
4750 struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4751 pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
4752
4753 #if GFX_VER >= 8
4754 mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4755 #else
4756 /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
4757 * so we emit MI_PREDICATE to set it.
4758 */
4759
4760 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4761 mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4762
4763 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4764 mip.LoadOperation = LOAD_LOADINV;
4765 mip.CombineOperation = COMBINE_SET;
4766 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4767 }
4768 #endif
4769 }
4770 #endif
4771
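/* Emit the per-draw predicate for the draw-indirect-count paths: when
 * conditional rendering is available and enabled, the draw-count comparison
 * is ANDed with the conditional render result using MI_MATH; otherwise we
 * fall back to the plain MI_PREDICATE sequence above.
 */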
4772 static void
4773 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4774 struct mi_builder *b,
4775 uint32_t draw_index,
4776 struct mi_value max)
4777 {
4778 #if GFX_VERx10 >= 75
4779 if (cmd_buffer->state.conditional_render_enabled) {
4780 emit_draw_count_predicate_with_conditional_render(
4781 cmd_buffer, b, draw_index, mi_value_ref(b, max));
4782 } else {
4783 emit_draw_count_predicate(cmd_buffer, b, draw_index);
4784 }
4785 #else
4786 emit_draw_count_predicate(cmd_buffer, b, draw_index);
4787 #endif
4788 }
4789
4790 void genX(CmdDrawIndirectCount)(
4791 VkCommandBuffer commandBuffer,
4792 VkBuffer _buffer,
4793 VkDeviceSize offset,
4794 VkBuffer _countBuffer,
4795 VkDeviceSize countBufferOffset,
4796 uint32_t maxDrawCount,
4797 uint32_t stride)
4798 {
4799 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4800 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4801 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4802 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4803 struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4804 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4805
4806 if (anv_batch_has_error(&cmd_buffer->batch))
4807 return;
4808
4809 anv_measure_snapshot(cmd_buffer,
4810 INTEL_SNAPSHOT_DRAW,
4811 "draw indirect count",
4812 0);
4813 trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
4814
4815 genX(cmd_buffer_flush_state)(cmd_buffer);
4816
4817 struct mi_builder b;
4818 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4819 struct mi_value max =
4820 prepare_for_draw_count_predicate(cmd_buffer, &b,
4821 count_buffer, countBufferOffset);
4822
4823 for (uint32_t i = 0; i < maxDrawCount; i++) {
4824 struct anv_address draw = anv_address_add(buffer->address, offset);
4825
4826 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4827
4828 if (vs_prog_data->uses_firstvertex ||
4829 vs_prog_data->uses_baseinstance)
4830 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4831 if (vs_prog_data->uses_drawid)
4832 emit_draw_index(cmd_buffer, i);
4833
4834 /* Emitting draw index or vertex index BOs may result in needing
4835 * additional VF cache flushes.
4836 */
4837 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4838
4839 load_indirect_parameters(cmd_buffer, draw, false);
4840
4841 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4842 prim.IndirectParameterEnable = true;
4843 prim.PredicateEnable = true;
4844 prim.VertexAccessType = SEQUENTIAL;
4845 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4846 }
4847
4848 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4849
4850 offset += stride;
4851 }
4852
4853 mi_value_unref(&b, max);
4854
4855 trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
4856 }
4857
4858 void genX(CmdDrawIndexedIndirectCount)(
4859 VkCommandBuffer commandBuffer,
4860 VkBuffer _buffer,
4861 VkDeviceSize offset,
4862 VkBuffer _countBuffer,
4863 VkDeviceSize countBufferOffset,
4864 uint32_t maxDrawCount,
4865 uint32_t stride)
4866 {
4867 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4868 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4869 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4870 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4871 struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4872 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4873
4874 if (anv_batch_has_error(&cmd_buffer->batch))
4875 return;
4876
4877 anv_measure_snapshot(cmd_buffer,
4878 INTEL_SNAPSHOT_DRAW,
4879 "draw indexed indirect count",
4880 0);
4881 trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
4882
4883 genX(cmd_buffer_flush_state)(cmd_buffer);
4884
4885 struct mi_builder b;
4886 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
4887 struct mi_value max =
4888 prepare_for_draw_count_predicate(cmd_buffer, &b,
4889 count_buffer, countBufferOffset);
4890
4891 for (uint32_t i = 0; i < maxDrawCount; i++) {
4892 struct anv_address draw = anv_address_add(buffer->address, offset);
4893
4894 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4895
4896 /* TODO: We need to stomp base vertex to 0 somehow */
4897 if (vs_prog_data->uses_firstvertex ||
4898 vs_prog_data->uses_baseinstance)
4899 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4900 if (vs_prog_data->uses_drawid)
4901 emit_draw_index(cmd_buffer, i);
4902
4903 /* Emitting draw index or vertex index BOs may result in needing
4904 * additional VF cache flushes.
4905 */
4906 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4907
4908 load_indirect_parameters(cmd_buffer, draw, true);
4909
4910 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4911 prim.IndirectParameterEnable = true;
4912 prim.PredicateEnable = true;
4913 prim.VertexAccessType = RANDOM;
4914 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4915 }
4916
4917 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4918
4919 offset += stride;
4920 }
4921
4922 mi_value_unref(&b, max);
4923
4924 trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
4925
4926 }
4927
4928 void genX(CmdBeginTransformFeedbackEXT)(
4929 VkCommandBuffer commandBuffer,
4930 uint32_t firstCounterBuffer,
4931 uint32_t counterBufferCount,
4932 const VkBuffer* pCounterBuffers,
4933 const VkDeviceSize* pCounterBufferOffsets)
4934 {
4935 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4936
4937 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4938 assert(counterBufferCount <= MAX_XFB_BUFFERS);
4939 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4940
4941 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4942 *
4943 * "Ssoftware must ensure that no HW stream output operations can be in
4944 * process or otherwise pending at the point that the MI_LOAD/STORE
4945 * commands are processed. This will likely require a pipeline flush."
4946 */
4947 anv_add_pending_pipe_bits(cmd_buffer,
4948 ANV_PIPE_CS_STALL_BIT,
4949 "begin transform feedback");
4950 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4951
4952 for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4953 /* If we have a counter buffer, this is a resume so we need to load the
4954 * value into the streamout offset register. Otherwise, this is a begin
4955 * and we need to reset it to zero.
4956 */
4957 if (pCounterBuffers &&
4958 idx >= firstCounterBuffer &&
4959 idx - firstCounterBuffer < counterBufferCount &&
4960 pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4961 uint32_t cb_idx = idx - firstCounterBuffer;
4962 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4963 uint64_t offset = pCounterBufferOffsets ?
4964 pCounterBufferOffsets[cb_idx] : 0;
4965
4966 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4967 lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4968 lrm.MemoryAddress = anv_address_add(counter_buffer->address,
4969 offset);
4970 }
4971 } else {
4972 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4973 lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4974 lri.DataDWord = 0;
4975 }
4976 }
4977 }
4978
4979 cmd_buffer->state.xfb_enabled = true;
4980 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4981 }
4982
4983 void genX(CmdEndTransformFeedbackEXT)(
4984 VkCommandBuffer commandBuffer,
4985 uint32_t firstCounterBuffer,
4986 uint32_t counterBufferCount,
4987 const VkBuffer* pCounterBuffers,
4988 const VkDeviceSize* pCounterBufferOffsets)
4989 {
4990 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4991
4992 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4993 assert(counterBufferCount <= MAX_XFB_BUFFERS);
4994 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4995
4996 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4997 *
4998 * "Ssoftware must ensure that no HW stream output operations can be in
4999 * process or otherwise pending at the point that the MI_LOAD/STORE
5000 * commands are processed. This will likely require a pipeline flush."
5001 */
5002 anv_add_pending_pipe_bits(cmd_buffer,
5003 ANV_PIPE_CS_STALL_BIT,
5004 "end transform feedback");
5005 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5006
5007 for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
5008 unsigned idx = firstCounterBuffer + cb_idx;
5009
5010 /* If we have a counter buffer, store the current value of the
5011 * streamout offset register back into it so that a later resume
5012 * (a begin with this counter buffer) can pick up where we left off.
5013 */
5014 if (pCounterBuffers &&
5015 cb_idx < counterBufferCount &&
5016 pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
5017 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
5018 uint64_t offset = pCounterBufferOffsets ?
5019 pCounterBufferOffsets[cb_idx] : 0;
5020
5021 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
5022 srm.MemoryAddress = anv_address_add(counter_buffer->address,
5023 offset);
5024 srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
5025 }
5026 }
5027 }
5028
5029 cmd_buffer->state.xfb_enabled = false;
5030 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
5031 }
5032
5033 #if GFX_VERx10 >= 125
5034 void
5035 genX(CmdDrawMeshTasksNV)(
5036 VkCommandBuffer commandBuffer,
5037 uint32_t taskCount,
5038 uint32_t firstTask)
5039 {
5040 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5041
5042 if (anv_batch_has_error(&cmd_buffer->batch))
5043 return;
5044
5045 /* TODO(mesh): Check if this is not emitting more packets than we need. */
5046 genX(cmd_buffer_flush_state)(cmd_buffer);
5047
5048 if (cmd_buffer->state.conditional_render_enabled)
5049 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5050
5051 /* BSpec 54016 says: "The values passed for Starting ThreadGroup ID X
5052 * and ThreadGroup Count X shall not cause TGIDs to exceed (2^32)-1."
5053 */
5054 assert((int64_t)firstTask + taskCount - 1 <= UINT32_MAX);
5055
5056 anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_1D), m) {
5057 m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
5058 m.ThreadGroupCountX = taskCount;
5059 m.StartingThreadGroupIDX = firstTask;
5060 }
5061 }
5062
5063 #define GFX125_3DMESH_TG_COUNT 0x26F0
5064 #define GFX125_3DMESH_STARTING_TGID 0x26F4
5065 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
5066
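/* Load the VkDrawMeshTasksIndirectCommandNV fields from `addr` into the
 * registers consumed by an indirect 3DMESH_1D, optionally also loading the
 * draw index into extended parameter 0 for shaders that read it.
 */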
5067 static void
5068 mesh_load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
5069 struct mi_builder *b,
5070 struct anv_address addr,
5071 bool emit_xp0,
5072 uint32_t xp0)
5073 {
5074 const size_t taskCountOff = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
5075 const size_t firstTaskOff = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask);
5076
5077 mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
5078 mi_mem32(anv_address_add(addr, taskCountOff)));
5079
5080 mi_store(b, mi_reg32(GFX125_3DMESH_STARTING_TGID),
5081 mi_mem32(anv_address_add(addr, firstTaskOff)));
5082
5083 if (emit_xp0)
5084 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
5085 }
5086
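/* Emit an indirect 3DMESH_1D. When a task/mesh shader consumes the draw
 * index, the command carries one extra extended-parameter dword; it is
 * zeroed here since the actual value is supplied through GFX10_3DPRIM_XP(0)
 * by mesh_load_indirect_parameters.
 */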
5087 static void
5088 emit_indirect_3dmesh_1d(struct anv_batch *batch,
5089 bool predicate_enable,
5090 bool uses_drawid)
5091 {
5092 uint32_t len = GENX(3DMESH_1D_length) + uses_drawid;
5093 uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_1D),
5094 .PredicateEnable = predicate_enable,
5095 .IndirectParameterEnable = true,
5096 .ExtendedParameter0Present = uses_drawid);
5097 if (uses_drawid)
5098 dw[len - 1] = 0;
5099 }
5100
5101 void
5102 genX(CmdDrawMeshTasksIndirectNV)(
5103 VkCommandBuffer commandBuffer,
5104 VkBuffer _buffer,
5105 VkDeviceSize offset,
5106 uint32_t drawCount,
5107 uint32_t stride)
5108 {
5109 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5110 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5111 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
5112 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
5113 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
5114 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5115
5116 if (anv_batch_has_error(&cmd_buffer->batch))
5117 return;
5118
5119 genX(cmd_buffer_flush_state)(cmd_buffer);
5120
5121 if (cmd_state->conditional_render_enabled)
5122 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5123
5124 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
5125 mesh_prog_data->uses_drawid;
5126 struct mi_builder b;
5127 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5128
5129 for (uint32_t i = 0; i < drawCount; i++) {
5130 struct anv_address draw = anv_address_add(buffer->address, offset);
5131
5132 mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i);
5133
5134 emit_indirect_3dmesh_1d(&cmd_buffer->batch,
5135 cmd_state->conditional_render_enabled, uses_drawid);
5136
5137 offset += stride;
5138 }
5139 }
5140
5141 void
5142 genX(CmdDrawMeshTasksIndirectCountNV)(
5143 VkCommandBuffer commandBuffer,
5144 VkBuffer _buffer,
5145 VkDeviceSize offset,
5146 VkBuffer _countBuffer,
5147 VkDeviceSize countBufferOffset,
5148 uint32_t maxDrawCount,
5149 uint32_t stride)
5150 {
5151 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5152 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5153 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
5154 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
5155 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
5156 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
5157
5158 if (anv_batch_has_error(&cmd_buffer->batch))
5159 return;
5160
5161 genX(cmd_buffer_flush_state)(cmd_buffer);
5162
5163 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
5164 mesh_prog_data->uses_drawid;
5165
5166 struct mi_builder b;
5167 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5168
5169 struct mi_value max =
5170 prepare_for_draw_count_predicate(cmd_buffer, &b,
5171 count_buffer, countBufferOffset);
5172
5173 for (uint32_t i = 0; i < maxDrawCount; i++) {
5174 struct anv_address draw = anv_address_add(buffer->address, offset);
5175
5176 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
5177
5178 mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i);
5179
5180 emit_indirect_3dmesh_1d(&cmd_buffer->batch, true, uses_drawid);
5181
5182 offset += stride;
5183 }
5184 }
5185 #endif /* GFX_VERx10 >= 125 */
5186
5187 void
5188 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
5189 {
5190 struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
5191 struct anv_compute_pipeline *pipeline = comp_state->pipeline;
5192
5193 assert(pipeline->cs);
5194
5195 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5196
5197 genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5198
5199 /* Apply any pending pipeline flushes we may have. We want to apply them
5200 * now because, if any of those flushes are for things like push constants,
5201 * the GPU will read the state at weird times.
5202 */
5203 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5204
5205 if (cmd_buffer->state.compute.pipeline_dirty) {
5206 /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
5207 *
5208 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
5209 * the only bits that are changed are scoreboard related: Scoreboard
5210 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
5211 * these scoreboard related states, a MEDIA_STATE_FLUSH is
5212 * sufficient."
5213 */
5214 anv_add_pending_pipe_bits(cmd_buffer,
5215 ANV_PIPE_CS_STALL_BIT,
5216 "flush compute state");
5217 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5218
5219 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
5220
5221 /* The workgroup size of the pipeline affects our push constant layout
5222 * so flag push constants as dirty if we change the pipeline.
5223 */
5224 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5225 }
5226
5227 if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
5228 cmd_buffer->state.compute.pipeline_dirty) {
5229 flush_descriptor_sets(cmd_buffer,
5230 &cmd_buffer->state.compute.base,
5231 VK_SHADER_STAGE_COMPUTE_BIT,
5232 &pipeline->cs, 1);
5233 cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
5234
5235 #if GFX_VERx10 < 125
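      /* Prior to Gfx12.5 the binding table and sampler state pointers live
       * in INTERFACE_DESCRIPTOR_DATA: merge them into the pipeline's
       * template and point MEDIA_INTERFACE_DESCRIPTOR_LOAD at the result.
       */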
5236 uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
5237 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
5238 .BindingTablePointer =
5239 cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
5240 .SamplerStatePointer =
5241 cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
5242 };
5243 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
5244
5245 struct anv_state state =
5246 anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
5247 pipeline->interface_descriptor_data,
5248 GENX(INTERFACE_DESCRIPTOR_DATA_length),
5249 64);
5250
5251 uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
5252 anv_batch_emit(&cmd_buffer->batch,
5253 GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
5254 mid.InterfaceDescriptorTotalLength = size;
5255 mid.InterfaceDescriptorDataStartAddress = state.offset;
5256 }
5257 #endif
5258 }
5259
5260 if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
5261 comp_state->push_data =
5262 anv_cmd_buffer_cs_push_constants(cmd_buffer);
5263
5264 #if GFX_VERx10 < 125
5265 if (comp_state->push_data.alloc_size) {
5266 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
5267 curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
5268 curbe.CURBEDataStartAddress = comp_state->push_data.offset;
5269 }
5270 }
5271 #endif
5272
5273 cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
5274 }
5275
5276 cmd_buffer->state.compute.pipeline_dirty = false;
5277
5278 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5279 }
5280
5281 #if GFX_VER == 7
5282
5283 static VkResult
5284 verify_cmd_parser(const struct anv_device *device,
5285 int required_version,
5286 const char *function)
5287 {
5288 if (device->physical->cmd_parser_version < required_version) {
5289 return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
5290 "cmd parser version %d is required for %s",
5291 required_version, function);
5292 } else {
5293 return VK_SUCCESS;
5294 }
5295 }
5296
5297 #endif
5298
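/* Update the base workgroup ID push constants, flagging compute push
 * constants dirty only when the value actually changes.
 */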
5299 static void
5300 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
5301 uint32_t baseGroupX,
5302 uint32_t baseGroupY,
5303 uint32_t baseGroupZ)
5304 {
5305 if (anv_batch_has_error(&cmd_buffer->batch))
5306 return;
5307
5308 struct anv_push_constants *push =
5309 &cmd_buffer->state.compute.base.push_constants;
5310 if (push->cs.base_work_group_id[0] != baseGroupX ||
5311 push->cs.base_work_group_id[1] != baseGroupY ||
5312 push->cs.base_work_group_id[2] != baseGroupZ) {
5313 push->cs.base_work_group_id[0] = baseGroupX;
5314 push->cs.base_work_group_id[1] = baseGroupY;
5315 push->cs.base_work_group_id[2] = baseGroupZ;
5316
5317 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5318 }
5319 }
5320
5321 void genX(CmdDispatch)(
5322 VkCommandBuffer commandBuffer,
5323 uint32_t x,
5324 uint32_t y,
5325 uint32_t z)
5326 {
5327 genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
5328 }
5329
5330 #if GFX_VERx10 >= 125
5331
5332 static inline void
5333 emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
5334 const struct anv_compute_pipeline *pipeline, bool indirect,
5335 const struct brw_cs_prog_data *prog_data,
5336 uint32_t groupCountX, uint32_t groupCountY,
5337 uint32_t groupCountZ)
5338 {
5339 struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
5340 const struct anv_shader_bin *cs_bin = pipeline->cs;
5341 bool predicate = cmd_buffer->state.conditional_render_enabled;
5342
5343 const struct intel_device_info *devinfo = &pipeline->base.device->info;
5344 const struct brw_cs_dispatch_info dispatch =
5345 brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
5346
5347 anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5348 cw.IndirectParameterEnable = indirect;
5349 cw.PredicateEnable = predicate;
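      /* The SIMDSize field encodes SIMD8/16/32 as 0/1/2, which integer
       * division of the dispatch width by 16 happens to produce.
       */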
5350 cw.SIMDSize = dispatch.simd_size / 16;
5351 cw.IndirectDataStartAddress = comp_state->push_data.offset;
5352 cw.IndirectDataLength = comp_state->push_data.alloc_size;
5353 cw.LocalXMaximum = prog_data->local_size[0] - 1;
5354 cw.LocalYMaximum = prog_data->local_size[1] - 1;
5355 cw.LocalZMaximum = prog_data->local_size[2] - 1;
5356 cw.ThreadGroupIDXDimension = groupCountX;
5357 cw.ThreadGroupIDYDimension = groupCountY;
5358 cw.ThreadGroupIDZDimension = groupCountZ;
5359 cw.ExecutionMask = dispatch.right_mask;
5360 cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
5361
5362 cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5363 .KernelStartPointer = cs_bin->kernel.offset,
5364 .SamplerStatePointer =
5365 cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
5366 .BindingTablePointer =
5367 cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
5368 .BindingTableEntryCount =
5369 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
5370 .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
5371 .SharedLocalMemorySize = encode_slm_size(GFX_VER,
5372 prog_data->base.total_shared),
5373 .NumberOfBarriers = prog_data->uses_barrier,
5374 };
5375 }
5376 }
5377
5378 #else /* #if GFX_VERx10 >= 125 */
5379
5380 static inline void
5381 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
5382 const struct anv_compute_pipeline *pipeline, bool indirect,
5383 const struct brw_cs_prog_data *prog_data,
5384 uint32_t groupCountX, uint32_t groupCountY,
5385 uint32_t groupCountZ)
5386 {
5387 bool predicate = (GFX_VER <= 7 && indirect) ||
5388 cmd_buffer->state.conditional_render_enabled;
5389
5390 const struct intel_device_info *devinfo = &pipeline->base.device->info;
5391 const struct brw_cs_dispatch_info dispatch =
5392 brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
5393
5394 anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
5395 ggw.IndirectParameterEnable = indirect;
5396 ggw.PredicateEnable = predicate;
5397 ggw.SIMDSize = dispatch.simd_size / 16;
5398 ggw.ThreadDepthCounterMaximum = 0;
5399 ggw.ThreadHeightCounterMaximum = 0;
5400 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
5401 ggw.ThreadGroupIDXDimension = groupCountX;
5402 ggw.ThreadGroupIDYDimension = groupCountY;
5403 ggw.ThreadGroupIDZDimension = groupCountZ;
5404 ggw.RightExecutionMask = dispatch.right_mask;
5405 ggw.BottomExecutionMask = 0xffffffff;
5406 }
5407
5408 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
5409 }
5410
5411 #endif /* #if GFX_VERx10 >= 125 */
5412
5413 static inline void
5414 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
5415 const struct anv_compute_pipeline *pipeline, bool indirect,
5416 const struct brw_cs_prog_data *prog_data,
5417 uint32_t groupCountX, uint32_t groupCountY,
5418 uint32_t groupCountZ)
5419 {
5420 #if GFX_VERx10 >= 125
5421 emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
5422 groupCountY, groupCountZ);
5423 #else
5424 emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
5425 groupCountY, groupCountZ);
5426 #endif
5427 }
5428
5429 void genX(CmdDispatchBase)(
5430 VkCommandBuffer commandBuffer,
5431 uint32_t baseGroupX,
5432 uint32_t baseGroupY,
5433 uint32_t baseGroupZ,
5434 uint32_t groupCountX,
5435 uint32_t groupCountY,
5436 uint32_t groupCountZ)
5437 {
5438 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5439 struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5440 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5441
5442 anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
5443 baseGroupY, baseGroupZ);
5444
5445 if (anv_batch_has_error(&cmd_buffer->batch))
5446 return;
5447
5448 anv_measure_snapshot(cmd_buffer,
5449 INTEL_SNAPSHOT_COMPUTE,
5450 "compute",
5451 groupCountX * groupCountY * groupCountZ *
5452 prog_data->local_size[0] * prog_data->local_size[1] *
5453 prog_data->local_size[2]);
5454
5455 trace_intel_begin_compute(&cmd_buffer->trace);
5456
5457 if (prog_data->uses_num_work_groups) {
5458 struct anv_state state =
5459 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
5460 uint32_t *sizes = state.map;
5461 sizes[0] = groupCountX;
5462 sizes[1] = groupCountY;
5463 sizes[2] = groupCountZ;
5464 cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
5465 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5466 .offset = state.offset,
5467 };
5468
5469 /* The num_workgroups buffer goes in the binding table */
5470 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5471 }
5472
5473 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5474
5475 if (cmd_buffer->state.conditional_render_enabled)
5476 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5477
5478 emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
5479 groupCountY, groupCountZ);
5480
5481 trace_intel_end_compute(&cmd_buffer->trace,
5482 groupCountX, groupCountY, groupCountZ);
5483 }
5484
5485 #define GPGPU_DISPATCHDIMX 0x2500
5486 #define GPGPU_DISPATCHDIMY 0x2504
5487 #define GPGPU_DISPATCHDIMZ 0x2508
5488
5489 void genX(CmdDispatchIndirect)(
5490 VkCommandBuffer commandBuffer,
5491 VkBuffer _buffer,
5492 VkDeviceSize offset)
5493 {
5494 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5495 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5496 struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
5497 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
5498 struct anv_address addr = anv_address_add(buffer->address, offset);
5499 UNUSED struct anv_batch *batch = &cmd_buffer->batch;
5500
5501 anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
5502
5503 #if GFX_VER == 7
5504 /* Linux 4.4 added command parser version 5 which allows the GPGPU
5505 * indirect dispatch registers to be written.
5506 */
5507 if (verify_cmd_parser(cmd_buffer->device, 5,
5508 "vkCmdDispatchIndirect") != VK_SUCCESS)
5509 return;
5510 #endif
5511
5512 anv_measure_snapshot(cmd_buffer,
5513 INTEL_SNAPSHOT_COMPUTE,
5514 "compute indirect",
5515 0);
5516 trace_intel_begin_compute(&cmd_buffer->trace);
5517
5518 if (prog_data->uses_num_work_groups) {
5519 cmd_buffer->state.compute.num_workgroups = addr;
5520
5521 /* The num_workgroups buffer goes in the binding table */
5522 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
5523 }
5524
5525 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
5526
5527 struct mi_builder b;
5528 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5529
5530 struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
5531 struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
5532 struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
5533
5534 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
5535 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
5536 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
5537
5538 #if GFX_VER <= 7
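   /* Build an MI_PREDICATE that disables the GPGPU_WALKER whenever any of
    * the three indirect dimensions is zero; emit_gpgpu_walker enables
    * predication for indirect dispatches on Gfx7, so such dispatches become
    * no-ops instead of being handed to the walker.
    */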
5539 /* predicate = (compute_dispatch_indirect_x_size == 0); */
5540 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
5541 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5542 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5543 mip.LoadOperation = LOAD_LOAD;
5544 mip.CombineOperation = COMBINE_SET;
5545 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5546 }
5547
5548 /* predicate |= (compute_dispatch_indirect_y_size == 0); */
5549 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
5550 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5551 mip.LoadOperation = LOAD_LOAD;
5552 mip.CombineOperation = COMBINE_OR;
5553 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5554 }
5555
5556 /* predicate |= (compute_dispatch_indirect_z_size == 0); */
5557 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
5558 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5559 mip.LoadOperation = LOAD_LOAD;
5560 mip.CombineOperation = COMBINE_OR;
5561 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5562 }
5563
5564 /* predicate = !predicate; */
5565 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5566 mip.LoadOperation = LOAD_LOADINV;
5567 mip.CombineOperation = COMBINE_OR;
5568 mip.CompareOperation = COMPARE_FALSE;
5569 }
5570
5571 #if GFX_VERx10 == 75
5572 if (cmd_buffer->state.conditional_render_enabled) {
5573 /* predicate &= !(conditional_rendering_predicate == 0); */
5574 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
5575 mi_reg32(ANV_PREDICATE_RESULT_REG));
5576 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
5577 mip.LoadOperation = LOAD_LOADINV;
5578 mip.CombineOperation = COMBINE_AND;
5579 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5580 }
5581 }
5582 #endif
5583
5584 #else /* GFX_VER > 7 */
5585 if (cmd_buffer->state.conditional_render_enabled)
5586 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
5587 #endif
5588
5589 emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
5590
5591 trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
5592 }
5593
5594 struct anv_state
5595 genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
5596 {
5597 #if GFX_VERx10 >= 125
5598 struct anv_device *device = cmd_buffer->device;
5599
5600 struct anv_state state =
5601 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5602 BRW_RT_DISPATCH_GLOBALS_SIZE,
5603 64);
5604 struct brw_rt_scratch_layout layout;
5605 uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
5606 * some cases?
5607 */
5608 brw_rt_compute_scratch_layout(&layout, &device->info,
5609 stack_ids_per_dss, 1 << 10);
5610
5611 struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5612 .MemBaseAddress = (struct anv_address) {
5613 /* The ray query HW computes offsets from the top of the buffer, so
5614 * point the base address at the end of the buffer.
5615 */
5616 .bo = device->ray_query_bo,
5617 .offset = device->ray_query_bo->size
5618 },
5619 .AsyncRTStackSize = layout.ray_stack_stride / 64,
5620 .NumDSSRTStacks = layout.stack_ids_per_dss,
5621 .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5622 .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5623 .ResumeShaderTable = (struct anv_address) {
5624 .bo = cmd_buffer->state.ray_query_shadow_bo,
5625 },
5626 };
5627 GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg);
5628
5629 return state;
5630 #else
5631 unreachable("Not supported");
5632 #endif
5633 }
5634
5635 #if GFX_VERx10 >= 125
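/* Distribute three doublings (a local size of 2^3 = 8 invocations) across
 * the X/Y/Z dimensions of a TraceRays dispatch: each dimension is grown
 * while it still falls short of its global size, and any leftover shift is
 * given to X.
 */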
5636 static void
5637 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
5638 {
5639 unsigned total_shift = 0;
5640 memset(local_shift, 0, 3);
5641
5642 bool progress;
5643 do {
5644 progress = false;
5645 for (unsigned i = 0; i < 3; i++) {
5646 assert(global[i] > 0);
5647 if ((1 << local_shift[i]) < global[i]) {
5648 progress = true;
5649 local_shift[i]++;
5650 total_shift++;
5651 }
5652
5653 if (total_shift == 3)
5654 return;
5655 }
5656 } while(progress);
5657
5658 /* Assign whatever's left to x */
5659 local_shift[0] += 3 - total_shift;
5660 }
5661
5662 static struct GFX_RT_SHADER_TABLE
5663 vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
5664 {
5665 return (struct GFX_RT_SHADER_TABLE) {
5666 .BaseAddress = anv_address_from_u64(region->deviceAddress),
5667 .Stride = region->stride,
5668 };
5669 }
5670
5671 static void
5672 cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
5673 const VkStridedDeviceAddressRegionKHR *raygen_sbt,
5674 const VkStridedDeviceAddressRegionKHR *miss_sbt,
5675 const VkStridedDeviceAddressRegionKHR *hit_sbt,
5676 const VkStridedDeviceAddressRegionKHR *callable_sbt,
5677 bool is_indirect,
5678 uint32_t launch_width,
5679 uint32_t launch_height,
5680 uint32_t launch_depth,
5681 uint64_t launch_size_addr)
5682 {
5683 struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
5684 struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
5685
5686 if (anv_batch_has_error(&cmd_buffer->batch))
5687 return;
5688
5689 /* If we have a known degenerate launch size, just bail */
5690 if (!is_indirect &&
5691 (launch_width == 0 || launch_height == 0 || launch_depth == 0))
5692 return;
5693
5694 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
5695 genX(flush_pipeline_select_gpgpu)(cmd_buffer);
5696
5697 cmd_buffer->state.rt.pipeline_dirty = false;
5698
5699 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5700
5701 /* Add the scratch BO to the reloc list manually, as it's an internal
5702 * buffer that has no relocs of its own to pick it up.
5703 *
5704 * TODO(RT): This is a bit of a hack
5705 */
5706 anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
5707 cmd_buffer->batch.alloc,
5708 rt->scratch.bo);
5709
5710 /* Allocate and set up our RT_DISPATCH_GLOBALS */
5711 struct anv_state rtdg_state =
5712 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
5713 BRW_RT_PUSH_CONST_OFFSET +
5714 sizeof(struct anv_push_constants),
5715 64);
5716
5717 struct GFX_RT_DISPATCH_GLOBALS rtdg = {
5718 .MemBaseAddress = (struct anv_address) {
5719 .bo = rt->scratch.bo,
5720 .offset = rt->scratch.layout.ray_stack_start,
5721 },
5722 .CallStackHandler =
5723 anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),
5724 .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
5725 .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
5726 .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
5727 .Flags = RT_DEPTH_TEST_LESS_EQUAL,
5728 .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
5729 .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
5730 .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
5731 .LaunchWidth = launch_width,
5732 .LaunchHeight = launch_height,
5733 .LaunchDepth = launch_depth,
5734 .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
5735 };
5736 GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
5737
5738 /* Push constants go after the RT_DISPATCH_GLOBALS */
5739 assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
5740 memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
5741 &cmd_buffer->state.rt.base.push_constants,
5742 sizeof(struct anv_push_constants));
5743
5744 struct anv_address rtdg_addr = {
5745 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5746 .offset = rtdg_state.offset,
5747 };
5748
5749 uint8_t local_size_log2[3];
5750 uint32_t global_size[3] = {};
5751 if (is_indirect) {
5752 /* Pick a local size that's probably ok. We assume most TraceRays calls
5753 * will use a two-dimensional dispatch size. Worst case, our initial
5754 * dispatch will be a little slower than it has to be.
5755 */
5756 local_size_log2[0] = 2;
5757 local_size_log2[1] = 1;
5758 local_size_log2[2] = 0;
5759
5760 struct mi_builder b;
5761 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
5762
5763 struct mi_value launch_size[3] = {
5764 mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
5765 mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
5766 mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
5767 };
5768
5769 /* Store the original launch size into RT_DISPATCH_GLOBALS
5770 *
5771 * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
5772 * moved into a genX version.
5773 */
5774 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
5775 mi_value_ref(&b, launch_size[0]));
5776 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
5777 mi_value_ref(&b, launch_size[1]));
5778 mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
5779 mi_value_ref(&b, launch_size[2]));
5780
5781 /* Compute the global dispatch size */
5782 for (unsigned i = 0; i < 3; i++) {
5783 if (local_size_log2[i] == 0)
5784 continue;
5785
5786 /* global_size = DIV_ROUND_UP(launch_size, local_size)
5787 *
5788 * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm
5789 * has the semantics of shifting the entire 64-bit value and taking
5790 * the bottom 32 bits, so we don't have to worry about roll-over.
5791 */
5792 uint32_t local_size = 1 << local_size_log2[i];
5793 launch_size[i] = mi_iadd(&b, launch_size[i],
5794 mi_imm(local_size - 1));
5795 launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
5796 local_size_log2[i]);
5797 }
5798
5799 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
5800 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
5801 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
5802 } else {
5803 uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
5804 calc_local_trace_size(local_size_log2, launch_size);
5805
5806 for (unsigned i = 0; i < 3; i++) {
5807 /* We have to be a bit careful here because DIV_ROUND_UP adds to the
5808 * numerator, which may overflow. Cast to uint64_t to avoid this.
5809 */
5810 uint32_t local_size = 1 << local_size_log2[i];
5811 global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
5812 }
5813 }
5814
5815 anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
5816 cw.IndirectParameterEnable = is_indirect;
5817 cw.PredicateEnable = false;
5818 cw.SIMDSize = SIMD8;
5819 cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
5820 cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
5821 cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
5822 cw.ThreadGroupIDXDimension = global_size[0];
5823 cw.ThreadGroupIDYDimension = global_size[1];
5824 cw.ThreadGroupIDZDimension = global_size[2];
5825 cw.ExecutionMask = 0xff;
5826 cw.EmitInlineParameter = true;
5827 cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
5828
5829 const gl_shader_stage s = MESA_SHADER_RAYGEN;
5830 struct anv_device *device = cmd_buffer->device;
5831 struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
5832 struct anv_state *samplers = &cmd_buffer->state.samplers[s];
5833 cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
5834 .KernelStartPointer = device->rt_trampoline->kernel.offset,
5835 .SamplerStatePointer = samplers->offset,
5836 /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
5837 .SamplerCount = 0,
5838 .BindingTablePointer = surfaces->offset,
5839 .NumberofThreadsinGPGPUThreadGroup = 1,
5840 .BTDMode = true,
5841 };
5842
5843 struct brw_rt_raygen_trampoline_params trampoline_params = {
5844 .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
5845 .raygen_bsr_addr = raygen_sbt->deviceAddress,
5846 .is_indirect = is_indirect,
5847 .local_group_size_log2 = {
5848 local_size_log2[0],
5849 local_size_log2[1],
5850 local_size_log2[2],
5851 },
5852 };
5853 STATIC_ASSERT(sizeof(trampoline_params) == 32);
5854 memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
5855 }
5856 }
5857
5858 void
5859 genX(CmdTraceRaysKHR)(
5860 VkCommandBuffer commandBuffer,
5861 const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
5862 const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
5863 const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
5864 const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
5865 uint32_t width,
5866 uint32_t height,
5867 uint32_t depth)
5868 {
5869 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5870
5871 cmd_buffer_trace_rays(cmd_buffer,
5872 pRaygenShaderBindingTable,
5873 pMissShaderBindingTable,
5874 pHitShaderBindingTable,
5875 pCallableShaderBindingTable,
5876 false /* is_indirect */,
5877 width, height, depth,
5878 0 /* launch_size_addr */);
5879 }
5880
5881 void
5882 genX(CmdTraceRaysIndirectKHR)(
5883 VkCommandBuffer commandBuffer,
5884 const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
5885 const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
5886 const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
5887 const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
5888 VkDeviceAddress indirectDeviceAddress)
5889 {
5890 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5891
5892 cmd_buffer_trace_rays(cmd_buffer,
5893 pRaygenShaderBindingTable,
5894 pMissShaderBindingTable,
5895 pHitShaderBindingTable,
5896 pCallableShaderBindingTable,
5897 true /* is_indirect */,
5898 0, 0, 0, /* width, height, depth, */
5899 indirectDeviceAddress);
5900 }
5901 #endif /* GFX_VERx10 >= 125 */
5902
5903 static void
5904 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
5905 uint32_t pipeline)
5906 {
5907 UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
5908
5909 if (cmd_buffer->state.current_pipeline == pipeline)
5910 return;
5911
5912 #if GFX_VER >= 8 && GFX_VER < 10
5913 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
5914 *
5915 * Software must clear the COLOR_CALC_STATE Valid field in
5916 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
5917 * with Pipeline Select set to GPGPU.
5918 *
5919 * The internal hardware docs recommend the same workaround for Gfx9
5920 * hardware too.
5921 */
5922 if (pipeline == GPGPU)
5923 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
5924 #endif
5925
5926 #if GFX_VER == 9
5927 if (pipeline == _3D) {
5928 /* There is a mid-object preemption workaround which requires you to
5929 * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However,
5930 * even without preemption, we have issues with geometry flickering when
5931 * GPGPU and 3D are back-to-back and this seems to fix it. We don't
5932 * really know why.
5933 */
5934 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
5935 vfe.MaximumNumberofThreads =
5936 devinfo->max_cs_threads * devinfo->subslice_total - 1;
5937 vfe.NumberofURBEntries = 2;
5938 vfe.URBEntryAllocationSize = 2;
5939 }
5940
5941 /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
5942 * invalid. Set the compute pipeline to dirty to force a re-emit of the
5943 * pipeline in case we get back-to-back dispatch calls with the same
5944 * pipeline and a PIPELINE_SELECT in between.
5945 */
5946 cmd_buffer->state.compute.pipeline_dirty = true;
5947 }
5948 #endif
5949
5950 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
5951 * PIPELINE_SELECT [DevBWR+]":
5952 *
5953 * Project: DEVSNB+
5954 *
5955 * Software must ensure all the write caches are flushed through a
5956 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
5957 * command to invalidate read only caches prior to programming
5958 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
5959 *
5960 * Note the cmd_buffer_apply_pipe_flushes will split this into two
5961 * PIPE_CONTROLs.
5962 */
5963 anv_add_pending_pipe_bits(cmd_buffer,
5964 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
5965 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
5966 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
5967 ANV_PIPE_CS_STALL_BIT |
5968 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5969 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
5970 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
5971 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT,
5972 "flush and invalidate for PIPELINE_SELECT");
5973 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5974
5975 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
5976 #if GFX_VER >= 9
5977 ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
5978 ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
5979 #endif
5980 ps.PipelineSelection = pipeline;
5981 }
5982
5983 #if GFX_VER == 9
5984 if (devinfo->platform == INTEL_PLATFORM_GLK) {
5985 /* Project: DevGLK
5986 *
5987 * "This chicken bit works around a hardware issue with barrier logic
5988 * encountered when switching between GPGPU and 3D pipelines. To
5989 * workaround the issue, this mode bit should be set after a pipeline
5990 * is selected."
5991 */
5992 anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
5993 scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
5994 : GLK_BARRIER_MODE_3D_HULL;
5995 scec1.GLKBarrierModeMask = 1;
5996 }
5997 }
5998 #endif
5999
6000 cmd_buffer->state.current_pipeline = pipeline;
6001 }
6002
6003 void
6004 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
6005 {
6006 genX(flush_pipeline_select)(cmd_buffer, _3D);
6007 }
6008
6009 void
6010 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
6011 {
6012 genX(flush_pipeline_select)(cmd_buffer, GPGPU);
6013 }
6014
6015 void
6016 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
6017 {
6018 if (GFX_VER >= 8)
6019 return;
6020
6021 /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
6022 *
6023 * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
6024 * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
6025 * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
6026 * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
6027 * set), followed by a pipelined depth cache flush (PIPE_CONTROL with
6028 * Depth Flush Bit set, followed by another pipelined depth stall
6029 * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
6030 * guarantee that the pipeline from WM onwards is already flushed (e.g.,
6031 * via a preceding MI_FLUSH)."
6032 */
6033 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6034 pipe.DepthStallEnable = true;
6035 anv_debug_dump_pc(pipe);
6036 }
6037 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6038 pipe.DepthCacheFlushEnable = true;
6039 #if GFX_VER >= 12
6040 pipe.TileCacheFlushEnable = true;
6041 #endif
6042 anv_debug_dump_pc(pipe);
6043 }
6044 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
6045 pipe.DepthStallEnable = true;
6046 anv_debug_dump_pc(pipe);
6047 }
6048 }
6049
6050 void
6051 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
6052 const struct isl_surf *surf)
6053 {
6054 #if GFX_VERx10 == 120
6055 const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
6056 surf->samples == 1;
6057
6058 switch (cmd_buffer->state.depth_reg_mode) {
6059 case ANV_DEPTH_REG_MODE_HW_DEFAULT:
6060 if (!is_d16_1x_msaa)
6061 return;
6062 break;
6063 case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
6064 if (is_d16_1x_msaa)
6065 return;
6066 break;
6067 case ANV_DEPTH_REG_MODE_UNKNOWN:
6068 break;
6069 }
6070
6071 /* We'll change some CHICKEN registers depending on the depth surface
6072 * format. Do a depth flush and stall so the pipeline is not using these
6073 * settings while we change the registers.
6074 */
6075 anv_add_pending_pipe_bits(cmd_buffer,
6076 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
6077 ANV_PIPE_DEPTH_STALL_BIT |
6078 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
6079 "Workaround: Stop pipeline for 14010455700");
6080 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6081
6082 /* Wa_14010455700
6083 *
6084 * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6085 * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
6086 */
6087 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6088 reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
6089 reg.HIZPlaneOptimizationdisablebitMask = true;
6090 }
6091
6092 cmd_buffer->state.depth_reg_mode =
6093 is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
6094 ANV_DEPTH_REG_MODE_HW_DEFAULT;
6095 #endif
6096 }
6097
6098 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
6099 *
6100 * "The VF cache needs to be invalidated before binding and then using
6101 * Vertex Buffers that overlap with any previously bound Vertex Buffer
6102 * (at a 64B granularity) since the last invalidation. A VF cache
6103 * invalidate is performed by setting the "VF Cache Invalidation Enable"
6104 * bit in PIPE_CONTROL."
6105 *
6106 * This is implemented by carefully tracking all vertex and index buffer
6107 * bindings and flushing if the cache ever ends up with a range in the cache
6108 * that would exceed 4 GiB. This is implemented in three parts:
6109 *
6110 * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
6111 * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
6112 * tracking code of the new binding. If this new binding would cause
6113 * the cache to have a too-large range on the next draw call, a pipeline
6114 * stall and VF cache invalidate are added to pending_pipeline_bits.
6115 *
6116 * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
6117 * empty whenever we emit a VF invalidate.
6118 *
6119 * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
6120 * after every 3DPRIMITIVE and copies the bound range into the dirty
6121 * range for each used buffer. This has to be a separate step because
6122 * we don't always re-bind all buffers and so 1. can't know which
6123 * buffers are actually bound.
6124 */
6125 void
6126 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
6127 int vb_index,
6128 struct anv_address vb_address,
6129 uint32_t vb_size)
6130 {
6131 if (GFX_VER < 8 || GFX_VER > 9 ||
6132 anv_use_relocations(cmd_buffer->device->physical))
6133 return;
6134
6135 struct anv_vb_cache_range *bound, *dirty;
6136 if (vb_index == -1) {
6137 bound = &cmd_buffer->state.gfx.ib_bound_range;
6138 dirty = &cmd_buffer->state.gfx.ib_dirty_range;
6139 } else {
6140 assert(vb_index >= 0);
6141 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
6142 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
6143 bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
6144 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
6145 }
6146
6147 if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
6148 vb_address,
6149 vb_size)) {
6150 anv_add_pending_pipe_bits(cmd_buffer,
6151 ANV_PIPE_CS_STALL_BIT |
6152 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
6153 "vb > 32b range");
6154 }
6155 }
6156
6157 void
6158 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
6159 uint32_t access_type,
6160 uint64_t vb_used)
6161 {
6162 if (GFX_VER < 8 || GFX_VER > 9 ||
6163 anv_use_relocations(cmd_buffer->device->physical))
6164 return;
6165
6166 if (access_type == RANDOM) {
6167 /* We have an index buffer */
6168 struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
6169 struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
6170
6171 if (bound->end > bound->start) {
6172 dirty->start = MIN2(dirty->start, bound->start);
6173 dirty->end = MAX2(dirty->end, bound->end);
6174 }
6175 }
6176
6177 uint64_t mask = vb_used;
6178 while (mask) {
6179 int i = u_bit_scan64(&mask);
6180 assert(i >= 0);
6181 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
6182 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
6183
6184 struct anv_vb_cache_range *bound, *dirty;
6185 bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
6186 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
6187
6188 if (bound->end > bound->start) {
6189 dirty->start = MIN2(dirty->start, bound->start);
6190 dirty->end = MAX2(dirty->end, bound->end);
6191 }
6192 }
6193 }
6194
6195 /**
6196 * Update the pixel hashing modes that determine the balancing of PS threads
6197 * across subslices and slices.
6198 *
6199 * \param width Width bound of the rendering area (already scaled down if \p
6200 * scale is greater than 1).
6201 * \param height Height bound of the rendering area (already scaled down if \p
6202 * scale is greater than 1).
6203 * \param scale The number of framebuffer samples that could potentially be
6204 * affected by an individual channel of the PS thread. This is
6205 * typically one for single-sampled rendering, but for operations
6206 * like CCS resolves and fast clears a single PS invocation may
6207 * update a huge number of pixels, in which case a finer
6208 * balancing is desirable in order to maximally utilize the
6209 * bandwidth available. UINT_MAX can be used as shorthand for
6210 * "finest hashing mode available".
6211 */
6212 void
6213 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
6214 unsigned width, unsigned height,
6215 unsigned scale)
6216 {
6217 #if GFX_VER == 9
6218 const struct intel_device_info *devinfo = &cmd_buffer->device->info;
6219 const unsigned slice_hashing[] = {
6220 /* Because all Gfx9 platforms with more than one slice require
6221 * three-way subslice hashing, a single "normal" 16x16 slice hashing
6222 * block is guaranteed to suffer from substantial imbalance, with one
6223 * subslice receiving twice as much work as the other two in the
6224 * slice.
6225 *
6226 * The performance impact of that would be particularly severe when
6227 * three-way hashing is also in use for slice balancing (which is the
6228 * case for all Gfx9 GT4 platforms), because one of the slices
6229 * receives one every three 16x16 blocks in either direction, which
6230 * is roughly the periodicity of the underlying subslice imbalance
6231 * pattern ("roughly" because in reality the hardware's
6232 * implementation of three-way hashing doesn't do exact modulo 3
6233 * arithmetic, which somewhat decreases the magnitude of this effect
6234 * in practice). This leads to a systematic subslice imbalance
6235 * within that slice regardless of the size of the primitive. The
6236 * 32x32 hashing mode guarantees that the subslice imbalance within a
6237 * single slice hashing block is minimal, largely eliminating this
6238 * effect.
6239 */
6240 _32x32,
6241 /* Finest slice hashing mode available. */
6242 NORMAL
6243 };
6244 const unsigned subslice_hashing[] = {
6245 /* 16x16 would provide a slight cache locality benefit especially
6246 * visible in the sampler L1 cache efficiency of low-bandwidth
6247 * non-LLC platforms, but it comes at the cost of greater subslice
6248 * imbalance for primitives of dimensions approximately intermediate
6249 * between 16x4 and 16x16.
6250 */
6251 _16x4,
6252 /* Finest subslice hashing mode available. */
6253 _8x4
6254 };
6255 /* Dimensions of the smallest hashing block of a given hashing mode. If
6256 * the rendering area is smaller than this there can't possibly be any
6257 * benefit from switching to this mode, so we optimize out the
6258 * transition.
6259 */
6260 const unsigned min_size[][2] = {
6261 { 16, 4 },
6262 { 8, 4 }
6263 };
6264 const unsigned idx = scale > 1;
6265
6266 if (cmd_buffer->state.current_hash_scale != scale &&
6267 (width > min_size[idx][0] || height > min_size[idx][1])) {
6268 anv_add_pending_pipe_bits(cmd_buffer,
6269 ANV_PIPE_CS_STALL_BIT |
6270 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6271 "change pixel hash mode");
6272 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6273
6274 anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
6275 gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
6276 gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
6277 gt.SubsliceHashing = subslice_hashing[idx];
6278 gt.SubsliceHashingMask = -1;
6279 }
6280
6281 cmd_buffer->state.current_hash_scale = scale;
6282 }
6283 #endif
6284 }
6285
6286 static void
6287 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
6288 {
6289 struct anv_device *device = cmd_buffer->device;
6290 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6291
6292 /* FIXME: Width and Height are wrong */
6293
6294 genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
6295
6296 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
6297 device->isl_dev.ds.size / 4);
6298 if (dw == NULL)
6299 return;
6300
6301 struct isl_view isl_view = {};
6302 struct isl_depth_stencil_hiz_emit_info info = {
6303 .view = &isl_view,
6304 .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
6305 };
6306
6307 if (gfx->depth_att.iview != NULL) {
6308 isl_view = gfx->depth_att.iview->planes[0].isl;
6309 } else if (gfx->stencil_att.iview != NULL) {
6310 isl_view = gfx->stencil_att.iview->planes[0].isl;
6311 }
6312
6313 if (gfx->view_mask) {
6314 assert(isl_view.array_len == 0 ||
6315 isl_view.array_len >= util_last_bit(gfx->view_mask));
6316 isl_view.array_len = util_last_bit(gfx->view_mask);
6317 } else {
6318 assert(isl_view.array_len == 0 ||
6319 isl_view.array_len >= util_last_bit(gfx->layer_count));
6320 isl_view.array_len = gfx->layer_count;
6321 }
6322
6323 if (gfx->depth_att.iview != NULL) {
6324 const struct anv_image_view *iview = gfx->depth_att.iview;
6325 const struct anv_image *image = iview->image;
6326
6327 const uint32_t depth_plane =
6328 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
6329 const struct anv_surface *depth_surface =
6330 &image->planes[depth_plane].primary_surface;
6331 const struct anv_address depth_address =
6332 anv_image_address(image, &depth_surface->memory_range);
6333
6334 info.depth_surf = &depth_surface->isl;
6335
6336 info.depth_address =
6337 anv_batch_emit_reloc(&cmd_buffer->batch,
6338 dw + device->isl_dev.ds.depth_offset / 4,
6339 depth_address.bo, depth_address.offset);
6340 info.mocs =
6341 anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
6342
6343 info.hiz_usage = gfx->depth_att.aux_usage;
6344 if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
6345 assert(isl_aux_usage_has_hiz(info.hiz_usage));
6346
6347 const struct anv_surface *hiz_surface =
6348 &image->planes[depth_plane].aux_surface;
6349 const struct anv_address hiz_address =
6350 anv_image_address(image, &hiz_surface->memory_range);
6351
6352 info.hiz_surf = &hiz_surface->isl;
6353
6354 info.hiz_address =
6355 anv_batch_emit_reloc(&cmd_buffer->batch,
6356 dw + device->isl_dev.ds.hiz_offset / 4,
6357 hiz_address.bo, hiz_address.offset);
6358
6359 info.depth_clear_value = ANV_HZ_FC_VAL;
6360 }
6361 }
6362
6363 if (gfx->stencil_att.iview != NULL) {
6364 const struct anv_image_view *iview = gfx->stencil_att.iview;
6365 const struct anv_image *image = iview->image;
6366
6367 const uint32_t stencil_plane =
6368 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
6369 const struct anv_surface *stencil_surface =
6370 &image->planes[stencil_plane].primary_surface;
6371 const struct anv_address stencil_address =
6372 anv_image_address(image, &stencil_surface->memory_range);
6373
6374 info.stencil_surf = &stencil_surface->isl;
6375
6376 info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
6377 info.stencil_address =
6378 anv_batch_emit_reloc(&cmd_buffer->batch,
6379 dw + device->isl_dev.ds.stencil_offset / 4,
6380 stencil_address.bo, stencil_address.offset);
6381 info.mocs =
6382 anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
6383 }
6384
6385 isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
6386
6387 if (info.depth_surf)
6388 genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
6389
6390 if (GFX_VER >= 12) {
6391 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6392 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6393
6394 /* Wa_1408224581
6395 *
6396 * Workaround: Gfx12LP Astep only An additional pipe control with
6397 * post-sync = store dword operation would be required.( w/a is to
6398 * have an additional pipe control after the stencil state whenever
6399 * the surface state bits of this state is changing).
6400 *
6401 * This also seems sufficient to handle Wa_14014148106.
6402 */
6403 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6404 pc.PostSyncOperation = WriteImmediateData;
6405 pc.Address = cmd_buffer->device->workaround_address;
6406 }
6407 }
6408 cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
6409 }
6410
6411 static void
6412 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
6413 const struct anv_image_view *fsr_iview)
6414 {
6415 #if GFX_VERx10 >= 125
6416 struct anv_device *device = cmd_buffer->device;
6417
6418 if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
6419 return;
6420
6421 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
6422 device->isl_dev.cpb.size / 4);
6423 if (dw == NULL)
6424 return;
6425
6426 struct isl_cpb_emit_info info = { };
6427
6428 if (fsr_iview) {
6429 info.view = &fsr_iview->planes[0].isl;
6430 info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
6431 info.address =
6432 anv_batch_emit_reloc(&cmd_buffer->batch,
6433 dw + device->isl_dev.cpb.offset / 4,
6434 fsr_iview->image->bindings[0].address.bo,
6435 fsr_iview->image->bindings[0].address.offset +
6436 fsr_iview->image->bindings[0].memory_range.offset);
6437 info.mocs =
6438 anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
6439 ISL_SURF_USAGE_CPB_BIT);
6440 }
6441
6442 isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
6443 #endif /* GFX_VERx10 >= 125 */
6444 }
6445
6446 static VkImageLayout
6447 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
6448 {
6449 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
6450 vk_find_struct_const(att->pNext,
6451 RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
6452 if (layout_info != NULL)
6453 return layout_info->initialLayout;
6454
6455 return att->imageLayout;
6456 }
6457
6458 void genX(CmdBeginRendering)(
6459 VkCommandBuffer commandBuffer,
6460 const VkRenderingInfo* pRenderingInfo)
6461 {
6462 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6463 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6464 VkResult result;
6465
6466 if (!is_render_queue_cmd_buffer(cmd_buffer)) {
6467 assert(!"Trying to start a render pass on non-render queue!");
6468 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
6469 return;
6470 }
6471
6472 anv_measure_beginrenderpass(cmd_buffer);
6473 trace_intel_begin_render_pass(&cmd_buffer->trace);
6474
6475 gfx->rendering_flags = pRenderingInfo->flags;
6476 gfx->render_area = pRenderingInfo->renderArea;
6477 gfx->view_mask = pRenderingInfo->viewMask;
6478 gfx->layer_count = pRenderingInfo->layerCount;
6479 gfx->samples = 0;
6480
6481 const bool is_multiview = gfx->view_mask != 0;
6482 const VkRect2D render_area = gfx->render_area;
6483 const uint32_t layers =
6484 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
6485
6486 /* The framebuffer size is at least large enough to contain the render
6487 * area. Because a zero renderArea is possible, we MAX with 1.
6488 */
6489 struct isl_extent3d fb_size = {
6490 .w = MAX2(1, render_area.offset.x + render_area.extent.width),
6491 .h = MAX2(1, render_area.offset.y + render_area.extent.height),
6492 .d = layers,
6493 };
6494
6495 const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
6496 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
6497 if (result != VK_SUCCESS)
6498 return;
6499
6500 genX(flush_pipeline_select_3d)(cmd_buffer);
6501
6502 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
6503 if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
6504 continue;
6505
6506 const VkRenderingAttachmentInfo *att =
6507 &pRenderingInfo->pColorAttachments[i];
6508 ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
6509 const VkImageLayout initial_layout = attachment_initial_layout(att);
6510
6511 assert(render_area.offset.x + render_area.extent.width <=
6512 iview->vk.extent.width);
6513 assert(render_area.offset.y + render_area.extent.height <=
6514 iview->vk.extent.height);
6515 assert(layers <= iview->vk.layer_count);
6516
6517 fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
6518 fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
6519
6520 assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
6521 gfx->samples |= iview->vk.image->samples;
6522
6523 enum isl_aux_usage aux_usage =
6524 anv_layout_to_aux_usage(&cmd_buffer->device->info,
6525 iview->image,
6526 VK_IMAGE_ASPECT_COLOR_BIT,
6527 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
6528 att->imageLayout);
6529
6530 union isl_color_value fast_clear_color = { .u32 = { 0, } };
6531
6532 if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6533 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
6534 const union isl_color_value clear_color =
6535 vk_to_isl_color_with_format(att->clearValue.color,
6536 iview->planes[0].isl.format);
6537
6538 /* We only support fast-clears on the first layer */
6539 const bool fast_clear =
6540 (!is_multiview || (gfx->view_mask & 1)) &&
6541 anv_can_fast_clear_color_view(cmd_buffer->device, iview,
6542 att->imageLayout, clear_color,
6543 layers, render_area);
6544
6545 if (att->imageLayout != initial_layout) {
6546 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6547 render_area.extent.width == iview->vk.extent.width &&
6548 render_area.extent.height == iview->vk.extent.height);
6549 if (is_multiview) {
6550 u_foreach_bit(view, gfx->view_mask) {
6551 transition_color_buffer(cmd_buffer, iview->image,
6552 VK_IMAGE_ASPECT_COLOR_BIT,
6553 iview->vk.base_mip_level, 1,
6554 iview->vk.base_array_layer + view,
6555 1, /* layer_count */
6556 initial_layout, att->imageLayout,
6557 VK_QUEUE_FAMILY_IGNORED,
6558 VK_QUEUE_FAMILY_IGNORED,
6559 fast_clear);
6560 }
6561 } else {
6562 transition_color_buffer(cmd_buffer, iview->image,
6563 VK_IMAGE_ASPECT_COLOR_BIT,
6564 iview->vk.base_mip_level, 1,
6565 iview->vk.base_array_layer,
6566 gfx->layer_count,
6567 initial_layout, att->imageLayout,
6568 VK_QUEUE_FAMILY_IGNORED,
6569 VK_QUEUE_FAMILY_IGNORED,
6570 fast_clear);
6571 }
6572 }
6573
6574 uint32_t clear_view_mask = pRenderingInfo->viewMask;
6575 uint32_t base_clear_layer = iview->vk.base_array_layer;
6576 uint32_t clear_layer_count = gfx->layer_count;
6577 if (fast_clear) {
6578 /* We only support fast-clears on the first layer */
6579 assert(iview->vk.base_mip_level == 0 &&
6580 iview->vk.base_array_layer == 0);
6581
6582 fast_clear_color = clear_color;
6583
6584 if (iview->image->vk.samples == 1) {
6585 anv_image_ccs_op(cmd_buffer, iview->image,
6586 iview->planes[0].isl.format,
6587 iview->planes[0].isl.swizzle,
6588 VK_IMAGE_ASPECT_COLOR_BIT,
6589 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
6590 &fast_clear_color,
6591 false);
6592 } else {
6593 anv_image_mcs_op(cmd_buffer, iview->image,
6594 iview->planes[0].isl.format,
6595 iview->planes[0].isl.swizzle,
6596 VK_IMAGE_ASPECT_COLOR_BIT,
6597 0, 1, ISL_AUX_OP_FAST_CLEAR,
6598 &fast_clear_color,
6599 false);
6600 }
6601 clear_view_mask &= ~1u;
6602 base_clear_layer++;
6603 clear_layer_count--;
6604
6605 if (isl_color_value_is_zero(clear_color,
6606 iview->planes[0].isl.format)) {
6607 /* This image has the auxiliary buffer enabled. We can mark the
6608 * subresource as not needing a resolve because the clear color
6609 * will match what's in every RENDER_SURFACE_STATE object when
6610 * it's being used for sampling.
6611 */
6612 set_image_fast_clear_state(cmd_buffer, iview->image,
6613 VK_IMAGE_ASPECT_COLOR_BIT,
6614 ANV_FAST_CLEAR_DEFAULT_VALUE);
6615 } else {
6616 set_image_fast_clear_state(cmd_buffer, iview->image,
6617 VK_IMAGE_ASPECT_COLOR_BIT,
6618 ANV_FAST_CLEAR_ANY);
6619 }
6620 }
6621
6622 if (is_multiview) {
6623 u_foreach_bit(view, clear_view_mask) {
6624 anv_image_clear_color(cmd_buffer, iview->image,
6625 VK_IMAGE_ASPECT_COLOR_BIT,
6626 aux_usage,
6627 iview->planes[0].isl.format,
6628 iview->planes[0].isl.swizzle,
6629 iview->vk.base_mip_level,
6630 iview->vk.base_array_layer + view, 1,
6631 render_area, clear_color);
6632 }
6633 } else {
6634 anv_image_clear_color(cmd_buffer, iview->image,
6635 VK_IMAGE_ASPECT_COLOR_BIT,
6636 aux_usage,
6637 iview->planes[0].isl.format,
6638 iview->planes[0].isl.swizzle,
6639 iview->vk.base_mip_level,
6640 base_clear_layer, clear_layer_count,
6641 render_area, clear_color);
6642 }
6643 } else {
6644 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
6645 assert(att->imageLayout == initial_layout);
6646 }
6647
6648 gfx->color_att[i].vk_format = iview->vk.format;
6649 gfx->color_att[i].iview = iview;
6650 gfx->color_att[i].layout = att->imageLayout;
6651 gfx->color_att[i].aux_usage = aux_usage;
6652
6653 struct isl_view isl_view = iview->planes[0].isl;
6654 if (pRenderingInfo->viewMask) {
6655 assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
6656 isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
6657 } else {
6658 assert(isl_view.array_len >= pRenderingInfo->layerCount);
6659 isl_view.array_len = pRenderingInfo->layerCount;
6660 }
6661
6662 anv_image_fill_surface_state(cmd_buffer->device,
6663 iview->image,
6664 VK_IMAGE_ASPECT_COLOR_BIT,
6665 &isl_view,
6666 ISL_SURF_USAGE_RENDER_TARGET_BIT,
6667 aux_usage, &fast_clear_color,
6668 0, /* anv_image_view_state_flags */
6669 &gfx->color_att[i].surface_state,
6670 NULL);
6671
6672 add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state);
6673
6674 if (GFX_VER < 10 &&
6675 (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
6676 (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
6677 iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
6678 iview->planes[0].isl.base_level == 0 &&
6679 iview->planes[0].isl.base_array_layer == 0) {
6680 genX(copy_fast_clear_dwords)(cmd_buffer,
6681 gfx->color_att[i].surface_state.state,
6682 iview->image,
6683 VK_IMAGE_ASPECT_COLOR_BIT,
6684 false /* copy to ss */);
6685 }
6686
6687 if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
6688 gfx->color_att[i].resolve_mode = att->resolveMode;
6689 gfx->color_att[i].resolve_iview =
6690 anv_image_view_from_handle(att->resolveImageView);
6691 gfx->color_att[i].resolve_layout = att->resolveImageLayout;
6692 }
6693 }
6694
6695 const struct anv_image_view *fsr_iview = NULL;
6696 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
6697 vk_find_struct_const(pRenderingInfo->pNext,
6698 RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
6699 if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
6700 fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
6701 /* imageLayout and shadingRateAttachmentTexelSize are ignored */
6702 }
6703
6704 const struct anv_image_view *ds_iview = NULL;
6705 const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
6706 const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
6707 if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
6708 (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
6709 const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
6710 VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6711 VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6712 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6713 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
6714 enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
6715 enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
6716 float depth_clear_value = 0;
6717 uint32_t stencil_clear_value = 0;
6718
6719 if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
6720 d_iview = anv_image_view_from_handle(d_att->imageView);
6721 initial_depth_layout = attachment_initial_layout(d_att);
6722 depth_layout = d_att->imageLayout;
6723 depth_aux_usage =
6724 anv_layout_to_aux_usage(&cmd_buffer->device->info,
6725 d_iview->image,
6726 VK_IMAGE_ASPECT_DEPTH_BIT,
6727 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6728 depth_layout);
6729 depth_clear_value = d_att->clearValue.depthStencil.depth;
6730 }
6731
6732 if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
6733 s_iview = anv_image_view_from_handle(s_att->imageView);
6734 initial_stencil_layout = attachment_initial_layout(s_att);
6735 stencil_layout = s_att->imageLayout;
6736 stencil_aux_usage =
6737 anv_layout_to_aux_usage(&cmd_buffer->device->info,
6738 s_iview->image,
6739 VK_IMAGE_ASPECT_STENCIL_BIT,
6740 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
6741 stencil_layout);
6742 stencil_clear_value = s_att->clearValue.depthStencil.stencil;
6743 }
6744
6745 assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
6746 ds_iview = d_iview != NULL ? d_iview : s_iview;
6747 assert(ds_iview != NULL);
6748
6749 assert(render_area.offset.x + render_area.extent.width <=
6750 ds_iview->vk.extent.width);
6751 assert(render_area.offset.y + render_area.extent.height <=
6752 ds_iview->vk.extent.height);
6753 assert(layers <= ds_iview->vk.layer_count);
6754
6755 fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
6756 fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
6757
6758 assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
6759 gfx->samples |= ds_iview->vk.image->samples;
6760
6761 VkImageAspectFlags clear_aspects = 0;
6762 if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6763 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
6764 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
6765 if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
6766 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
6767 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
6768
6769 if (clear_aspects != 0) {
6770 const bool hiz_clear =
6771 anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
6772 depth_layout, clear_aspects,
6773 depth_clear_value,
6774 render_area);
6775
6776 if (depth_layout != initial_depth_layout) {
6777 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6778 render_area.extent.width == d_iview->vk.extent.width &&
6779 render_area.extent.height == d_iview->vk.extent.height);
6780
6781 if (is_multiview) {
6782 u_foreach_bit(view, gfx->view_mask) {
6783 transition_depth_buffer(cmd_buffer, d_iview->image,
6784 d_iview->vk.base_array_layer + view,
6785 1 /* layer_count */,
6786 initial_depth_layout, depth_layout,
6787 hiz_clear);
6788 }
6789 } else {
6790 transition_depth_buffer(cmd_buffer, d_iview->image,
6791 d_iview->vk.base_array_layer,
6792 gfx->layer_count,
6793 initial_depth_layout, depth_layout,
6794 hiz_clear);
6795 }
6796 }
6797
6798 if (stencil_layout != initial_stencil_layout) {
6799 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
6800 render_area.extent.width == s_iview->vk.extent.width &&
6801 render_area.extent.height == s_iview->vk.extent.height);
6802
6803 if (is_multiview) {
6804 u_foreach_bit(view, gfx->view_mask) {
6805 transition_stencil_buffer(cmd_buffer, s_iview->image,
6806 s_iview->vk.base_mip_level, 1,
6807 s_iview->vk.base_array_layer + view,
6808 1 /* layer_count */,
6809 initial_stencil_layout,
6810 stencil_layout,
6811 hiz_clear);
6812 }
6813 } else {
6814 transition_stencil_buffer(cmd_buffer, s_iview->image,
6815 s_iview->vk.base_mip_level, 1,
6816 s_iview->vk.base_array_layer,
6817 gfx->layer_count,
6818 initial_stencil_layout,
6819 stencil_layout,
6820 hiz_clear);
6821 }
6822 }
6823
6824 if (is_multiview) {
6825 uint32_t clear_view_mask = pRenderingInfo->viewMask;
6826 while (clear_view_mask) {
6827 int view = u_bit_scan(&clear_view_mask);
6828
6829 uint32_t level = ds_iview->vk.base_mip_level;
6830 uint32_t layer = ds_iview->vk.base_array_layer + view;
6831
6832 if (hiz_clear) {
6833 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
6834 clear_aspects,
6835 level, layer, 1,
6836 render_area,
6837 stencil_clear_value);
6838 } else {
6839 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
6840 clear_aspects,
6841 depth_aux_usage,
6842 level, layer, 1,
6843 render_area,
6844 depth_clear_value,
6845 stencil_clear_value);
6846 }
6847 }
6848 } else {
6849 uint32_t level = ds_iview->vk.base_mip_level;
6850 uint32_t base_layer = ds_iview->vk.base_array_layer;
6851 uint32_t layer_count = gfx->layer_count;
6852
6853 if (hiz_clear) {
6854 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
6855 clear_aspects,
6856 level, base_layer, layer_count,
6857 render_area,
6858 stencil_clear_value);
6859 } else {
6860 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
6861 clear_aspects,
6862 depth_aux_usage,
6863 level, base_layer, layer_count,
6864 render_area,
6865 depth_clear_value,
6866 stencil_clear_value);
6867 }
6868 }
6869 } else {
6870 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
6871 assert(depth_layout == initial_depth_layout);
6872 assert(stencil_layout == initial_stencil_layout);
6873 }
6874
6875 if (d_iview != NULL) {
6876 gfx->depth_att.vk_format = d_iview->vk.format;
6877 gfx->depth_att.iview = d_iview;
6878 gfx->depth_att.layout = depth_layout;
6879 gfx->depth_att.aux_usage = depth_aux_usage;
6880 if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
6881 assert(d_att->resolveImageView != VK_NULL_HANDLE);
6882 gfx->depth_att.resolve_mode = d_att->resolveMode;
6883 gfx->depth_att.resolve_iview =
6884 anv_image_view_from_handle(d_att->resolveImageView);
6885 gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
6886 }
6887 }
6888
6889 if (s_iview != NULL) {
6890 gfx->stencil_att.vk_format = s_iview->vk.format;
6891 gfx->stencil_att.iview = s_iview;
6892 gfx->stencil_att.layout = stencil_layout;
6893 gfx->stencil_att.aux_usage = stencil_aux_usage;
6894 if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
6895 assert(s_att->resolveImageView != VK_NULL_HANDLE);
6896 gfx->stencil_att.resolve_mode = s_att->resolveMode;
6897 gfx->stencil_att.resolve_iview =
6898 anv_image_view_from_handle(s_att->resolveImageView);
6899 gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
6900 }
6901 }
6902 }
6903
6904 /* Finally, now that we know the right size, set up the null surface */
6905 assert(util_bitcount(gfx->samples) <= 1);
6906 isl_null_fill_state(&cmd_buffer->device->isl_dev,
6907 gfx->null_surface_state.map,
6908 .size = fb_size);
6909
6910 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
6911 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
6912 continue;
6913
6914 isl_null_fill_state(&cmd_buffer->device->isl_dev,
6915 gfx->color_att[i].surface_state.state.map,
6916 .size = fb_size);
6917 }
6918
6919 /****** We can now start emitting code to begin the render pass ******/
6920
6921 gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
6922
6923 /* Our implementation of VK_KHR_multiview uses instancing to draw the
6924 * different views. If the client asks for instancing, we need to use the
6925 * Instance Data Step Rate to ensure that we repeat the client's
6926 * per-instance data once for each view. Since this bit is in
6927 * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
6928 * of each subpass.
6929 */
6930 if (GFX_VER == 7)
6931 gfx->vb_dirty |= ~0;
6932
6933 /* It is possible to start a render pass with an old pipeline. Because the
6934 * render pass and subpass index are both baked into the pipeline, this is
6935 * highly unlikely. In order to do so, it requires that you have a render
6936 * pass with a single subpass and that you use that render pass twice
6937 * back-to-back and use the same pipeline at the start of the second render
6938 * pass as at the end of the first. In order to avoid unpredictable issues
6939 * with this edge case, we just dirty the pipeline at the start of every
6940 * subpass.
6941 */
6942 gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
6943
6944 #if GFX_VER >= 11
6945 /* The PIPE_CONTROL command description says:
6946 *
6947 * "Whenever a Binding Table Index (BTI) used by a Render Target Message
6948 * points to a different RENDER_SURFACE_STATE, SW must issue a Render
6949 * Target Cache Flush by enabling this bit. When render target flush
6950 * is set due to new association of BTI, PS Scoreboard Stall bit must
6951 * be set in this packet."
6952 */
6953 anv_add_pending_pipe_bits(cmd_buffer,
6954 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
6955 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
6956 "change RT");
6957 #endif
6958
6959 cmd_buffer_emit_depth_stencil(cmd_buffer);
6960
6961 cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
6962 }
6963
6964 static void
6965 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
6966 struct anv_attachment *att,
6967 VkImageAspectFlagBits aspect)
6968 {
6969 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
6970 const struct anv_image_view *iview = att->iview;
6971
6972 if (gfx->view_mask == 0) {
6973 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6974 aspect, att->aux_usage,
6975 iview->planes[0].isl.base_level,
6976 iview->planes[0].isl.base_array_layer,
6977 gfx->layer_count);
6978 } else {
6979 uint32_t res_view_mask = gfx->view_mask;
6980 while (res_view_mask) {
6981 int i = u_bit_scan(&res_view_mask);
6982
6983 const uint32_t level = iview->planes[0].isl.base_level;
6984 const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
6985
6986 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
6987 aspect, att->aux_usage,
6988 level, layer, 1);
6989 }
6990 }
6991 }
6992
6993 static enum blorp_filter
6994 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
6995 {
6996 switch (vk_mode) {
6997 case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
6998 return BLORP_FILTER_SAMPLE_0;
6999 case VK_RESOLVE_MODE_AVERAGE_BIT:
7000 return BLORP_FILTER_AVERAGE;
7001 case VK_RESOLVE_MODE_MIN_BIT:
7002 return BLORP_FILTER_MIN_SAMPLE;
7003 case VK_RESOLVE_MODE_MAX_BIT:
7004 return BLORP_FILTER_MAX_SAMPLE;
7005 default:
7006 return BLORP_FILTER_NONE;
7007 }
7008 }
7009
7010 static void
7011 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
7012 const struct anv_attachment *att,
7013 VkImageLayout layout,
7014 VkImageAspectFlagBits aspect)
7015 {
7016 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
7017 const struct anv_image_view *src_iview = att->iview;
7018 const struct anv_image_view *dst_iview = att->resolve_iview;
7019
7020 enum isl_aux_usage src_aux_usage =
7021 anv_layout_to_aux_usage(&cmd_buffer->device->info,
7022 src_iview->image, aspect,
7023 VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
7024 layout);
7025
7026 enum isl_aux_usage dst_aux_usage =
7027 anv_layout_to_aux_usage(&cmd_buffer->device->info,
7028 dst_iview->image, aspect,
7029 VK_IMAGE_USAGE_TRANSFER_DST_BIT,
7030 att->resolve_layout);
7031
7032 enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
7033
7034 const VkRect2D render_area = gfx->render_area;
7035 if (gfx->view_mask == 0) {
7036 anv_image_msaa_resolve(cmd_buffer,
7037 src_iview->image, src_aux_usage,
7038 src_iview->planes[0].isl.base_level,
7039 src_iview->planes[0].isl.base_array_layer,
7040 dst_iview->image, dst_aux_usage,
7041 dst_iview->planes[0].isl.base_level,
7042 dst_iview->planes[0].isl.base_array_layer,
7043 aspect,
7044 render_area.offset.x, render_area.offset.y,
7045 render_area.offset.x, render_area.offset.y,
7046 render_area.extent.width,
7047 render_area.extent.height,
7048 gfx->layer_count, filter);
7049 } else {
7050 uint32_t res_view_mask = gfx->view_mask;
7051 while (res_view_mask) {
7052 int i = u_bit_scan(&res_view_mask);
7053
7054 anv_image_msaa_resolve(cmd_buffer,
7055 src_iview->image, src_aux_usage,
7056 src_iview->planes[0].isl.base_level,
7057 src_iview->planes[0].isl.base_array_layer + i,
7058 dst_iview->image, dst_aux_usage,
7059 dst_iview->planes[0].isl.base_level,
7060 dst_iview->planes[0].isl.base_array_layer + i,
7061 aspect,
7062 render_area.offset.x, render_area.offset.y,
7063 render_area.offset.x, render_area.offset.y,
7064 render_area.extent.width,
7065 render_area.extent.height,
7066 1, filter);
7067 }
7068 }
7069 }
7070
7071 void genX(CmdEndRendering)(
7072 VkCommandBuffer commandBuffer)
7073 {
7074 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7075 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
7076
7077 if (anv_batch_has_error(&cmd_buffer->batch))
7078 return;
7079
7080 const bool is_multiview = gfx->view_mask != 0;
7081 const uint32_t layers =
7082 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
7083
7084 bool has_color_resolve = false;
7085 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
7086 if (gfx->color_att[i].iview == NULL)
7087 continue;
7088
7089 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
7090 VK_IMAGE_ASPECT_COLOR_BIT);
7091
7092 /* Stash this off for later */
7093 if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
7094 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
7095 has_color_resolve = true;
7096 }
7097
7098 if (gfx->depth_att.iview != NULL) {
7099 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
7100 VK_IMAGE_ASPECT_DEPTH_BIT);
7101 }
7102
7103 if (gfx->stencil_att.iview != NULL) {
7104 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
7105 VK_IMAGE_ASPECT_STENCIL_BIT);
7106 }
7107
7108 if (has_color_resolve) {
7109 /* We are about to do some MSAA resolves. We need to flush so that the
7110 * result of writes to the MSAA color attachments show up in the sampler
7111 * when we blit to the single-sampled resolve target.
7112 */
7113 anv_add_pending_pipe_bits(cmd_buffer,
7114 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
7115 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
7116 "MSAA resolve");
7117 }
7118
7119 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
7120 gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
7121 /* We are about to do some MSAA resolves. We need to flush so that the
7122 * result of writes to the MSAA depth attachments show up in the sampler
7123 * when we blit to the single-sampled resolve target.
7124 */
7125 anv_add_pending_pipe_bits(cmd_buffer,
7126 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
7127 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
7128 "MSAA resolve");
7129 }
7130
7131 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
7132 const struct anv_attachment *att = &gfx->color_att[i];
7133 if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
7134 (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
7135 continue;
7136
7137 cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
7138 VK_IMAGE_ASPECT_COLOR_BIT);
7139 }
7140
7141 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
7142 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
7143 const struct anv_image_view *src_iview = gfx->depth_att.iview;
7144
7145 /* MSAA resolves sample from the source attachment. Transition the
7146 * depth attachment first to get rid of any HiZ that we may not be
7147 * able to handle.
7148 */
7149 transition_depth_buffer(cmd_buffer, src_iview->image,
7150 src_iview->planes[0].isl.base_array_layer,
7151 layers,
7152 gfx->depth_att.layout,
7153 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7154 false /* will_full_fast_clear */);
7155
7156 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
7157 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7158 VK_IMAGE_ASPECT_DEPTH_BIT);
7159
7160 /* Transition the source back to the original layout. This seems a bit
7161 * inefficient but, since HiZ resolves aren't destructive, going from
7162 * less HiZ to more is generally a no-op.
7163 */
7164 transition_depth_buffer(cmd_buffer, src_iview->image,
7165 src_iview->planes[0].isl.base_array_layer,
7166 layers,
7167 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
7168 gfx->depth_att.layout,
7169 false /* will_full_fast_clear */);
7170 }
7171
7172 if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
7173 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
7174 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
7175 gfx->stencil_att.layout,
7176 VK_IMAGE_ASPECT_STENCIL_BIT);
7177 }
7178
7179 #if GFX_VER == 7
7180 /* On gfx7, we have to store a texturable version of the stencil buffer in
7181 * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
7182 * forth at strategic points. Stencil writes are only allowed in the following
7183 * layouts:
7184 *
7185 * - VK_IMAGE_LAYOUT_GENERAL
7186 * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
7187 * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
7188 * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
7189 * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
7190 * - VK_IMAGE_LAYOUT_SUBPASS_SELF_DEPENDENCY_MESA
7191 *
7192 * For general, we have no nice opportunity to transition so we do the copy
7193 * to the shadow unconditionally at the end of the subpass. For transfer
7194 * destinations, we can update it as part of the transfer op. For the other
7195 * layouts, we delay the copy until a transition into some other layout.
7196 */
7197 if (gfx->stencil_att.iview != NULL) {
7198 const struct anv_image_view *iview = gfx->stencil_att.iview;
7199 const struct anv_image *image = iview->image;
7200 const uint32_t plane =
7201 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
7202
7203 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
7204 (gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL ||
7205 gfx->stencil_att.layout == VK_IMAGE_LAYOUT_SUBPASS_SELF_DEPENDENCY_MESA)) {
7206 anv_image_copy_to_shadow(cmd_buffer, image,
7207 VK_IMAGE_ASPECT_STENCIL_BIT,
7208 iview->planes[plane].isl.base_level, 1,
7209 iview->planes[plane].isl.base_array_layer,
7210 layers);
7211 }
7212 }
7213 #endif
7214
7215 trace_intel_end_render_pass(&cmd_buffer->trace,
7216 gfx->render_area.extent.width,
7217 gfx->render_area.extent.height,
7218 gfx->color_att_count,
7219 gfx->samples);
7220
7221 anv_cmd_buffer_reset_rendering(cmd_buffer);
7222 }
7223
7224 void
7225 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
7226 {
7227 #if GFX_VERx10 >= 75
7228 struct mi_builder b;
7229 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7230
7231 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
7232 mi_reg32(ANV_PREDICATE_RESULT_REG));
7233 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
7234
7235 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
7236 mip.LoadOperation = LOAD_LOADINV;
7237 mip.CombineOperation = COMBINE_SET;
7238 mip.CompareOperation = COMPARE_SRCS_EQUAL;
7239 }
7240 #endif
7241 }
7242
7243 #if GFX_VERx10 >= 75
7244 void genX(CmdBeginConditionalRenderingEXT)(
7245 VkCommandBuffer commandBuffer,
7246 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
7247 {
7248 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7249 ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
7250 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
7251 struct anv_address value_address =
7252 anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
7253
7254 const bool isInverted = pConditionalRenderingBegin->flags &
7255 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
7256
7257 cmd_state->conditional_render_enabled = true;
7258
7259 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7260
7261 struct mi_builder b;
7262 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
7263
7264 /* Section 19.4 of the Vulkan 1.1.85 spec says:
7265 *
7266 * If the value of the predicate in buffer memory changes
7267 * while conditional rendering is active, the rendering commands
7268 * may be discarded in an implementation-dependent way.
7269 * Some implementations may latch the value of the predicate
7270 * upon beginning conditional rendering while others
7271 * may read it before every rendering command.
7272 *
7273 * So it's perfectly fine to read a value from the buffer once.
7274 */
7275 struct mi_value value = mi_mem32(value_address);
7276
7277 /* Precompute predicate result, it is necessary to support secondary
7278 * command buffers since it is unknown if conditional rendering is
7279 * inverted when populating them.
7280 */
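   /* With unsigned comparisons, mi_ult(&b, mi_imm(0), value) evaluates to
    * "value != 0" and mi_uge(&b, mi_imm(0), value) to "value == 0", so the
    * result register ends up non-zero exactly when the conditionally
    * rendered commands should execute.
    */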
7281 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
7282 isInverted ? mi_uge(&b, mi_imm(0), value) :
7283 mi_ult(&b, mi_imm(0), value));
7284 }
7285
7286 void genX(CmdEndConditionalRenderingEXT)(
7287 VkCommandBuffer commandBuffer)
7288 {
7289 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7290 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
7291
7292 cmd_state->conditional_render_enabled = false;
7293 }
7294 #endif
7295
7296 /* Set of stage bits which are pipelined, i.e. they get queued
7297 * by the command streamer for later execution.
7298 */
7299 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
7300 ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
7301 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
7302 VK_PIPELINE_STAGE_2_HOST_BIT | \
7303 VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
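/* For example, a srcStageMask of VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT
 * intersects this mask, so CmdSetEvent2/CmdResetEvent2 below also stall the
 * command streamer and the pixel scoreboard before writing the event value.
 */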
7304
7305 void genX(CmdSetEvent2)(
7306 VkCommandBuffer commandBuffer,
7307 VkEvent _event,
7308 const VkDependencyInfo* pDependencyInfo)
7309 {
7310 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
7311 ANV_FROM_HANDLE(anv_event, event, _event);
7312
7313 VkPipelineStageFlags2 src_stages = 0;
7314
7315 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
7316 src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7317 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
7318 src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7319 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
7320 src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7321
7322 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
7323 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
7324
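   /* The event is signaled with a post-sync WriteImmediateData operation:
    * the PIPE_CONTROL writes VK_EVENT_SET into the event's dword in the
    * dynamic state pool.  If any source stage is actually pipelined, a CS
    * stall and pixel scoreboard stall order the write after that work;
    * otherwise the write can happen right away.
    */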
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData = VK_EVENT_SET;
      anv_debug_dump_pc(pc);
   }
}

void genX(CmdResetEvent2)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    VkPipelineStageFlags2                       stageMask)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData = VK_EVENT_RESET;
      anv_debug_dump_pc(pc);
   }
}

void genX(CmdWaitEvents2)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    eventCount,
    const VkEvent*                              pEvents,
    const VkDependencyInfo*                     pDependencyInfos)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

#if GFX_VER >= 8
   for (uint32_t i = 0; i < eventCount; i++) {
      ANV_FROM_HANDLE(anv_event, event, pEvents[i]);

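      /* Have the command streamer poll the event's dword until it reads
       * VK_EVENT_SET (semaphore address data equal to the semaphore data
       * dword), i.e. until the event has been signaled.
       */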
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
         sem.WaitMode = PollingMode;
         sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
         sem.SemaphoreDataDword = VK_EVENT_SET;
         sem.SemaphoreAddress = (struct anv_address) {
            cmd_buffer->device->dynamic_state_pool.block_pool.bo,
            event->state.offset
         };
      }
   }
#else
   anv_finishme("Implement events on gfx7");
#endif

   cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
}

static uint32_t vk_to_intel_index_type(VkIndexType type)
{
   switch (type) {
   case VK_INDEX_TYPE_UINT8_EXT:
      return INDEX_BYTE;
   case VK_INDEX_TYPE_UINT16:
      return INDEX_WORD;
   case VK_INDEX_TYPE_UINT32:
      return INDEX_DWORD;
   default:
      unreachable("invalid index type");
   }
}

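/* The primitive restart index is the all-ones value for the bound index
 * type, matching Vulkan's fixed restart values (0xff, 0xffff, 0xffffffff).
 */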
static uint32_t restart_index_for_type(VkIndexType type)
{
   switch (type) {
   case VK_INDEX_TYPE_UINT8_EXT:
      return UINT8_MAX;
   case VK_INDEX_TYPE_UINT16:
      return UINT16_MAX;
   case VK_INDEX_TYPE_UINT32:
      return UINT32_MAX;
   default:
      unreachable("invalid index type");
   }
}

void genX(CmdBindIndexBuffer)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset,
    VkIndexType                                 indexType)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);

   cmd_buffer->state.gfx.restart_index = restart_index_for_type(indexType);
   cmd_buffer->state.gfx.index_buffer = buffer;
   cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
   cmd_buffer->state.gfx.index_offset = offset;

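   /* Only latch the state here; the dirty bit makes the next draw re-emit
    * 3DSTATE_INDEX_BUFFER with the new buffer, type and offset.
    */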
   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
}

VkResult genX(CmdSetPerformanceOverrideINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   switch (pOverrideInfo->type) {
   case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
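      /* "Null hardware" drops 3D rendering and media instructions at the
       * command streamer.  These are masked registers: the *Mask fields must
       * be set for the corresponding enable bits to take effect.
       */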
#if GFX_VER >= 9
      anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
         csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
         csdm2.MediaInstructionDisable = pOverrideInfo->enable;
         csdm2._3DRenderingInstructionDisableMask = true;
         csdm2.MediaInstructionDisableMask = true;
      }
#else
      anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
         instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
         instpm.MediaInstructionDisable = pOverrideInfo->enable;
         instpm._3DRenderingInstructionDisableMask = true;
         instpm.MediaInstructionDisableMask = true;
      }
#endif
      break;
   }

   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
      if (pOverrideInfo->enable) {
         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_FLUSH_BITS |
                                   ANV_PIPE_INVALIDATE_BITS,
                                   "perf counter isolation");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }
      break;

   default:
      unreachable("Invalid override");
   }

   return VK_SUCCESS;
}

VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
{
   /* TODO: Waiting on the register to write, might depend on generation. */

   return VK_SUCCESS;
}

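/* MMIO offset of the render command streamer's TIMESTAMP register. */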
#define TIMESTAMP 0x2358

void genX(cmd_emit_timestamp)(struct anv_batch *batch,
                              struct anv_device *device,
                              struct anv_address addr,
                              bool end_of_pipe) {
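   /* An end-of-pipe timestamp is written via a PIPE_CONTROL post-sync
    * operation, so it lands after prior work has completed.  Otherwise we
    * snapshot the command streamer's TIMESTAMP register to memory with an
    * MI store, capturing the time at which the command is parsed.
    */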
   if (end_of_pipe) {
      anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = addr;
         anv_debug_dump_pc(pc);
      }
   } else {
      struct mi_builder b;
      mi_builder_init(&b, &device->info, batch);
      mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
   }
}
