/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/gen_rt_pack.h"

#include "common/intel_l3_config.h"
#include "common/intel_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format.h"
#include "vk_log.h"
#include "vk_render_pass.h"

static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
{
   uint8_t bits;
   switch (comp) {
   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
   default: unreachable("Invalid component");
   }

   /*
    * Take into account hardware restrictions when dealing with 64-bit floats.
    *
    * From Broadwell spec, command reference structures, page 586:
    *  "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
    *   64-bit components are stored in the URB without any conversion. In
    *   this case, vertex elements must be written as 128 or 256 bits, with
    *   VFCOMP_STORE_0 being used to pad the output as required. E.g., if
    *   R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
    *   Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
    *   set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
    *   Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
    *   a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
    *   Component 3 to be specified as VFCOMP_STORE_0 in order to output a
    *   256-bit vertex element."
    */
   if (bits) {
      return VFCOMP_STORE_SRC;
   } else if (comp >= 2 &&
              !isl_format_layouts[format].channels.b.bits &&
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* When emitting 64-bit attributes, we need to write either 128 or 256
       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
       * VFCOMP_STORE_0 to pad the written chunk */
      return VFCOMP_NOSTORE;
   } else if (comp < 3 ||
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* Note we need to pad with value 0, not 1, due to hardware restrictions
       * (see comment above) */
      return VFCOMP_STORE_0;
   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
              isl_format_layouts[format].channels.r.type == ISL_SINT) {
      assert(comp == 3);
      return VFCOMP_STORE_1_INT;
   } else {
      assert(comp == 3);
      return VFCOMP_STORE_1_FP;
   }
}

static void
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                  const struct vk_vertex_input_state *vi)
{
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   /* Pull inputs_read out of the VS prog data */
   const uint64_t inputs_read = vs_prog_data->inputs_read;
   const uint64_t double_inputs_read =
      vs_prog_data->double_inputs_read & inputs_read;
   assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
   const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
   const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
                                vs_prog_data->uses_instanceid ||
                                vs_prog_data->uses_firstvertex ||
                                vs_prog_data->uses_baseinstance;

   uint32_t elem_count = __builtin_popcount(elements) -
      __builtin_popcount(elements_double) / 2;

   const uint32_t total_elems =
      MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid);

   uint32_t *p;

   const uint32_t num_dwords = 1 + total_elems * 2;
   p = anv_batch_emitn(&pipeline->base.batch, num_dwords,
                       GENX(3DSTATE_VERTEX_ELEMENTS));
   if (!p)
      return;

   for (uint32_t i = 0; i < total_elems; i++) {
      /* The SKL docs for VERTEX_ELEMENT_STATE say:
       *
       *    "All elements must be valid from Element[0] to the last valid
       *    element. (I.e. if Element[2] is valid then Element[1] and
       *    Element[0] must also be valid)."
       *
       * The SKL docs for 3D_Vertex_Component_Control say:
       *
       *    "Don't store this component. (Not valid for Component 0, but can
       *    be used for Component 1-3)."
       *
       * So we can't just leave a vertex element blank and hope for the best.
       * We have to tell the VF hardware to put something in it; so we just
       * store a bunch of zeros.
       *
       * TODO: Compact vertex elements so we never end up with holes.
       */
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element);
   }

   u_foreach_bit(a, vi->attributes_valid) {
      enum isl_format format = anv_get_isl_format(&pipeline->base.device->info,
                                                  vi->attributes[a].format,
                                                  VK_IMAGE_ASPECT_COLOR_BIT,
                                                  VK_IMAGE_TILING_LINEAR);

      uint32_t binding = vi->attributes[a].binding;
      assert(binding < MAX_VBS);

      if ((elements & (1 << a)) == 0)
         continue; /* Binding unused */

      uint32_t slot =
         __builtin_popcount(elements & ((1 << a) - 1)) -
         DIV_ROUND_UP(__builtin_popcount(elements_double &
                                         ((1 << a) - 1)), 2);

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = vi->attributes[a].binding,
         .Valid = true,
         .SourceElementFormat = format,
         .EdgeFlagEnable = false,
         .SourceElementOffset = vi->attributes[a].offset,
         .Component0Control = vertex_element_comp_control(format, 0),
         .Component1Control = vertex_element_comp_control(format, 1),
         .Component2Control = vertex_element_comp_control(format, 2),
         .Component3Control = vertex_element_comp_control(format, 3),
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);

#if GFX_VER >= 8
      /* On Broadwell and later, we have a separate VF_INSTANCING packet
       * that controls instancing.  On Haswell and prior, that's part of
       * VERTEX_BUFFER_STATE which we emit later.
       */
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         bool per_instance = pipeline->vb[binding].instanced;
         uint32_t divisor = pipeline->vb[binding].instance_divisor *
                            pipeline->instance_multiplier;

         vfi.InstancingEnable = per_instance;
         vfi.VertexElementIndex = slot;
         vfi.InstanceDataStepRate = per_instance ? divisor : 1;
      }
#endif
   }

   const uint32_t id_slot = elem_count;
   if (needs_svgs_elem) {
      /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
       *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
       *    Control field is set to something other than VFCOMP_STORE_SRC,
       *    no higher-numbered Component Control fields may be set to
       *    VFCOMP_STORE_SRC"
       *
       * This means that if we have BaseInstance, we need BaseVertex as
       * well.  Just do all or nothing.
       */
      uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
                            vs_prog_data->uses_baseinstance) ?
                           VFCOMP_STORE_SRC : VFCOMP_STORE_0;

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_SVGS_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
         .Component0Control = base_ctrl,
         .Component1Control = base_ctrl,
#if GFX_VER >= 8
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#else
         .Component2Control = VFCOMP_STORE_VID,
         .Component3Control = VFCOMP_STORE_IID,
#endif
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);

#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = id_slot;
      }
#endif
   }

#if GFX_VER >= 8
   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
      sgvs.VertexIDComponentNumber = 2;
      sgvs.VertexIDElementOffset = id_slot;
      sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid;
      sgvs.InstanceIDComponentNumber = 3;
      sgvs.InstanceIDElementOffset = id_slot;
   }
#endif

   const uint32_t drawid_slot = elem_count + needs_svgs_elem;
   if (vs_prog_data->uses_drawid) {
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32_UINT,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &p[1 + drawid_slot * 2],
                                      &element);

#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = drawid_slot;
      }
#endif
   }
}

void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                     const struct intel_l3_config *l3_config,
                     VkShaderStageFlags active_stages,
                     const unsigned entry_size[4],
                     enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = &device->info;

   unsigned entries[4];
   unsigned start[4];
   bool constrained;
   intel_get_urb_config(devinfo, l3_config,
                        active_stages &
                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                        active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
                        entry_size, entries, start, deref_block_size,
                        &constrained);

#if GFX_VERx10 == 70
   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
    *
    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
    *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
    *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
    *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
    *    needs to be sent before any combination of VS associated 3DSTATE."
    */
   anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) {
      pc.DepthStallEnable = true;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = device->workaround_address;
   }
#endif

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
         urb.VSURBStartingAddress = start[i];
         urb.VSURBEntryAllocationSize = entry_size[i] - 1;
         urb.VSNumberofURBEntries = entries[i];
      }
   }
#if GFX_VERx10 >= 125
   if (device->physical->vk.supported_extensions.NV_mesh_shader) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
   }
#endif
}

#if GFX_VERx10 >= 125
static void
emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
                    enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = &pipeline->base.device->info;

   const struct brw_task_prog_data *task_prog_data =
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ?
      get_task_prog_data(pipeline) : NULL;
   const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);

   const struct intel_mesh_urb_allocation alloc =
      intel_get_mesh_urb_config(devinfo, pipeline->base.l3_config,
                                task_prog_data ? task_prog_data->map.size_dw : 0,
                                mesh_prog_data->map.size_dw);

   /* Zero out the primitive pipeline URB allocations. */
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
      }
   }

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
      if (task_prog_data) {
         urb.TASKURBEntryAllocationSize = alloc.task_entry_size_64b - 1;
         urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
         urb.TASKNumberofURBEntriesSliceN = alloc.task_entries;
         urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb;
         urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb;
      }
   }

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
      urb.MESHURBEntryAllocationSize = alloc.mesh_entry_size_64b - 1;
      urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
      urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
      urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb;
      urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb;
   }

   *deref_block_size = alloc.deref_block_size;
}
#endif

static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
               enum intel_urb_deref_block_size *deref_block_size)
{
#if GFX_VERx10 >= 125
   if (anv_pipeline_is_mesh(pipeline)) {
      emit_urb_setup_mesh(pipeline, deref_block_size);
      return;
   }
#endif

   unsigned entry_size[4];
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      const struct brw_vue_prog_data *prog_data =
         !anv_pipeline_has_stage(pipeline, i) ? NULL :
         (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;

      entry_size[i] = prog_data ?
         prog_data->urb_entry_size : 1;
   }

   genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch,
                        pipeline->base.l3_config,
                        pipeline->active_stages, entry_size,
                        deref_block_size);
}

static void
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe);
#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe);
#endif
#if GFX_VERx10 >= 125
      if (anv_pipeline_is_mesh(pipeline))
         anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_MESH), sbe_mesh);
#endif
      return;
   }

   struct GENX(3DSTATE_SBE) sbe = {
      GENX(3DSTATE_SBE_header),
      /* TODO(mesh): Figure out cases where we need attribute swizzling.  See also
       * calculate_urb_setup() and related functions.
       */
      .AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline),
      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
   };

#if GFX_VER >= 9
   for (unsigned i = 0; i < 32; i++)
      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
#endif

#if GFX_VER >= 8
   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
      GENX(3DSTATE_SBE_SWIZ_header),
   };
#else
#  define swiz sbe
#endif

   if (anv_pipeline_is_primitive(pipeline)) {
      const struct brw_vue_map *fs_input_map =
         &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

      int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
                                                           fs_input_map);
      assert(first_slot % 2 == 0);
      unsigned urb_entry_read_offset = first_slot / 2;
      int max_source_attr = 0;
      for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
         uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
         int input_index = wm_prog_data->urb_setup[attr];

         assert(0 <= input_index);

         /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
          * VUE header
          */
         if (attr == VARYING_SLOT_VIEWPORT ||
             attr == VARYING_SLOT_LAYER ||
             attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
            continue;
         }

         if (attr == VARYING_SLOT_PNTC) {
            sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
            continue;
         }

         const int slot = fs_input_map->varying_to_slot[attr];

         if (slot == -1) {
            /* This attribute does not exist in the VUE--that means that the
             * vertex shader did not write to it.  It could be that it's a
             * regular varying read by the fragment shader but not written by
             * the vertex shader or it's gl_PrimitiveID.  In the first case the
             * value is undefined, in the second it needs to be
             * gl_PrimitiveID.
             */
            swiz.Attribute[input_index].ConstantSource = PRIM_ID;
            swiz.Attribute[input_index].ComponentOverrideX = true;
            swiz.Attribute[input_index].ComponentOverrideY = true;
            swiz.Attribute[input_index].ComponentOverrideZ = true;
            swiz.Attribute[input_index].ComponentOverrideW = true;
            continue;
         }

         /* We have to subtract two slots to account for the URB entry output
          * read offset in the VS and GS stages.
          */
         const int source_attr = slot - 2 * urb_entry_read_offset;
         assert(source_attr >= 0 && source_attr < 32);
         max_source_attr = MAX2(max_source_attr, source_attr);
         /* The hardware can only apply source-attribute overrides to the
          * first 16 attributes; the remaining (up to 16) attributes have to
          * be lined up so that the input index equals the output index.
          * We'll need to do some tweaking to make sure that's the case.
          */
         if (input_index < 16)
            swiz.Attribute[input_index].SourceAttribute = source_attr;
         else
            assert(source_attr == input_index);
      }

      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
#if GFX_VER >= 8
      sbe.ForceVertexURBEntryReadOffset = true;
      sbe.ForceVertexURBEntryReadLength = true;
#endif
   } else {
      assert(anv_pipeline_is_mesh(pipeline));
#if GFX_VERx10 >= 125
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_MESH), sbe_mesh) {
         const struct brw_mue_map *mue = &mesh_prog_data->map;

         assert(mue->per_vertex_header_size_dw % 8 == 0);
         sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
         sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);

         /* Clip distance array is passed in the per-vertex header so that
          * it can be consumed by the HW.  If user wants to read it in the FS,
          * adjust the offset and length to cover it.  Conveniently it is at
          * the end of the per-vertex header, right before per-vertex
          * attributes.
          *
          * Note that FS attribute reading must be aware that the clip
          * distances have fixed position.
          */
         if (mue->per_vertex_header_size_dw > 8 &&
             (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
              wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
            sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
            sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
         }

         assert(mue->per_primitive_header_size_dw % 8 == 0);
         sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8;
         sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);

         /* Just like with clip distances, if Primitive Shading Rate,
          * Viewport Index or Layer is read back in the FS, adjust
          * the offset and length to cover the Primitive Header, where
          * PSR, Viewport Index & Layer are stored.
          */
         if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
             wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
             wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0) {
            assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
            sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
         }
      }
#endif
   }

   uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch,
                                        GENX(3DSTATE_SBE_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe);

#if GFX_VER >= 8
   dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz);
#endif
}

/** Returns the final polygon mode for rasterization
 *
 * This function takes into account polygon mode, primitive topology and the
 * different shader stages which might generate their own type of primitives.
 */
VkPolygonMode
genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
                          VkPrimitiveTopology primitive_topology)
{
   if (anv_pipeline_is_mesh(pipeline)) {
      switch (get_mesh_prog_data(pipeline)->primitive_type) {
      case SHADER_PRIM_POINTS:
         return VK_POLYGON_MODE_POINT;
      case SHADER_PRIM_LINES:
         return VK_POLYGON_MODE_LINE;
      case SHADER_PRIM_TRIANGLES:
         return pipeline->polygon_mode;
      default:
         unreachable("invalid primitive type for mesh");
      }
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      switch (get_gs_prog_data(pipeline)->output_topology) {
      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return pipeline->polygon_mode;
      }
      unreachable("Unsupported GS output topology");
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      switch (get_tes_prog_data(pipeline)->output_topology) {
      case BRW_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

      case BRW_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;

      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return pipeline->polygon_mode;
      }
      unreachable("Unsupported TES output topology");
   } else {
      switch (primitive_topology) {
      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
         return VK_POLYGON_MODE_POINT;

      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
         return VK_POLYGON_MODE_LINE;

      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
         return pipeline->polygon_mode;

      default:
         unreachable("Unsupported primitive topology");
      }
   }
}
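/* Gfx7 only: select the MSRASTMODE_* multisample rasterization mode from the
 * line rasterization mode and sample count.  Gfx8+ instead programs an API
 * mode / MSAA enable pair; see genX(rasterization_mode) further below.
 */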
uint32_t
genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
                            VkPolygonMode raster_mode)
{
#if GFX_VER <= 7
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      switch (pipeline->line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         return MSRASTMODE_ON_PATTERN;

      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
         return MSRASTMODE_OFF_PIXEL;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      return pipeline->rasterization_samples > 1 ?
             MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
   }
#else
   unreachable("Only on gen7");
#endif
}

const uint32_t genX(vk_to_intel_cullmode)[] = {
   [VK_CULL_MODE_NONE] = CULLMODE_NONE,
   [VK_CULL_MODE_FRONT_BIT] = CULLMODE_FRONT,
   [VK_CULL_MODE_BACK_BIT] = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH
};

const uint32_t genX(vk_to_intel_fillmode)[] = {
   [VK_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
   [VK_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT,
};

const uint32_t genX(vk_to_intel_front_face)[] = {
   [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1,
   [VK_FRONT_FACE_CLOCKWISE] = 0
};

void
genX(rasterization_mode)(VkPolygonMode raster_mode,
                         VkLineRasterizationModeEXT line_mode,
                         float line_width,
                         uint32_t *api_mode,
                         bool *msaa_rasterization_enable)
{
#if GFX_VER >= 8
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      /* Unfortunately, configuring our line rasterization hardware on gfx8
       * and later is rather painful.  Instead of giving us bits to tell the
       * hardware what line mode to use like we had on gfx7, we now have an
       * arcane combination of API Mode and MSAA enable bits which do things
       * in a table which are expected to magically put the hardware into the
       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
       * hardware people thought of so nothing works the way you want it to.
       *
       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
       * of the Skylake PRM for more details.
       */
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         *api_mode = DX100;
#if GFX_VER <= 9
         /* Prior to ICL, the algorithm the HW uses to draw wide lines
          * doesn't quite match what the CTS expects, at least for rectangular
          * lines, so we set this to false here, making it draw parallelograms
          * instead, which work well enough.
          */
         *msaa_rasterization_enable = line_width < 1.0078125;
#else
         *msaa_rasterization_enable = true;
#endif
         break;

      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
         *api_mode = DX9OGL;
         *msaa_rasterization_enable = false;
         break;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      *api_mode = DX100;
      *msaa_rasterization_enable = true;
   }
#else
   unreachable("Invalid call");
#endif
}

static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
              const struct vk_input_assembly_state *ia,
              const struct vk_rasterization_state *rs,
              const struct vk_multisample_state *ms,
              const struct vk_render_pass_state *rp,
              enum intel_urb_deref_block_size urb_deref_block_size)
{
   struct GENX(3DSTATE_SF) sf = {
      GENX(3DSTATE_SF_header),
   };

   sf.ViewportTransformEnable = true;
   sf.StatisticsEnable = true;
   sf.VertexSubPixelPrecisionSelect = _8Bit;
   sf.AALineDistanceMode = true;

   switch (rs->provoking_vertex) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      sf.TriangleStripListProvokingVertexSelect = 0;
      sf.LineStripListProvokingVertexSelect = 0;
      sf.TriangleFanProvokingVertexSelect = 1;
      break;

   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      sf.TriangleStripListProvokingVertexSelect = 2;
      sf.LineStripListProvokingVertexSelect = 1;
      sf.TriangleFanProvokingVertexSelect = 2;
      break;

   default:
      unreachable("Invalid provoking vertex mode");
   }

#if GFX_VERx10 == 75
   sf.LineStippleEnable = rs->line.stipple.enable;
#endif

#if GFX_VER >= 12
   sf.DerefBlockSize = urb_deref_block_size;
#endif

   bool point_from_shader;
   if (anv_pipeline_is_primitive(pipeline)) {
      const struct brw_vue_prog_data *last_vue_prog_data =
         anv_pipeline_get_last_vue_prog_data(pipeline);
      point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
   } else {
      assert(anv_pipeline_is_mesh(pipeline));
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
      point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
   }

   if (point_from_shader) {
      sf.PointWidthSource = Vertex;
   } else {
      sf.PointWidthSource = State;
      sf.PointWidth = 1.0;
   }

#if GFX_VER >= 8
   struct GENX(3DSTATE_RASTER) raster = {
      GENX(3DSTATE_RASTER_header),
   };
#else
#  define raster sf
#endif

   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
    * "Multisample Modes State".
    */
#if GFX_VER >= 8
   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
    * computations.  If we ever set this bit to a different value, they will
    * need to be updated accordingly.
    */
   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
   raster.ForceMultisampling = false;
#endif

   raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs->polygon_mode];
   raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs->polygon_mode];
   raster.ScissorRectangleEnable = true;

#if GFX_VER >= 9
   /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */
   raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
   raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
#elif GFX_VER >= 8
   raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
#endif

#if GFX_VER >= 9
   raster.ConservativeRasterizationEnable =
      rs->conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
#endif

#if GFX_VER == 7
   /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it
    * can get the depth offsets correct.
    */
   if (rp != NULL &&
       rp->depth_attachment_format != VK_FORMAT_UNDEFINED) {
      assert(vk_format_has_depth(rp->depth_attachment_format));
      enum isl_format isl_format =
         anv_get_isl_format(&pipeline->base.device->info,
                            rp->depth_attachment_format,
                            VK_IMAGE_ASPECT_DEPTH_BIT,
                            VK_IMAGE_TILING_OPTIMAL);
      sf.DepthBufferSurfaceFormat =
         isl_format_get_depth_format(isl_format, false);
   }
#endif

#if GFX_VER >= 8
   GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf);
   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster);
#else
#  undef raster
   GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf);
#endif
}

static void
emit_ms_state(struct anv_graphics_pipeline *pipeline,
              const struct vk_multisample_state *ms)
{
#if GFX_VER >= 8
   /* On Gfx8+ 3DSTATE_MULTISAMPLE only holds the number of samples. */
   genX(emit_multisample)(&pipeline->base.batch,
                          pipeline->rasterization_samples,
                          NULL);
#endif

   /* From the Vulkan 1.0 spec:
    *    If pSampleMask is NULL, it is treated as if the mask has all bits
    *    enabled, i.e. no coverage is removed from fragments.
    *
    * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
    */
#if GFX_VER >= 8
   uint32_t sample_mask = 0xffff;
#else
   uint32_t sample_mask = 0xff;
#endif

   if (ms != NULL)
      sample_mask &= ms->sample_mask;

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
      sm.SampleMask = sample_mask;
   }
}

const uint32_t genX(vk_to_intel_logic_op)[] = {
   [VK_LOGIC_OP_COPY] = LOGICOP_COPY,
   [VK_LOGIC_OP_CLEAR] = LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND] = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE] = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED] = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP] = LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR] = LOGICOP_XOR,
   [VK_LOGIC_OP_OR] = LOGICOP_OR,
   [VK_LOGIC_OP_NOR] = LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT] = LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT] = LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE] = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED] = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND] = LOGICOP_NAND,
   [VK_LOGIC_OP_SET] = LOGICOP_SET,
};

static const uint32_t vk_to_intel_blend[] = {
   [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR] = BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA] = BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t vk_to_intel_blend_op[] = {
   [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX,
};

const uint32_t genX(vk_to_intel_compare_op)[] = {
   [VK_COMPARE_OP_NEVER] = PREFILTEROP_NEVER,
   [VK_COMPARE_OP_LESS] = PREFILTEROP_LESS,
   [VK_COMPARE_OP_EQUAL] = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_GREATER] = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_ALWAYS,
};

const uint32_t genX(vk_to_intel_stencil_op)[] = {
   [VK_STENCIL_OP_KEEP] = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO] = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE] = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT,
   [VK_STENCIL_OP_INVERT] = STENCILOP_INVERT,
   [VK_STENCIL_OP_INCREMENT_AND_WRAP] = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP] = STENCILOP_DECR,
};

const uint32_t genX(vk_to_intel_primitive_type)[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = _3DPRIM_POINTLIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};

static bool
is_dual_src_blend_factor(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
}

static inline uint32_t *
write_disabled_blend(uint32_t *state)
{
   struct GENX(BLEND_STATE_ENTRY) entry = {
      .WriteDisableAlpha = true,
      .WriteDisableRed = true,
      .WriteDisableGreen = true,
      .WriteDisableBlue = true,
   };
   GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry);
   return state + GENX(BLEND_STATE_ENTRY_length);
}

static void
emit_cb_state(struct anv_graphics_pipeline *pipeline,
              const struct vk_color_blend_state *cb,
              const struct vk_multisample_state *ms)
{
   struct anv_device *device = pipeline->base.device;
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   struct GENX(BLEND_STATE) blend_state = {
#if GFX_VER >= 8
      .AlphaToCoverageEnable = ms && ms->alpha_to_coverage_enable,
      .AlphaToOneEnable = ms && ms->alpha_to_one_enable,
#endif
   };

   uint32_t surface_count = 0;
   struct anv_pipeline_bind_map *map;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
      surface_count = map->surface_count;
   }

   const struct intel_device_info *devinfo = &pipeline->base.device->info;
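   /* The BLEND_STATE header is immediately followed by one BLEND_STATE_ENTRY
    * per color attachment binding; bindings with no corresponding attachment
    * get all channel writes disabled via write_disabled_blend().
    */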
   uint32_t *blend_state_start = devinfo->ver >= 8 ?
      pipeline->gfx8.blend_state : pipeline->gfx7.blend_state;
   uint32_t *state_pos = blend_state_start;

   state_pos += GENX(BLEND_STATE_length);
#if GFX_VER >= 8
   struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
#endif
   for (unsigned i = 0; i < surface_count; i++) {
      struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];

      /* All color attachments are at the beginning of the binding table */
      if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
         break;

      /* We can have at most 8 attachments */
      assert(i < MAX_RTS);

      if (cb == NULL || binding->index >= cb->attachment_count) {
         state_pos = write_disabled_blend(state_pos);
         continue;
      }

      const struct vk_color_blend_attachment_state *a =
         &cb->attachments[binding->index];

      struct GENX(BLEND_STATE_ENTRY) entry = {
#if GFX_VER < 8
         .AlphaToCoverageEnable = ms && ms->alpha_to_coverage_enable,
         .AlphaToOneEnable = ms && ms->alpha_to_one_enable,
#endif
         .LogicOpEnable = cb->logic_op_enable,

         /* Vulkan specification 1.2.168, VkLogicOp:
          *
          *    "Logical operations are controlled by the logicOpEnable and
          *    logicOp members of VkPipelineColorBlendStateCreateInfo. If
          *    logicOpEnable is VK_TRUE, then a logical operation selected by
          *    logicOp is applied between each color attachment and the
          *    fragment’s corresponding output value, and blending of all
          *    attachments is treated as if it were disabled."
          *
          * From the Broadwell PRM Volume 2d: Command Reference: Structures:
          * BLEND_STATE_ENTRY:
          *
          *    "Enabling LogicOp and Color Buffer Blending at the same time is
          *    UNDEFINED"
          */
         .ColorBufferBlendEnable = !cb->logic_op_enable && a->blend_enable,
         .ColorClampRange = COLORCLAMP_RTFORMAT,
         .PreBlendColorClampEnable = true,
         .PostBlendColorClampEnable = true,
         .SourceBlendFactor = vk_to_intel_blend[a->src_color_blend_factor],
         .DestinationBlendFactor = vk_to_intel_blend[a->dst_color_blend_factor],
         .ColorBlendFunction = vk_to_intel_blend_op[a->color_blend_op],
         .SourceAlphaBlendFactor = vk_to_intel_blend[a->src_alpha_blend_factor],
         .DestinationAlphaBlendFactor = vk_to_intel_blend[a->dst_alpha_blend_factor],
         .AlphaBlendFunction = vk_to_intel_blend_op[a->alpha_blend_op],
      };

      if (a->src_color_blend_factor != a->src_alpha_blend_factor ||
          a->dst_color_blend_factor != a->dst_alpha_blend_factor ||
          a->color_blend_op != a->alpha_blend_op) {
#if GFX_VER >= 8
         blend_state.IndependentAlphaBlendEnable = true;
#else
         entry.IndependentAlphaBlendEnable = true;
#endif
      }

      /* The Dual Source Blending documentation says:
       *
       * "If SRC1 is included in a src/dst blend factor and
       * a DualSource RT Write message is not used, results
       * are UNDEFINED. (This reflects the same restriction in DX APIs,
       * where undefined results are produced if “o1” is not written
       * by a PS – there are no default values defined)."
       *
       * There is no way to gracefully fix this undefined situation
       * so we just disable the blending to prevent possible issues.
       */
      if (!wm_prog_data->dual_src_blend &&
          (is_dual_src_blend_factor(a->src_color_blend_factor) ||
           is_dual_src_blend_factor(a->dst_color_blend_factor) ||
           is_dual_src_blend_factor(a->src_alpha_blend_factor) ||
           is_dual_src_blend_factor(a->dst_alpha_blend_factor))) {
         vk_logw(VK_LOG_OBJS(&device->vk.base),
                 "Enabled dual-src blend factors without writing both targets "
                 "in the shader. Disabling blending to avoid GPU hangs.");
         entry.ColorBufferBlendEnable = false;
      }

      /* Our hardware applies the blend factor prior to the blend function
       * regardless of what function is used.  Technically, this means the
       * hardware can do MORE than GL or Vulkan specify.  However, it also
       * means that, for MIN and MAX, we have to stomp the blend factor to
       * ONE to make it a no-op.
       */
      if (a->color_blend_op == VK_BLEND_OP_MIN ||
          a->color_blend_op == VK_BLEND_OP_MAX) {
         entry.SourceBlendFactor = BLENDFACTOR_ONE;
         entry.DestinationBlendFactor = BLENDFACTOR_ONE;
      }
      if (a->alpha_blend_op == VK_BLEND_OP_MIN ||
          a->alpha_blend_op == VK_BLEND_OP_MAX) {
         entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
         entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
      }
      GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
      state_pos += GENX(BLEND_STATE_ENTRY_length);
#if GFX_VER >= 8
      if (i == 0)
         bs0 = entry;
#endif
   }

#if GFX_VER >= 8
   struct GENX(3DSTATE_PS_BLEND) blend = {
      GENX(3DSTATE_PS_BLEND_header),
   };
   blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable;
   blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable;
   blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor;
   blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor;
   blend.SourceBlendFactor = bs0.SourceBlendFactor;
   blend.DestinationBlendFactor = bs0.DestinationBlendFactor;
   blend.AlphaTestEnable = false;
   blend.IndependentAlphaBlendEnable = blend_state.IndependentAlphaBlendEnable;

   GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend);
#endif

   GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state);
}

static void
emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
                  const struct vk_input_assembly_state *ia,
                  const struct vk_viewport_state *vp,
                  const struct vk_rasterization_state *rs)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   (void) wm_prog_data;

   struct GENX(3DSTATE_CLIP) clip = {
      GENX(3DSTATE_CLIP_header),
   };

   clip.ClipEnable = true;
   clip.StatisticsEnable = true;
   clip.EarlyCullEnable = true;
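   /* VK_EXT_depth_clip_control: negative_one_to_one selects the OpenGL-style
    * [-1, 1] Z clip volume, otherwise the default Vulkan [0, 1] volume.
    */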
   clip.APIMode = pipeline->negative_one_to_one ? APIMODE_OGL : APIMODE_D3D;
   clip.GuardbandClipTestEnable = true;

#if GFX_VER >= 8
   clip.VertexSubPixelPrecisionSelect = _8Bit;
#endif
   clip.ClipMode = CLIPMODE_NORMAL;

   switch (rs->provoking_vertex) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      clip.TriangleStripListProvokingVertexSelect = 0;
      clip.LineStripListProvokingVertexSelect = 0;
      clip.TriangleFanProvokingVertexSelect = 1;
      break;

   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      clip.TriangleStripListProvokingVertexSelect = 2;
      clip.LineStripListProvokingVertexSelect = 1;
      clip.TriangleFanProvokingVertexSelect = 2;
      break;

   default:
      unreachable("Invalid provoking vertex mode");
   }

   clip.MinimumPointWidth = 0.125;
   clip.MaximumPointWidth = 255.875;

   /* TODO(mesh): Multiview. */
   if (anv_pipeline_is_primitive(pipeline)) {
      const struct brw_vue_prog_data *last =
         anv_pipeline_get_last_vue_prog_data(pipeline);

      /* From the Vulkan 1.0.45 spec:
       *
       *    "If the last active vertex processing stage shader entry point's
       *    interface does not include a variable decorated with
       *    ViewportIndex, then the first viewport is used."
       */
      if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
         clip.MaximumVPIndex = vp->viewport_count > 0 ?
                               vp->viewport_count - 1 : 0;
      } else {
         clip.MaximumVPIndex = 0;
      }

      /* From the Vulkan 1.0.45 spec:
       *
       *    "If the last active vertex processing stage shader entry point's
       *    interface does not include a variable decorated with Layer, then
       *    the first layer is used."
       */
      clip.ForceZeroRTAIndexEnable =
         !(last->vue_map.slots_valid & VARYING_BIT_LAYER);

#if GFX_VER == 7
      clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;
      clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;
#endif
   } else if (anv_pipeline_is_mesh(pipeline)) {
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
      if (vp && vp->viewport_count > 0 &&
          mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
         clip.MaximumVPIndex = vp->viewport_count - 1;
      }
   }

#if GFX_VER == 7
   clip.FrontWinding = genX(vk_to_intel_front_face)[rs->front_face];
   clip.CullMode = genX(vk_to_intel_cullmode)[rs->cull_mode];
   clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;
#else
   clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
      wm_prog_data->uses_nonperspective_interp_modes : 0;
#endif

   GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip);

#if GFX_VERx10 >= 125
   if (anv_pipeline_is_mesh(pipeline)) {
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_CLIP_MESH), clip_mesh) {
         clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
         clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
         clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
      }
   }
#endif
}

static void
emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                       const struct vk_rasterization_state *rs)
{
   const struct brw_vue_prog_data *prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);
   const struct brw_vue_map *vue_map = &prog_data->vue_map;

   nir_xfb_info *xfb_info;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
      xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
   else
      xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;

   if (xfb_info) {
      struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
      int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
      int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};

      memset(so_decl, 0, sizeof(so_decl));

      for (unsigned i = 0; i < xfb_info->output_count; i++) {
         const nir_xfb_output_info *output = &xfb_info->outputs[i];
         unsigned buffer = output->buffer;
         unsigned stream = xfb_info->buffer_to_stream[buffer];

         /* Our hardware is unusual in that it requires us to program SO_DECLs
          * for fake "hole" components, rather than simply taking the offset
          * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
          * program as many size = 4 holes as we can, then a final hole to
          * accommodate the final 1, 2, or 3 remaining.
          */
         int hole_dwords = (output->offset - next_offset[buffer]) / 4;
         while (hole_dwords > 0) {
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .HoleFlag = 1,
               .OutputBufferSlot = buffer,
               .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
            };
            hole_dwords -= 4;
         }

         int varying = output->location;
         uint8_t component_mask = output->component_mask;
         /* VARYING_SLOT_PSIZ contains four scalar fields packed together:
          * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x
          * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y
          * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
          * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w
          */
         if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 0; // SO_DECL_COMPMASK_X
         } else if (varying == VARYING_SLOT_LAYER) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
         } else if (varying == VARYING_SLOT_VIEWPORT) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
         } else if (varying == VARYING_SLOT_PSIZ) {
            component_mask = 1 << 3; // SO_DECL_COMPMASK_W
         }

         next_offset[buffer] = output->offset +
                               __builtin_popcount(component_mask) * 4;

         const int slot = vue_map->varying_to_slot[varying];
         if (slot < 0) {
            /* This can happen if the shader never writes to the varying.
             * Insert a hole instead of actual varying data.
             */
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .HoleFlag = true,
               .OutputBufferSlot = buffer,
               .ComponentMask = component_mask,
            };
         } else {
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .OutputBufferSlot = buffer,
               .RegisterIndex = slot,
               .ComponentMask = component_mask,
            };
         }
      }

      int max_decls = 0;
      for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
         max_decls = MAX2(max_decls, decls[s]);

      uint8_t sbs[MAX_XFB_STREAMS] = { };
      for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
         if (xfb_info->buffers_written & (1 << b))
            sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
      }

      /* Wa_16011773973:
       * If SOL is enabled and SO_DECL state has to be programmed,
       *    1. Send 3D State SOL state with SOL disabled
       *    2. Send SO_DECL NP state
       *    3. Send 3D State SOL with SOL Enabled
       */
      if (intel_device_info_is_dg2(&pipeline->base.device->info))
         anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), so);

      uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls,
                                     GENX(3DSTATE_SO_DECL_LIST),
                                     .StreamtoBufferSelects0 = sbs[0],
                                     .StreamtoBufferSelects1 = sbs[1],
                                     .StreamtoBufferSelects2 = sbs[2],
                                     .StreamtoBufferSelects3 = sbs[3],
                                     .NumEntries0 = decls[0],
                                     .NumEntries1 = decls[1],
                                     .NumEntries2 = decls[2],
                                     .NumEntries3 = decls[3]);

      for (int i = 0; i < max_decls; i++) {
         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
            &(struct GENX(SO_DECL_ENTRY)) {
               .Stream0Decl = so_decl[0][i],
               .Stream1Decl = so_decl[1][i],
               .Stream2Decl = so_decl[2][i],
               .Stream3Decl = so_decl[3][i],
            });
      }
   }

#if GFX_VER == 7
#  define streamout_state_dw pipeline->gfx7.streamout_state
#else
#  define streamout_state_dw pipeline->gfx8.streamout_state
#endif

   struct GENX(3DSTATE_STREAMOUT) so = {
      GENX(3DSTATE_STREAMOUT_header),
   };

   if (xfb_info) {
      so.SOFunctionEnable = true;
      so.SOStatisticsEnable = true;

      switch (rs->provoking_vertex) {
      case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
         so.ReorderMode = LEADING;
         break;

      case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
         so.ReorderMode = TRAILING;
         break;

      default:
         unreachable("Invalid provoking vertex mode");
      }

      so.RenderStreamSelect = rs->rasterization_stream;

#if GFX_VER >= 8
      so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
      so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
      so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
      so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
#else
      pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride;
      pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride;
      pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride;
      pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride;

      /* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which
       * is a bit inconvenient because we don't know what buffers will
       * actually be enabled until draw time.  We do our best here by
       * setting them based on buffers_written and we disable them
       * as-needed at draw time by setting EndAddress = BaseAddress.
       */
      so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0);
      so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1);
      so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2);
      so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3);
#endif

      int urb_entry_read_offset = 0;
      int urb_entry_read_length =
         (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      so.Stream0VertexReadOffset = urb_entry_read_offset;
      so.Stream0VertexReadLength = urb_entry_read_length - 1;
      so.Stream1VertexReadOffset = urb_entry_read_offset;
      so.Stream1VertexReadLength = urb_entry_read_length - 1;
      so.Stream2VertexReadOffset = urb_entry_read_offset;
      so.Stream2VertexReadLength = urb_entry_read_length - 1;
      so.Stream3VertexReadOffset = urb_entry_read_offset;
      so.Stream3VertexReadLength = urb_entry_read_length - 1;
   }

   GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so);
}

static uint32_t
get_sampler_count(const struct anv_shader_bin *bin)
{
   uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);

   /* We can potentially have way more than 32 samplers and that's ok.
    * However, the 3DSTATE_XS packets only have 3 bits to specify how
    * many to pre-fetch and all values above 4 are marked reserved.
    */
   return MIN2(count_by_4, 4);
}

static UNUSED struct anv_address
get_scratch_address(struct anv_pipeline *pipeline,
                    gl_shader_stage stage,
                    const struct anv_shader_bin *bin)
{
   return (struct anv_address) {
      .bo = anv_scratch_pool_alloc(pipeline->device,
                                   &pipeline->device->scratch_pool,
                                   stage, bin->prog_data->total_scratch),
      .offset = 0,
   };
}

static UNUSED uint32_t
get_scratch_space(const struct anv_shader_bin *bin)
{
   return ffs(bin->prog_data->total_scratch / 2048);
}

static UNUSED uint32_t
get_scratch_surf(struct anv_pipeline *pipeline,
                 gl_shader_stage stage,
                 const struct anv_shader_bin *bin)
{
   if (bin->prog_data->total_scratch == 0)
      return 0;

   struct anv_bo *bo =
      anv_scratch_pool_alloc(pipeline->device,
                             &pipeline->device->scratch_pool,
                             stage, bin->prog_data->total_scratch);
   anv_reloc_list_add_bo(pipeline->batch.relocs,
                         pipeline->batch.alloc, bo);
   return anv_scratch_pool_get_surf(pipeline->device,
                                    &pipeline->device->scratch_pool,
                                    bin->prog_data->total_scratch) >> 4;
}

static void
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
{
   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const struct anv_shader_bin *vs_bin =
      pipeline->shaders[MESA_SHADER_VERTEX];

   assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) {
      vs.Enable = true;
      vs.StatisticsEnable = true;
      vs.KernelStartPointer = vs_bin->kernel.offset;
#if GFX_VER >= 8
      vs.SIMD8DispatchEnable =
         vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
#endif

      assert(!vs_prog_data->base.base.use_alt_mode);
#if GFX_VER < 11
      vs.SingleVertexDispatch = false;
#endif
      vs.VectorMaskEnable = false;
      /* Wa_1606682166:
       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
       * Disable the Sampler state prefetch functionality in the SARB by
       * programming 0xB000[30] to '1'.
       */
      vs.SamplerCount = GFX_VER == 11 ?
                        0 : get_sampler_count(vs_bin);
      vs.BindingTableEntryCount = vs_bin->bind_map.surface_count;
      vs.FloatingPointMode = IEEE754;
      vs.IllegalOpcodeExceptionEnable = false;
      vs.SoftwareExceptionEnable = false;
      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;

      if (GFX_VER == 9 && devinfo->gt == 4 &&
          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
         /* On Sky Lake GT4, we have experienced some hangs related to the VS
          * cache and tessellation.  It is unknown exactly what is happening
          * but the Haswell docs for the "VS Reference Count Full Force Miss
          * Enable" field of the "Thread Mode" register refer to a HSW bug in
          * which the VUE handle reference count would overflow resulting in
          * internal reference counting bugs.  My (Jason's) best guess is that
          * this bug cropped back up on SKL GT4 when we suddenly had more
          * threads in play than any previous gfx9 hardware.
          *
          * What we do know for sure is that setting this bit when
          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
          * Disabling the vertex cache with tessellation shaders should only
          * have a minor performance impact as the tessellation shaders are
          * likely generating and processing far more geometry than the vertex
          * stage.
          */
         vs.VertexCacheDisable = true;
      }

      vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
      vs.VertexURBEntryReadOffset = 0;
      vs.DispatchGRFStartRegisterForURBData =
         vs_prog_data->base.base.dispatch_grf_start_reg;

#if GFX_VER >= 8
      vs.UserClipDistanceClipTestEnableBitmask =
         vs_prog_data->base.clip_distance_mask;
      vs.UserClipDistanceCullTestEnableBitmask =
         vs_prog_data->base.cull_distance_mask;
#endif

#if GFX_VERx10 >= 125
      vs.ScratchSpaceBuffer =
         get_scratch_surf(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
#else
      vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
      vs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
#endif
   }
}

static void
emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
                      const struct vk_tessellation_state *ts)
{
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs);
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te);
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds);
      return;
   }

   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   const struct anv_shader_bin *tcs_bin =
      pipeline->shaders[MESA_SHADER_TESS_CTRL];
   const struct anv_shader_bin *tes_bin =
      pipeline->shaders[MESA_SHADER_TESS_EVAL];

   const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
   const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) {
      hs.Enable = true;
      hs.StatisticsEnable = true;
      hs.KernelStartPointer = tcs_bin->kernel.offset;
      /* Wa_1606682166 */
      hs.SamplerCount = GFX_VER == 11 ?
0 : get_sampler_count(tcs_bin); 1621 hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count; 1622 1623#if GFX_VER >= 12 1624 /* Wa_1604578095: 1625 * 1626 * Hang occurs when the number of max threads is less than 2 times 1627 * the number of instance count. The number of max threads must be 1628 * more than 2 times the number of instance count. 1629 */ 1630 assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances); 1631#endif 1632 1633 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; 1634 hs.IncludeVertexHandles = true; 1635 hs.InstanceCount = tcs_prog_data->instances - 1; 1636 1637 hs.VertexURBEntryReadLength = 0; 1638 hs.VertexURBEntryReadOffset = 0; 1639 hs.DispatchGRFStartRegisterForURBData = 1640 tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f; 1641#if GFX_VER >= 12 1642 hs.DispatchGRFStartRegisterForURBData5 = 1643 tcs_prog_data->base.base.dispatch_grf_start_reg >> 5; 1644#endif 1645 1646#if GFX_VERx10 >= 125 1647 hs.ScratchSpaceBuffer = 1648 get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin); 1649#else 1650 hs.PerThreadScratchSpace = get_scratch_space(tcs_bin); 1651 hs.ScratchSpaceBasePointer = 1652 get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin); 1653#endif 1654 1655#if GFX_VER == 12 1656 /* Patch Count threshold specifies the maximum number of patches that 1657 * will be accumulated before a thread dispatch is forced. 1658 */ 1659 hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold; 1660#endif 1661 1662#if GFX_VER >= 9 1663 hs.DispatchMode = tcs_prog_data->base.dispatch_mode; 1664 hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id; 1665#endif 1666 } 1667 1668 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) { 1669 te.Partitioning = tes_prog_data->partitioning; 1670 1671 if (ts->domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) { 1672 te.OutputTopology = tes_prog_data->output_topology; 1673 } else { 1674 /* When the origin is upper-left, we have to flip the winding order */ 1675 if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) { 1676 te.OutputTopology = OUTPUT_TRI_CW; 1677 } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) { 1678 te.OutputTopology = OUTPUT_TRI_CCW; 1679 } else { 1680 te.OutputTopology = tes_prog_data->output_topology; 1681 } 1682 } 1683 1684 te.TEDomain = tes_prog_data->domain; 1685 te.TEEnable = true; 1686 te.MaximumTessellationFactorOdd = 63.0; 1687 te.MaximumTessellationFactorNotOdd = 64.0; 1688#if GFX_VERx10 >= 125 1689 te.TessellationDistributionMode = TEDMODE_RR_FREE; 1690 te.TessellationDistributionLevel = TEDLEVEL_PATCH; 1691 /* 64_TRIANGLES */ 1692 te.SmallPatchThreshold = 3; 1693 /* 1K_TRIANGLES */ 1694 te.TargetBlockSize = 8; 1695 /* 1K_TRIANGLES */ 1696 te.LocalBOPAccumulatorThreshold = 1; 1697#endif 1698 } 1699 1700 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) { 1701 ds.Enable = true; 1702 ds.StatisticsEnable = true; 1703 ds.KernelStartPointer = tes_bin->kernel.offset; 1704 /* Wa_1606682166 */ 1705 ds.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(tes_bin); 1706 ds.BindingTableEntryCount = tes_bin->bind_map.surface_count; 1707 ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; 1708 1709 ds.ComputeWCoordinateEnable = 1710 tes_prog_data->domain == BRW_TESS_DOMAIN_TRI; 1711 1712 ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length; 1713 ds.PatchURBEntryReadOffset = 0; 1714 ds.DispatchGRFStartRegisterForURBData = 1715 tes_prog_data->base.base.dispatch_grf_start_reg; 1716 1717#if GFX_VER >= 8 1718#if GFX_VER < 11 1719 ds.DispatchMode = 1720 tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ? 1721 DISPATCH_MODE_SIMD8_SINGLE_PATCH : 1722 DISPATCH_MODE_SIMD4X2; 1723#else 1724 assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8); 1725 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; 1726#endif 1727 1728 ds.UserClipDistanceClipTestEnableBitmask = 1729 tes_prog_data->base.clip_distance_mask; 1730 ds.UserClipDistanceCullTestEnableBitmask = 1731 tes_prog_data->base.cull_distance_mask; 1732#endif 1733 1734#if GFX_VER >= 12 1735 ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id; 1736#endif 1737#if GFX_VERx10 >= 125 1738 ds.ScratchSpaceBuffer = 1739 get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin); 1740#else 1741 ds.PerThreadScratchSpace = get_scratch_space(tes_bin); 1742 ds.ScratchSpaceBasePointer = 1743 get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin); 1744#endif 1745 } 1746} 1747 1748static void 1749emit_3dstate_gs(struct anv_graphics_pipeline *pipeline) 1750{ 1751 const struct intel_device_info *devinfo = &pipeline->base.device->info; 1752 const struct anv_shader_bin *gs_bin = 1753 pipeline->shaders[MESA_SHADER_GEOMETRY]; 1754 1755 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) { 1756 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs); 1757 return; 1758 } 1759 1760 const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline); 1761 1762 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) { 1763 gs.Enable = true; 1764 gs.StatisticsEnable = true; 1765 gs.KernelStartPointer = gs_bin->kernel.offset; 1766 gs.DispatchMode = gs_prog_data->base.dispatch_mode; 1767 1768 gs.SingleProgramFlow = false; 1769 gs.VectorMaskEnable = false; 1770 /* Wa_1606682166 */ 1771 gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin); 1772 gs.BindingTableEntryCount = gs_bin->bind_map.surface_count; 1773 gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles; 1774 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; 1775 1776 if (GFX_VER == 8) { 1777 /* Broadwell is weird. It needs us to divide by 2. */ 1778 gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1; 1779 } else { 1780 gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1; 1781 } 1782 1783 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; 1784 gs.OutputTopology = gs_prog_data->output_topology; 1785 gs.ControlDataFormat = gs_prog_data->control_data_format; 1786 gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords; 1787 gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1; 1788 gs.ReorderMode = TRAILING; 1789 1790#if GFX_VER >= 8 1791 gs.ExpectedVertexCount = gs_prog_data->vertices_in; 1792 gs.StaticOutput = gs_prog_data->static_vertex_count >= 0; 1793 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ? 
1794 gs_prog_data->static_vertex_count : 0; 1795#endif 1796 1797 gs.VertexURBEntryReadOffset = 0; 1798 gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length; 1799 gs.DispatchGRFStartRegisterForURBData = 1800 gs_prog_data->base.base.dispatch_grf_start_reg; 1801 1802#if GFX_VER >= 8 1803 gs.UserClipDistanceClipTestEnableBitmask = 1804 gs_prog_data->base.clip_distance_mask; 1805 gs.UserClipDistanceCullTestEnableBitmask = 1806 gs_prog_data->base.cull_distance_mask; 1807#endif 1808 1809#if GFX_VERx10 >= 125 1810 gs.ScratchSpaceBuffer = 1811 get_scratch_surf(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin); 1812#else 1813 gs.PerThreadScratchSpace = get_scratch_space(gs_bin); 1814 gs.ScratchSpaceBasePointer = 1815 get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin); 1816#endif 1817 } 1818} 1819 1820static void 1821emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, 1822 const struct vk_input_assembly_state *ia, 1823 const struct vk_rasterization_state *rs, 1824 const struct vk_multisample_state *ms, 1825 const struct vk_color_blend_state *cb, 1826 const struct vk_render_pass_state *rp) 1827{ 1828 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); 1829 1830 struct GENX(3DSTATE_WM) wm = { 1831 GENX(3DSTATE_WM_header), 1832 }; 1833 wm.StatisticsEnable = true; 1834 wm.LineEndCapAntialiasingRegionWidth = _05pixels; 1835 wm.LineAntialiasingRegionWidth = _10pixels; 1836 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT; 1837 1838 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { 1839 if (wm_prog_data->early_fragment_tests) { 1840 wm.EarlyDepthStencilControl = EDSC_PREPS; 1841 } else if (wm_prog_data->has_side_effects) { 1842 wm.EarlyDepthStencilControl = EDSC_PSEXEC; 1843 } else { 1844 wm.EarlyDepthStencilControl = EDSC_NORMAL; 1845 } 1846 1847#if GFX_VER >= 8 1848 /* Gen8 hardware tries to compute ThreadDispatchEnable for us but 1849 * doesn't take into account KillPixels when no depth or stencil 1850 * writes are enabled. In order for occlusion queries to work 1851 * correctly with no attachments, we need to force-enable PS thread 1852 * dispatch. 1853 * 1854 * The BDW docs are pretty clear that this bit isn't validated 1855 * and probably shouldn't be used in production: 1856 * 1857 * "This must always be set to Normal. This field should not be 1858 * tested for functional validation." 1859 * 1860 * Unfortunately, however, the other mechanism we have for doing this 1861 * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW. 1862 * Given two bad options, we choose the one which works. 1863 */ 1864 pipeline->force_fragment_thread_dispatch = 1865 wm_prog_data->has_side_effects || 1866 wm_prog_data->uses_kill; 1867#endif 1868 1869 wm.BarycentricInterpolationMode = 1870 wm_prog_data->barycentric_interp_modes; 1871 1872#if GFX_VER < 8 1873 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; 1874 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; 1875 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; 1876 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; 1877 1878 /* If the subpass has a depth or stencil self-dependency, then we 1879 * need to force the hardware to do the depth/stencil write *after* 1880 * fragment shader execution. Otherwise, the writes may hit memory 1881 * before we get around to fetching from the input attachment and we 1882 * may get the depth or stencil value from the current draw rather 1883 * than the previous one.
1884 */ 1885 wm.PixelShaderKillsPixel = rp->depth_self_dependency || 1886 rp->stencil_self_dependency || 1887 wm_prog_data->uses_kill; 1888 1889 pipeline->force_fragment_thread_dispatch = 1890 wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF || 1891 wm_prog_data->has_side_effects || 1892 wm.PixelShaderKillsPixel; 1893 1894 if (ms != NULL && ms->rasterization_samples > 1) { 1895 if (wm_prog_data->persample_dispatch) { 1896 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; 1897 } else { 1898 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL; 1899 } 1900 } else { 1901 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; 1902 } 1903#endif 1904 1905 wm.LineStippleEnable = rs->line.stipple.enable; 1906 } 1907 1908 const struct intel_device_info *devinfo = &pipeline->base.device->info; 1909 uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm; 1910 GENX(3DSTATE_WM_pack)(NULL, dws, &wm); 1911} 1912 1913static void 1914emit_3dstate_ps(struct anv_graphics_pipeline *pipeline, 1915 const struct vk_multisample_state *ms, 1916 const struct vk_color_blend_state *cb) 1917{ 1918 UNUSED const struct intel_device_info *devinfo = 1919 &pipeline->base.device->info; 1920 const struct anv_shader_bin *fs_bin = 1921 pipeline->shaders[MESA_SHADER_FRAGMENT]; 1922 1923 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { 1924 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) { 1925#if GFX_VER == 7 1926 /* Even if no fragments are ever dispatched, gfx7 hardware hangs if 1927 * we don't at least set the maximum number of threads. 1928 */ 1929 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; 1930#endif 1931 } 1932 return; 1933 } 1934 1935 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); 1936 1937#if GFX_VER < 8 1938 /* The hardware wedges if you have this bit set but don't turn on any dual 1939 * source blend factors. 1940 */ 1941 bool dual_src_blend = false; 1942 if (wm_prog_data->dual_src_blend && cb) { 1943 for (uint32_t i = 0; i < cb->attachment_count; i++) { 1944 const struct vk_color_blend_attachment_state *a = 1945 &cb->attachments[i]; 1946 1947 if (a->blend_enable && 1948 (is_dual_src_blend_factor(a->src_color_blend_factor) || 1949 is_dual_src_blend_factor(a->dst_color_blend_factor) || 1950 is_dual_src_blend_factor(a->src_alpha_blend_factor) || 1951 is_dual_src_blend_factor(a->dst_alpha_blend_factor))) { 1952 dual_src_blend = true; 1953 break; 1954 } 1955 } 1956 } 1957#endif 1958 1959 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) { 1960 ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; 1961 ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; 1962 ps._32PixelDispatchEnable = wm_prog_data->dispatch_32; 1963 1964 /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable: 1965 * 1966 * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32 1967 * Dispatch must not be enabled for PER_PIXEL dispatch mode." 1968 * 1969 * Since 16x MSAA is first introduced on SKL, we don't need to apply 1970 * the workaround on any older hardware. 
1971 */ 1972 if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch && 1973 ms != NULL && ms->rasterization_samples == 16) { 1974 assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable); 1975 ps._32PixelDispatchEnable = false; 1976 } 1977 1978 ps.KernelStartPointer0 = fs_bin->kernel.offset + 1979 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); 1980 ps.KernelStartPointer1 = fs_bin->kernel.offset + 1981 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); 1982 ps.KernelStartPointer2 = fs_bin->kernel.offset + 1983 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); 1984 1985 ps.SingleProgramFlow = false; 1986 ps.VectorMaskEnable = GFX_VER >= 8 && 1987 wm_prog_data->uses_vmask; 1988 /* Wa_1606682166 */ 1989 ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin); 1990 ps.BindingTableEntryCount = fs_bin->bind_map.surface_count; 1991 ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 || 1992 wm_prog_data->base.ubo_ranges[0].length; 1993 ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ? 1994 POSOFFSET_SAMPLE: POSOFFSET_NONE; 1995#if GFX_VER < 8 1996 ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0; 1997 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; 1998 ps.DualSourceBlendEnable = dual_src_blend; 1999#endif 2000 2001#if GFX_VERx10 == 75 2002 /* Haswell requires the sample mask to be set in this packet as well 2003 * as in 3DSTATE_SAMPLE_MASK; the values should match. 2004 */ 2005 ps.SampleMask = 0xff; 2006#endif 2007 2008#if GFX_VER >= 8 2009 ps.MaximumNumberofThreadsPerPSD = 2010 devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1); 2011#else 2012 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; 2013#endif 2014 2015 ps.DispatchGRFStartRegisterForConstantSetupData0 = 2016 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0); 2017 ps.DispatchGRFStartRegisterForConstantSetupData1 = 2018 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1); 2019 ps.DispatchGRFStartRegisterForConstantSetupData2 = 2020 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2); 2021 2022#if GFX_VERx10 >= 125 2023 ps.ScratchSpaceBuffer = 2024 get_scratch_surf(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin); 2025#else 2026 ps.PerThreadScratchSpace = get_scratch_space(fs_bin); 2027 ps.ScratchSpaceBasePointer = 2028 get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin); 2029#endif 2030 } 2031} 2032 2033#if GFX_VER >= 8 2034static void 2035emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline, 2036 const struct vk_rasterization_state *rs, 2037 const struct vk_render_pass_state *rp) 2038{ 2039 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); 2040 2041 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { 2042 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps); 2043 return; 2044 } 2045 2046 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) { 2047 ps.PixelShaderValid = true; 2048 ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0; 2049 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; 2050 ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch; 2051 ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; 2052 ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; 2053 ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; 2054 2055 /* If the subpass has a depth or stencil self-dependency, then we need 2056 * to force the hardware to do the depth/stencil write *after* fragment 2057 * shader execution. 
Otherwise, the writes may hit memory before we get 2058 * around to fetching from the input attachment and we may get the depth 2059 * or stencil value from the current draw rather than the previous one. 2060 */ 2061 ps.PixelShaderKillsPixel = rp->depth_self_dependency || 2062 rp->stencil_self_dependency || 2063 wm_prog_data->uses_kill; 2064 2065#if GFX_VER >= 9 2066 ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil; 2067 ps.PixelShaderPullsBary = wm_prog_data->pulls_bary; 2068 2069 ps.InputCoverageMaskState = ICMS_NONE; 2070 assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */ 2071 if (!wm_prog_data->uses_sample_mask) 2072 ps.InputCoverageMaskState = ICMS_NONE; 2073 else if (wm_prog_data->per_coarse_pixel_dispatch) 2074 ps.InputCoverageMaskState = ICMS_NORMAL; 2075 else if (wm_prog_data->post_depth_coverage) 2076 ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE; 2077 else 2078 ps.InputCoverageMaskState = ICMS_NORMAL; 2079#else 2080 ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; 2081#endif 2082 2083#if GFX_VER >= 11 2084 ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients = 2085 wm_prog_data->uses_depth_w_coefficients; 2086 ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch; 2087#endif 2088#if GFX_VERx10 >= 125 2089 /* TODO: We should only require this when the last geometry shader uses 2090 * a fragment shading rate that is not constant. 2091 */ 2092 ps.EnablePSDependencyOnCPsizeChange = wm_prog_data->per_coarse_pixel_dispatch; 2093#endif 2094 } 2095} 2096#endif 2097 2098static void 2099emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline) 2100{ 2101 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) { 2102 vfs.StatisticsEnable = true; 2103 } 2104} 2105 2106static void 2107compute_kill_pixel(struct anv_graphics_pipeline *pipeline, 2108 const struct vk_multisample_state *ms, 2109 const struct vk_render_pass_state *rp) 2110{ 2111 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { 2112 pipeline->kill_pixel = false; 2113 return; 2114 } 2115 2116 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); 2117 2118 /* This computes the KillPixel portion of the computation for whether or 2119 * not we want to enable the PMA fix on gfx8 or gfx9. It's given by this 2120 * chunk of the giant formula: 2121 * 2122 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || 2123 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || 2124 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || 2125 * 3DSTATE_PS_BLEND::AlphaTestEnable || 2126 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) 2127 * 2128 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is 2129 * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept 2130 * of an alpha test. 
2131 */ 2132 pipeline->kill_pixel = 2133 rp->depth_self_dependency || 2134 rp->stencil_self_dependency || 2135 wm_prog_data->uses_kill || 2136 wm_prog_data->uses_omask || 2137 (ms && ms->alpha_to_coverage_enable); 2138} 2139 2140#if GFX_VER == 12 2141static void 2142emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline, 2143 const struct vk_render_pass_state *rp) 2144{ 2145 if (!pipeline->use_primitive_replication) { 2146 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); 2147 return; 2148 } 2149 2150 int view_count = util_bitcount(rp->view_mask); 2151 assert(view_count > 1 && view_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION); 2152 2153 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { 2154 pr.ReplicaMask = (1 << view_count) - 1; 2155 pr.ReplicationCount = view_count - 1; 2156 2157 int i = 0; 2158 u_foreach_bit(view_index, rp->view_mask) { 2159 pr.RTAIOffset[i] = view_index; 2160 i++; 2161 } 2162 } 2163} 2164#endif 2165 2166#if GFX_VERx10 >= 125 2167static void 2168emit_task_state(struct anv_graphics_pipeline *pipeline) 2169{ 2170 assert(anv_pipeline_is_mesh(pipeline)); 2171 2172 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) { 2173 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), zero); 2174 return; 2175 } 2176 2177 const struct anv_shader_bin *task_bin = pipeline->shaders[MESA_SHADER_TASK]; 2178 2179 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), tc) { 2180 tc.TaskShaderEnable = true; 2181 tc.ScratchSpaceBuffer = 2182 get_scratch_surf(&pipeline->base, MESA_SHADER_TASK, task_bin); 2183 tc.MaximumNumberofThreadGroups = 511; 2184 } 2185 2186 const struct intel_device_info *devinfo = &pipeline->base.device->info; 2187 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline); 2188 const struct brw_cs_dispatch_info task_dispatch = 2189 brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL); 2190 2191 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_SHADER), task) { 2192 task.KernelStartPointer = task_bin->kernel.offset; 2193 task.SIMDSize = task_dispatch.simd_size / 16; 2194 task.MessageSIMD = task.SIMDSize; 2195 task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads; 2196 task.ExecutionMask = task_dispatch.right_mask; 2197 task.LocalXMaximum = task_dispatch.group_size - 1; 2198 task.EmitLocalIDX = true; 2199 2200 task.NumberofBarriers = task_prog_data->base.uses_barrier; 2201 task.SharedLocalMemorySize = 2202 encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared); 2203 2204 /* 2205 * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address 2206 * of a buffer with push constants and descriptor set table and 2207 * InlineData[2:7] will be used for first few push constants. 2208 */ 2209 task.EmitInlineParameter = true; 2210 2211 task.XP0Required = task_prog_data->uses_drawid; 2212 } 2213 2214 /* Recommended values from "Task and Mesh Distribution Programming". */ 2215 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_REDISTRIB), redistrib) { 2216 redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1; 2217 redistrib.SmallTaskThreshold = 1; /* 2^N */ 2218 redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */ 2219 redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM; 2220 2221 /* TODO: We have an unknown issue with Task Payload when task redistribution 2222 * is enabled. Disable it for now. 
2223 * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/7141 2224 */ 2225 redistrib.TaskRedistributionMode = TASKREDISTRIB_OFF; 2226 } 2227} 2228 2229static void 2230emit_mesh_state(struct anv_graphics_pipeline *pipeline) 2231{ 2232 assert(anv_pipeline_is_mesh(pipeline)); 2233 2234 const struct anv_shader_bin *mesh_bin = pipeline->shaders[MESA_SHADER_MESH]; 2235 2236 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_CONTROL), mc) { 2237 mc.MeshShaderEnable = true; 2238 mc.ScratchSpaceBuffer = 2239 get_scratch_surf(&pipeline->base, MESA_SHADER_MESH, mesh_bin); 2240 mc.MaximumNumberofThreadGroups = 511; 2241 } 2242 2243 const struct intel_device_info *devinfo = &pipeline->base.device->info; 2244 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); 2245 const struct brw_cs_dispatch_info mesh_dispatch = 2246 brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL); 2247 2248 const unsigned output_topology = 2249 mesh_prog_data->primitive_type == SHADER_PRIM_POINTS ? OUTPUT_POINT : 2250 mesh_prog_data->primitive_type == SHADER_PRIM_LINES ? OUTPUT_LINE : 2251 OUTPUT_TRI; 2252 2253 uint32_t index_format; 2254 switch (mesh_prog_data->index_format) { 2255 case BRW_INDEX_FORMAT_U32: 2256 index_format = INDEX_U32; 2257 break; 2258 default: 2259 unreachable("invalid index format"); 2260 } 2261 2262 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_SHADER), mesh) { 2263 mesh.KernelStartPointer = mesh_bin->kernel.offset; 2264 mesh.SIMDSize = mesh_dispatch.simd_size / 16; 2265 mesh.MessageSIMD = mesh.SIMDSize; 2266 mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads; 2267 mesh.ExecutionMask = mesh_dispatch.right_mask; 2268 mesh.LocalXMaximum = mesh_dispatch.group_size - 1; 2269 mesh.EmitLocalIDX = true; 2270 2271 mesh.MaximumPrimitiveCount = mesh_prog_data->map.max_primitives - 1; 2272 mesh.OutputTopology = output_topology; 2273 mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8; 2274 mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0; 2275 mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8; 2276 mesh.IndexFormat = index_format; 2277 2278 mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier; 2279 mesh.SharedLocalMemorySize = 2280 encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared); 2281 2282 /* 2283 * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address 2284 * of a buffer with push constants and descriptor set table and 2285 * InlineData[2:7] will be used for first few push constants. 2286 */ 2287 mesh.EmitInlineParameter = true; 2288 2289 mesh.XP0Required = mesh_prog_data->uses_drawid; 2290 } 2291 2292 /* Recommended values from "Task and Mesh Distribution Programming". */ 2293 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_DISTRIB), distrib) { 2294 distrib.DistributionMode = MESH_RR_FREE; 2295 distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */ 2296 distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 
3 : 3; /* 2^N thread groups */ 2297 } 2298} 2299#endif 2300 2301void 2302genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline, 2303 const struct vk_graphics_pipeline_state *state) 2304{ 2305 enum intel_urb_deref_block_size urb_deref_block_size; 2306 emit_urb_setup(pipeline, &urb_deref_block_size); 2307 2308 assert(state->rs != NULL); 2309 emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp, 2310 urb_deref_block_size); 2311 emit_ms_state(pipeline, state->ms); 2312 emit_cb_state(pipeline, state->cb, state->ms); 2313 compute_kill_pixel(pipeline, state->ms, state->rp); 2314 2315 emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs); 2316 2317#if GFX_VER == 12 2318 emit_3dstate_primitive_replication(pipeline, state->rp); 2319#endif 2320 2321#if 0 2322 /* From gfx7_vs_state.c */ 2323 2324 /** 2325 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > 2326 * Geometry > Geometry Shader > State: 2327 * 2328 * "Note: Because of corruption in IVB:GT2, software needs to flush the 2329 * whole fixed function pipeline when the GS enable changes value in 2330 * the 3DSTATE_GS." 2331 * 2332 * The hardware architects have clarified that in this context "flush the 2333 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS 2334 * Stall" bit set. 2335 */ 2336 if (device->info.platform == INTEL_PLATFORM_IVB) 2337 gfx7_emit_vs_workaround_flush(brw); 2338#endif 2339 2340 if (anv_pipeline_is_primitive(pipeline)) { 2341 emit_vertex_input(pipeline, state->vi); 2342 2343 emit_3dstate_vs(pipeline); 2344 emit_3dstate_hs_te_ds(pipeline, state->ts); 2345 emit_3dstate_gs(pipeline); 2346 2347 emit_3dstate_vf_statistics(pipeline); 2348 2349 emit_3dstate_streamout(pipeline, state->rs); 2350 2351#if GFX_VERx10 >= 125 2352 const struct anv_device *device = pipeline->base.device; 2353 /* Disable Mesh. */ 2354 if (device->physical->vk.supported_extensions.NV_mesh_shader) { 2355 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_CONTROL), zero); 2356 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), zero); 2357 } 2358#endif 2359 } else { 2360 assert(anv_pipeline_is_mesh(pipeline)); 2361 2362 /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable 2363 * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1. 
2364 */ 2365 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), so) {} 2366 2367#if GFX_VERx10 >= 125 2368 emit_task_state(pipeline); 2369 emit_mesh_state(pipeline); 2370#endif 2371 } 2372 2373 emit_3dstate_sbe(pipeline); 2374 emit_3dstate_wm(pipeline, state->ia, state->rs, 2375 state->ms, state->cb, state->rp); 2376 emit_3dstate_ps(pipeline, state->ms, state->cb); 2377#if GFX_VER >= 8 2378 emit_3dstate_ps_extra(pipeline, state->rs, state->rp); 2379#endif 2380} 2381 2382#if GFX_VERx10 >= 125 2383 2384void 2385genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) 2386{ 2387 struct anv_device *device = pipeline->base.device; 2388 const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); 2389 anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0); 2390 2391 const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs; 2392 const struct intel_device_info *devinfo = &device->info; 2393 2394 anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) { 2395 cfe.MaximumNumberofThreads = 2396 devinfo->max_cs_threads * devinfo->subslice_total; 2397 cfe.ScratchSpaceBuffer = 2398 get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin); 2399 } 2400} 2401 2402#else /* #if GFX_VERx10 >= 125 */ 2403 2404void 2405genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) 2406{ 2407 struct anv_device *device = pipeline->base.device; 2408 const struct intel_device_info *devinfo = &device->info; 2409 const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); 2410 2411 anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0); 2412 2413 const struct brw_cs_dispatch_info dispatch = 2414 brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL); 2415 const uint32_t vfe_curbe_allocation = 2416 ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads + 2417 cs_prog_data->push.cross_thread.regs, 2); 2418 2419 const struct anv_shader_bin *cs_bin = pipeline->cs; 2420 2421 anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) { 2422#if GFX_VER > 7 2423 vfe.StackSize = 0; 2424#else 2425 vfe.GPGPUMode = true; 2426#endif 2427 vfe.MaximumNumberofThreads = 2428 devinfo->max_cs_threads * devinfo->subslice_total - 1; 2429 vfe.NumberofURBEntries = GFX_VER <= 7 ? 0 : 2; 2430#if GFX_VER < 11 2431 vfe.ResetGatewayTimer = true; 2432#endif 2433#if GFX_VER <= 8 2434 vfe.BypassGatewayControl = true; 2435#endif 2436 vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2; 2437 vfe.CURBEAllocationSize = vfe_curbe_allocation; 2438 2439 if (cs_bin->prog_data->total_scratch) { 2440 if (GFX_VER >= 8) { 2441 /* Broadwell's Per Thread Scratch Space is in the range [0, 11] 2442 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. 2443 */ 2444 vfe.PerThreadScratchSpace = 2445 ffs(cs_bin->prog_data->total_scratch) - 11; 2446 } else if (GFX_VERx10 == 75) { 2447 /* Haswell's Per Thread Scratch Space is in the range [0, 10] 2448 * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. 2449 */ 2450 vfe.PerThreadScratchSpace = 2451 ffs(cs_bin->prog_data->total_scratch) - 12; 2452 } else { 2453 /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB] 2454 * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. 
2455 */ 2456 vfe.PerThreadScratchSpace = 2457 cs_bin->prog_data->total_scratch / 1024 - 1; 2458 } 2459 vfe.ScratchSpaceBasePointer = 2460 get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin); 2461 } 2462 } 2463 2464 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { 2465 .KernelStartPointer = 2466 cs_bin->kernel.offset + 2467 brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size), 2468 2469 /* Wa_1606682166 */ 2470 .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin), 2471 /* We add 1 because the CS indirect parameters buffer isn't accounted 2472 * for in bind_map.surface_count. 2473 */ 2474 .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30), 2475 .BarrierEnable = cs_prog_data->uses_barrier, 2476 .SharedLocalMemorySize = 2477 encode_slm_size(GFX_VER, cs_prog_data->base.total_shared), 2478 2479#if GFX_VERx10 != 75 2480 .ConstantURBEntryReadOffset = 0, 2481#endif 2482 .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, 2483#if GFX_VERx10 >= 75 2484 .CrossThreadConstantDataReadLength = 2485 cs_prog_data->push.cross_thread.regs, 2486#endif 2487#if GFX_VER >= 12 2488 /* TODO: Check if we are missing workarounds and enable mid-thread 2489 * preemption. 2490 * 2491 * We still have issues with mid-thread preemption (it was already 2492 * disabled by the kernel on gfx11, due to missing workarounds). It's 2493 * possible that we are just missing some workarounds, and could enable 2494 * it later, but for now let's disable it to fix a GPU hang in compute in Car 2495 * Chase (and possibly more). 2496 */ 2497 .ThreadPreemptionDisable = true, 2498#endif 2499 2500 .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, 2501 }; 2502 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, 2503 pipeline->interface_descriptor_data, 2504 &desc); 2505} 2506 2507#endif /* #if GFX_VERx10 >= 125 */ 2508 2509#if GFX_VERx10 >= 125 2510 2511void 2512genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline) 2513{ 2514 for (uint32_t i = 0; i < pipeline->group_count; i++) { 2515 struct anv_rt_shader_group *group = &pipeline->groups[i]; 2516 2517 switch (group->type) { 2518 case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: { 2519 struct GFX_RT_GENERAL_SBT_HANDLE sh = {}; 2520 sh.General = anv_shader_bin_get_bsr(group->general, 32); 2521 GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh); 2522 break; 2523 } 2524 2525 case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: { 2526 struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {}; 2527 if (group->closest_hit) 2528 sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32); 2529 if (group->any_hit) 2530 sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24); 2531 GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh); 2532 break; 2533 } 2534 2535 case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: { 2536 struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {}; 2537 if (group->closest_hit) 2538 sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32); 2539 sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24); 2540 GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh); 2541 break; 2542 } 2543 2544 default: 2545 unreachable("Invalid shader group type"); 2546 } 2547 } 2548} 2549 2550#else 2551 2552void 2553genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline) 2554{ 2555 unreachable("Ray tracing not supported"); 2556} 2557 2558#endif /* GFX_VERx10 >= 125 */ 2559
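#if 0
/* Editor's illustrative sketch, not part of the driver and not compiled in
 * (guarded by #if 0, like the gfx7_vs_state.c excerpt above): the scratch
 * fields programmed in this file store a power-of-two exponent rather than
 * a byte count.  get_scratch_space() encodes bytes -> exponent as
 * ffs(total_scratch / 2048), so for example total_scratch = 8192 encodes as
 * ffs(8192 / 2048) = ffs(4) = 3.  Assuming the "0 = 1k, 1 = 2k, 2 = 4k,
 * ..., 11 = 2M" mapping described for Broadwell's MEDIA_VFE_STATE above
 * (IVB/BYT use a linear mapping instead), a hypothetical decode helper for
 * that exponent would be:
 */
static UNUSED uint32_t
example_per_thread_scratch_bytes(uint32_t encoded)
{
   /* 0 -> 1k, 1 -> 2k, 2 -> 4k, ..., 11 -> 2M */
   return 1024u << encoded;
}
#endif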