1/* 2 * Copyright © 2017 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included 12 * in all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 * DEALINGS IN THE SOFTWARE. 21 */ 22 23/** 24 * @file crocus_state.c 25 * 26 * ============================= GENXML CODE ============================= 27 * [This file is compiled once per generation.] 28 * ======================================================================= 29 * 30 * This is the main state upload code. 31 * 32 * Gallium uses Constant State Objects, or CSOs, for most state. Large, 33 * complex, or highly reusable state can be created once, and bound and 34 * rebound multiple times. This is modeled with the pipe->create_*_state() 35 * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is 36 * streamed out on the fly, via pipe->set_*_state() hooks. 37 * 38 * OpenGL involves frequently mutating context state, which is mirrored in 39 * core Mesa by highly mutable data structures. 
However, most applications 40 * typically draw the same things over and over - from frame to frame, most 41 * of the same objects are still visible and need to be redrawn. So, rather 42 * than inventing new state all the time, applications usually mutate to swap 43 * between known states that we've seen before. 44 * 45 * Gallium isolates us from this mutation by tracking API state, and 46 * distilling it into a set of Constant State Objects, or CSOs. Large, 47 * complex, or typically reusable state can be created once, then reused 48 * multiple times. Drivers can create and store their own associated data. 49 * This create/bind model corresponds to the pipe->create_*_state() and 50 * pipe->bind_*_state() driver hooks. 51 * 52 * Some state is cheap to create, or expected to be highly dynamic. Rather 53 * than creating and caching piles of CSOs for these, Gallium simply streams 54 * them out, via the pipe->set_*_state() driver hooks. 55 * 56 * To reduce draw time overhead, we try to compute as much state at create 57 * time as possible. Wherever possible, we translate the Gallium pipe state 58 * to 3DSTATE commands, and store those commands in the CSO. At draw time, 59 * we can simply memcpy them into a batch buffer. 60 * 61 * No hardware matches the abstraction perfectly, so some commands require 62 * information from multiple CSOs. In this case, we can store two copies 63 * of the packet (one in each CSO), and simply | together their DWords at 64 * draw time. Sometimes the second set is trivial (one or two fields), so 65 * we simply pack it at draw time. 66 * 67 * There are two main components in the file below. First, the CSO hooks 68 * create/bind/track state. The second are the draw-time upload functions, 69 * crocus_upload_render_state() and crocus_upload_compute_state(), which read 70 * the context state and emit the commands into the actual batch. 
 */

#include <errno.h>
#include <stdio.h>

#if HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#ifdef DEBUG
/* In debug builds, have the genxml pack functions ask Valgrind to verify
 * that every value packed into a state packet is fully defined.
 */
#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
#endif
#else
#define VG(x)
#endif

#include "drm-uapi/i915_drm.h"
#include "intel/common/intel_l3_config.h"
#include "intel/common/intel_sample_positions.h"
#include "intel/compiler/brw_compiler.h"
#include "compiler/shader_info.h"
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/half_float.h"
#include "util/u_dual_blend.h"
#include "util/u_framebuffer.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_transfer.h"
#include "util/u_upload_mgr.h"
#include "util/u_viewport.h"
#include "crocus_batch.h"
#include "crocus_context.h"
#include "crocus_defines.h"
#include "crocus_pipe.h"
#include "crocus_resource.h"

#include "crocus_genx_macros.h"
#include "intel/common/intel_guardband.h"
#include "main/macros.h" /* UNCLAMPED_* */

/**
 * Statically assert that PIPE_* enums match the hardware packets.
 * (As long as they match, we don't need to translate them.)
 */
UNUSED static void pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)

   /* pipe_logicop happens to match the hardware. */
   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);

   /* pipe_blendfactor happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);

   /* pipe_blend_func happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);

   /* pipe_stencil_op happens to match the hardware.  Note that the
    * PIPE "saturating" INCR/DECR map to the hardware's INCRSAT/DECRSAT,
    * and the PIPE wrapping variants map to the hardware's plain INCR/DECR.
    */
   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);

#if GFX_VER >= 6
   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#endif
#undef PIPE_ASSERT
}

/**
 * Translate a Gallium primitive type into the hardware's 3DPRIM topology.
 *
 * For PIPE_PRIM_PATCHES, the hardware's _3DPRIM_PATCHLIST_n values are
 * consecutive, so we start from (_3DPRIM_PATCHLIST_1 - 1) in the table
 * and add verts_per_patch at lookup time; for all other primitive types
 * verts_per_patch is ignored.
 */
static unsigned
translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
{
   static const unsigned map[] = {
      [PIPE_PRIM_POINTS] = _3DPRIM_POINTLIST,
      [PIPE_PRIM_LINES] = _3DPRIM_LINELIST,
      [PIPE_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
      [PIPE_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
      [PIPE_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
      [PIPE_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
      [PIPE_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
      [PIPE_PRIM_QUADS] = _3DPRIM_QUADLIST,
      [PIPE_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
      [PIPE_PRIM_POLYGON] = _3DPRIM_POLYGON,
#if GFX_VER >= 6
      [PIPE_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
      [PIPE_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
      [PIPE_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
#endif
#if GFX_VER >= 7
      [PIPE_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
#endif
   };

   return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
}

/**
 * Translate a pipe_compare_func to the hardware's COMPAREFUNCTION enum.
 */
static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)
{
   static const unsigned map[] = {
      [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
      [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
      [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
      [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
      [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
      [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
      [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
      [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
   };
   return map[pipe_func];
}

static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)
{
   /* Gallium specifies the result of shadow comparisons as:
    *
    *    1 if ref <op> texel,
    *    0 otherwise.
    *
    * The hardware does:
    *
    *    0 if texel <op> ref,
    *    1 otherwise.
    *
    * So we need to flip the operator and also negate.
    */
   static const unsigned map[] = {
      [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
      [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
      [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
      [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
      [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
      [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
      [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
      [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
   };
   return map[pipe_func];
}

/**
 * Translate a PIPE_FACE_* cull mode to the hardware's CULLMODE enum.
 */
static unsigned
translate_cull_mode(unsigned pipe_face)
{
   static const unsigned map[4] = {
      [PIPE_FACE_NONE] = CULLMODE_NONE,
      [PIPE_FACE_FRONT] = CULLMODE_FRONT,
      [PIPE_FACE_BACK] = CULLMODE_BACK,
      [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
   };
   return map[pipe_face];
}

#if GFX_VER >= 6
/**
 * Translate a PIPE_POLYGON_MODE_* fill mode to the hardware's FILL_MODE
 * enum.  FILL_RECTANGLE has no direct hardware equivalent here and is
 * treated as solid fill.
 */
static unsigned
translate_fill_mode(unsigned pipe_polymode)
{
   static const unsigned map[4] = {
      [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
      [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
      [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
      [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
   };
   return map[pipe_polymode];
}
#endif

/**
 * Translate a pipe_tex_mipfilter to the hardware's MIPFILTER enum.
 */
static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
{
   static const unsigned map[] = {
      [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
      [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
      [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
   };
   return map[pipe_mip];
}

/**
 * Translate a pipe texture wrap mode to the hardware's TCM enum.
 *
 * On Gen8 we can use TCM_HALF_BORDER for PIPE_TEX_WRAP_CLAMP; before
 * that we approximate it with TCM_CLAMP_BORDER, except that when both
 * min and mag filters are nearest (either_nearest) plain TCM_CLAMP
 * matches GL_CLAMP behavior.
 */
static uint32_t
translate_wrap(unsigned pipe_wrap, bool either_nearest)
{
   static const unsigned map[] = {
      [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
#if GFX_VER == 8
      [PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
#else
      [PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER,
#endif
      [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
      [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
      [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,

      /* These are unsupported. */
      [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
   };
#if GFX_VER < 8
   if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
      return TCM_CLAMP;
#endif
   return map[pipe_wrap];
}

/**
 * Equivalent of brw_state_batch() in the classic i965 driver.
 *
 * Allocates `size` bytes (aligned to `alignment`) in the batch's state
 * buffer, flushing the batch (when allowed) or growing the buffer as
 * needed.  Returns a CPU pointer to the space and stores the offset
 * within the state buffer in *out_offset.
 */
static uint32_t *
stream_state(struct crocus_batch *batch,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   uint32_t offset = ALIGN(batch->state.used, alignment);

   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      /* Wouldn't fit in the normal state area: start a fresh batch. */
      crocus_batch_flush(batch);
      offset = ALIGN(batch->state.used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      /* Mid-batch (no_wrap) overflow: grow the state BO by 1.5x,
       * capped at MAX_STATE_SIZE.
       */
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      crocus_grow_buffer(batch, true, batch->state.used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   crocus_record_state_size(batch->state_sizes, offset, size);

   batch->state.used = offset + size;
   *out_offset = offset;

   return (uint32_t *)batch->state.map + (offset >> 2);
}

/**
 * stream_state() + memcpy.  Returns the offset of the copied data
 * within the state buffer.
 */
static uint32_t
emit_state(struct crocus_batch *batch, const void *data, unsigned size,
           unsigned alignment)
{
   unsigned offset = 0;
   uint32_t *map = stream_state(batch, size, alignment, &offset);

   if (map)
      memcpy(map, data, size);

   return offset;
}

#if GFX_VER <= 5
/**
 * Emit 3DSTATE_PIPELINED_POINTERS, pointing the fixed-function units at
 * the given unit-state offsets within the batch's state buffer.
 */
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}

#endif
/**
 * Did field 'x' change between 'old_cso' and 'new_cso'?
 *
 * (If so, we may want to set some dirty flags.)
 */
#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
#define cso_changed_memcmp(x) \
   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)

static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}

static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}

#if GFX_VER >= 6
/**
 * Emit MI_STORE_REGISTER_MEM to snapshot a 32-bit MMIO register into a
 * buffer.  Predicated stores are only supported on HSW+ (GFX_VERx10 >= 75).
 */
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}

/**
 * Store a 64-bit register pair as two consecutive 32-bit stores.
 */
static void
crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated);
   crocus_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated);
}
#endif

#if GFX_VER >= 7
/**
 * Emit MI_LOAD_REGISTER_IMM to write an immediate into an MMIO register.
 */
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = val;
   }
}
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)

#if GFX_VERx10 >= 75
/**
 * Emit MI_LOAD_REGISTER_REG to copy one MMIO register to another (HSW+).
 */
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}

/**
 * Copy a 64-bit register pair via two 32-bit register-to-register copies.
 */
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
   _crocus_emit_lrr(batch, dst + 4, src + 4);
}
#endif

static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}

/**
 * Load a 64-bit immediate into a register pair via two 32-bit LRIs.
 */
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   _crocus_emit_lri(batch, reg + 0, val & 0xffffffff);
   _crocus_emit_lri(batch, reg + 4, val >> 32);
}

/**
 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
 */
static void
crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = ro_bo(bo, offset);
   }
}

/**
 * Load a 64-bit value from a buffer into a MMIO register via
 * two MI_LOAD_REGISTER_MEM commands.
 */
static void
crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
   crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
}

#if GFX_VERx10 >= 75
/**
 * Emit MI_STORE_DATA_IMM to write a 32-bit immediate to a buffer.
 */
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}

/**
 * Emit MI_STORE_DATA_IMM to write a 64-bit immediate to a buffer.
 */
static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
#endif

/**
 * Copy `bytes` bytes between two buffers, one DWord at a time, by
 * bouncing each DWord through a scratch MMIO register.
 */
static void
crocus_copy_mem_mem(struct crocus_batch *batch,
                    struct crocus_bo *dst_bo, uint32_t dst_offset,
                    struct crocus_bo *src_bo, uint32_t src_offset,
                    unsigned bytes)
{
   assert(bytes % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(src_offset % 4 == 0);

#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
   for (unsigned i = 0; i < bytes; i += 4) {
      crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
                                 src_bo, src_offset + i);
      crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
                                  dst_bo, dst_offset + i, false);
   }
}
#endif

/**
 * Gallium CSO for rasterizer state.
650 */ 651struct crocus_rasterizer_state { 652 struct pipe_rasterizer_state cso; 653#if GFX_VER >= 6 654 uint32_t sf[GENX(3DSTATE_SF_length)]; 655 uint32_t clip[GENX(3DSTATE_CLIP_length)]; 656#endif 657#if GFX_VER >= 8 658 uint32_t raster[GENX(3DSTATE_RASTER_length)]; 659#endif 660 uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)]; 661 662 uint8_t num_clip_plane_consts; 663 bool fill_mode_point_or_line; 664}; 665 666#if GFX_VER <= 5 667#define URB_VS 0 668#define URB_GS 1 669#define URB_CLP 2 670#define URB_SF 3 671#define URB_CS 4 672 673static const struct { 674 uint32_t min_nr_entries; 675 uint32_t preferred_nr_entries; 676 uint32_t min_entry_size; 677 uint32_t max_entry_size; 678} limits[URB_CS+1] = { 679 { 16, 32, 1, 5 }, /* vs */ 680 { 4, 8, 1, 5 }, /* gs */ 681 { 5, 10, 1, 5 }, /* clp */ 682 { 1, 8, 1, 12 }, /* sf */ 683 { 1, 4, 1, 32 } /* cs */ 684}; 685 686static bool check_urb_layout(struct crocus_context *ice) 687{ 688 ice->urb.vs_start = 0; 689 ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize; 690 ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize; 691 ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize; 692 ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize; 693 694 return ice->urb.cs_start + ice->urb.nr_cs_entries * 695 ice->urb.csize <= ice->urb.size; 696} 697 698 699static bool 700crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize, 701 unsigned vsize, unsigned sfsize) 702{ 703 struct crocus_context *ice = batch->ice; 704 if (csize < limits[URB_CS].min_entry_size) 705 csize = limits[URB_CS].min_entry_size; 706 707 if (vsize < limits[URB_VS].min_entry_size) 708 vsize = limits[URB_VS].min_entry_size; 709 710 if (sfsize < limits[URB_SF].min_entry_size) 711 sfsize = limits[URB_SF].min_entry_size; 712 713 if (ice->urb.vsize < vsize || 714 ice->urb.sfsize < sfsize || 715 ice->urb.csize < csize || 716 (ice->urb.constrained && 
(ice->urb.vsize > vsize || 717 ice->urb.sfsize > sfsize || 718 ice->urb.csize > csize))) { 719 720 721 ice->urb.csize = csize; 722 ice->urb.sfsize = sfsize; 723 ice->urb.vsize = vsize; 724 725 ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries; 726 ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries; 727 ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries; 728 ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries; 729 ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries; 730 731 ice->urb.constrained = 0; 732 733 if (GFX_VER == 5) { 734 ice->urb.nr_vs_entries = 128; 735 ice->urb.nr_sf_entries = 48; 736 if (check_urb_layout(ice)) { 737 goto done; 738 } else { 739 ice->urb.constrained = 1; 740 ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries; 741 ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries; 742 } 743 } else if (GFX_VERx10 == 45) { 744 ice->urb.nr_vs_entries = 64; 745 if (check_urb_layout(ice)) { 746 goto done; 747 } else { 748 ice->urb.constrained = 1; 749 ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries; 750 } 751 } 752 753 if (!check_urb_layout(ice)) { 754 ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries; 755 ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries; 756 ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries; 757 ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries; 758 ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries; 759 760 /* Mark us as operating with constrained nr_entries, so that next 761 * time we recalculate we'll resize the fences in the hope of 762 * escaping constrained mode and getting back to normal performance. 763 */ 764 ice->urb.constrained = 1; 765 766 if (!check_urb_layout(ice)) { 767 /* This is impossible, given the maximal sizes of urb 768 * entries and the values for minimum nr of entries 769 * provided above. 
770 */ 771 fprintf(stderr, "couldn't calculate URB layout!\n"); 772 exit(1); 773 } 774 775 if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF)) 776 fprintf(stderr, "URB CONSTRAINED\n"); 777 } 778 779done: 780 if (INTEL_DEBUG(DEBUG_URB)) 781 fprintf(stderr, 782 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n", 783 ice->urb.vs_start, 784 ice->urb.gs_start, 785 ice->urb.clip_start, 786 ice->urb.sf_start, 787 ice->urb.cs_start, 788 ice->urb.size); 789 return true; 790 } 791 return false; 792} 793 794static void 795crocus_upload_urb_fence(struct crocus_batch *batch) 796{ 797 uint32_t urb_fence[3]; 798 _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) { 799 urb.VSUnitURBReallocationRequest = 1; 800 urb.GSUnitURBReallocationRequest = 1; 801 urb.CLIPUnitURBReallocationRequest = 1; 802 urb.SFUnitURBReallocationRequest = 1; 803 urb.VFEUnitURBReallocationRequest = 1; 804 urb.CSUnitURBReallocationRequest = 1; 805 806 urb.VSFence = batch->ice->urb.gs_start; 807 urb.GSFence = batch->ice->urb.clip_start; 808 urb.CLIPFence = batch->ice->urb.sf_start; 809 urb.SFFence = batch->ice->urb.cs_start; 810 urb.CSFence = batch->ice->urb.size; 811 } 812 813 /* erratum: URB_FENCE must not cross a 64byte cacheline */ 814 if ((crocus_batch_bytes_used(batch) & 15) > 12) { 815 int pad = 16 - (crocus_batch_bytes_used(batch) & 15); 816 do { 817 *(uint32_t *)batch->command.map_next = 0; 818 batch->command.map_next += sizeof(uint32_t); 819 } while (--pad); 820 } 821 822 crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3); 823} 824 825static bool 826calculate_curbe_offsets(struct crocus_batch *batch) 827{ 828 struct crocus_context *ice = batch->ice; 829 830 unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0; 831 unsigned total_regs; 832 833 nr_fp_regs = 0; 834 for (int i = 0; i < 4; i++) { 835 const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i]; 836 if (range->length == 0) 837 continue; 838 839 /* ubo range tracks at 256-bit, we 
need 512-bit */ 840 nr_fp_regs += (range->length + 1) / 2; 841 } 842 843 if (ice->state.cso_rast->cso.clip_plane_enable) { 844 unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable); 845 nr_clip_regs = (nr_planes * 4 + 15) / 16; 846 } 847 848 nr_vp_regs = 0; 849 for (int i = 0; i < 4; i++) { 850 const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i]; 851 if (range->length == 0) 852 continue; 853 854 /* ubo range tracks at 256-bit, we need 512-bit */ 855 nr_vp_regs += (range->length + 1) / 2; 856 } 857 if (nr_vp_regs == 0) { 858 /* The pre-gen6 VS requires that some push constants get loaded no 859 * matter what, or the GPU would hang. 860 */ 861 nr_vp_regs = 1; 862 } 863 total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs; 864 865 /* The CURBE allocation size is limited to 32 512-bit units (128 EU 866 * registers, or 1024 floats). See CS_URB_STATE in the gen4 or gen5 867 * (volume 1, part 1) PRMs. 868 * 869 * Note that in brw_fs.cpp we're only loading up to 16 EU registers of 870 * values as push constants before spilling to pull constants, and in 871 * brw_vec4.cpp we're loading up to 32 registers of push constants. An EU 872 * register is 1/2 of one of these URB entry units, so that leaves us 16 EU 873 * regs for clip. 
874 */ 875 assert(total_regs <= 32); 876 877 /* Lazy resize: 878 */ 879 if (nr_fp_regs > ice->curbe.wm_size || 880 nr_vp_regs > ice->curbe.vs_size || 881 nr_clip_regs != ice->curbe.clip_size || 882 (total_regs < ice->curbe.total_size / 4 && 883 ice->curbe.total_size > 16)) { 884 885 GLuint reg = 0; 886 887 /* Calculate a new layout: 888 */ 889 reg = 0; 890 ice->curbe.wm_start = reg; 891 ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs; 892 ice->curbe.clip_start = reg; 893 ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs; 894 ice->curbe.vs_start = reg; 895 ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs; 896 ice->curbe.total_size = reg; 897 898 if (0) 899 fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n", 900 ice->curbe.wm_start, 901 ice->curbe.wm_size, 902 ice->curbe.clip_start, 903 ice->curbe.clip_size, 904 ice->curbe.vs_start, 905 ice->curbe.vs_size ); 906 return true; 907 } 908 return false; 909} 910 911static void 912upload_shader_consts(struct crocus_context *ice, 913 gl_shader_stage stage, 914 uint32_t *map, 915 unsigned start) 916{ 917 struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; 918 struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; 919 uint32_t *cmap; 920 bool found = false; 921 unsigned offset = start * 16; 922 int total = 0; 923 for (int i = 0; i < 4; i++) { 924 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 925 926 if (range->length == 0) 927 continue; 928 929 unsigned block_index = crocus_bti_to_group_index( 930 &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block); 931 unsigned len = range->length * 8 * sizeof(float); 932 unsigned start = range->start * 8 * sizeof(float); 933 struct pipe_transfer *transfer; 934 935 cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer, 936 ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len, 937 PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer); 938 if (cmap) 939 memcpy(&map[offset 
+ (total * 8)], cmap, len); 940 pipe_buffer_unmap(&ice->ctx, transfer); 941 total += range->length; 942 found = true; 943 } 944 945 if (stage == MESA_SHADER_VERTEX && !found) { 946 /* The pre-gen6 VS requires that some push constants get loaded no 947 * matter what, or the GPU would hang. 948 */ 949 unsigned len = 16; 950 memset(&map[offset], 0, len); 951 } 952} 953 954static const float fixed_plane[6][4] = { 955 { 0, 0, -1, 1 }, 956 { 0, 0, 1, 1 }, 957 { 0, -1, 0, 1 }, 958 { 0, 1, 0, 1 }, 959 {-1, 0, 0, 1 }, 960 { 1, 0, 0, 1 } 961}; 962 963static void 964gen4_upload_curbe(struct crocus_batch *batch) 965{ 966 struct crocus_context *ice = batch->ice; 967 const unsigned sz = ice->curbe.total_size; 968 const unsigned buf_sz = sz * 16 * sizeof(float); 969 970 if (sz == 0) 971 goto emit; 972 973 uint32_t *map; 974 u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64, 975 &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map); 976 977 /* fragment shader constants */ 978 if (ice->curbe.wm_size) { 979 upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start); 980 } 981 982 /* clipper constants */ 983 if (ice->curbe.clip_size) { 984 unsigned offset = ice->curbe.clip_start * 16; 985 float *fmap = (float *)map; 986 unsigned i; 987 /* If any planes are going this way, send them all this way: 988 */ 989 for (i = 0; i < 6; i++) { 990 fmap[offset + i * 4 + 0] = fixed_plane[i][0]; 991 fmap[offset + i * 4 + 1] = fixed_plane[i][1]; 992 fmap[offset + i * 4 + 2] = fixed_plane[i][2]; 993 fmap[offset + i * 4 + 3] = fixed_plane[i][3]; 994 } 995 996 unsigned mask = ice->state.cso_rast->cso.clip_plane_enable; 997 struct pipe_clip_state *cp = &ice->state.clip_planes; 998 while (mask) { 999 const int j = u_bit_scan(&mask); 1000 fmap[offset + i * 4 + 0] = cp->ucp[j][0]; 1001 fmap[offset + i * 4 + 1] = cp->ucp[j][1]; 1002 fmap[offset + i * 4 + 2] = cp->ucp[j][2]; 1003 fmap[offset + i * 4 + 3] = cp->ucp[j][3]; 1004 i++; 1005 } 1006 } 1007 1008 
/* vertex shader constants */ 1009 if (ice->curbe.vs_size) { 1010 upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start); 1011 } 1012 if (0) { 1013 for (int i = 0; i < sz*16; i+=4) { 1014 float *f = (float *)map; 1015 fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4, 1016 f[i+0], f[i+1], f[i+2], f[i+3]); 1017 } 1018 } 1019 1020emit: 1021 crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) { 1022 if (ice->curbe.curbe_res) { 1023 cb.BufferLength = ice->curbe.total_size - 1; 1024 cb.Valid = 1; 1025 cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset); 1026 } 1027 } 1028 1029#if GFX_VER == 4 && GFX_VERx10 != 45 1030 /* Work around a Broadwater/Crestline depth interpolator bug. The 1031 * following sequence will cause GPU hangs: 1032 * 1033 * 1. Change state so that all depth related fields in CC_STATE are 1034 * disabled, and in WM_STATE, only "PS Use Source Depth" is enabled. 1035 * 2. Emit a CONSTANT_BUFFER packet. 1036 * 3. Draw via 3DPRIMITIVE. 1037 * 1038 * The recommended workaround is to emit a non-pipelined state change after 1039 * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline. 1040 * 1041 * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small), 1042 * and always emit it when "PS Use Source Depth" is set. We could be more 1043 * precise, but the additional complexity is probably not worth it. 
1044 * 1045 */ 1046 const struct shader_info *fs_info = 1047 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT); 1048 1049 if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) { 1050 ice->state.global_depth_offset_clamp = 0; 1051 crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp); 1052 } 1053#endif 1054} 1055#endif 1056 1057#if GFX_VER >= 7 1058 1059#define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000 1060#define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000 1061#define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000 1062 1063static void 1064setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg) 1065{ 1066#if GFX_VER == 7 1067 const struct intel_device_info *devinfo = &batch->screen->devinfo; 1068 const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL]; 1069 const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] || 1070 cfg->n[INTEL_L3P_ALL]; 1071 const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] || 1072 cfg->n[INTEL_L3P_ALL]; 1073 const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] || 1074 cfg->n[INTEL_L3P_ALL]; 1075 const bool has_slm = cfg->n[INTEL_L3P_SLM]; 1076#endif 1077 1078 /* According to the hardware docs, the L3 partitioning can only be changed 1079 * while the pipeline is completely drained and the caches are flushed, 1080 * which involves a first PIPE_CONTROL flush which stalls the pipeline... 1081 */ 1082 crocus_emit_pipe_control_flush(batch, "l3_config", 1083 PIPE_CONTROL_DATA_CACHE_FLUSH | 1084 PIPE_CONTROL_CS_STALL); 1085 1086 /* ...followed by a second pipelined PIPE_CONTROL that initiates 1087 * invalidation of the relevant caches. Note that because RO invalidation 1088 * happens at the top of the pipeline (i.e. 
right away as the PIPE_CONTROL 1089 * command is processed by the CS) we cannot combine it with the previous 1090 * stalling flush as the hardware documentation suggests, because that 1091 * would cause the CS to stall on previous rendering *after* RO 1092 * invalidation and wouldn't prevent the RO caches from being polluted by 1093 * concurrent rendering before the stall completes. This intentionally 1094 * doesn't implement the SKL+ hardware workaround suggesting to enable CS 1095 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for 1096 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs 1097 * already guarantee that there is no concurrent GPGPU kernel execution 1098 * (see SKL HSD 2132585). 1099 */ 1100 crocus_emit_pipe_control_flush(batch, "l3 config", 1101 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | 1102 PIPE_CONTROL_CONST_CACHE_INVALIDATE | 1103 PIPE_CONTROL_INSTRUCTION_INVALIDATE | 1104 PIPE_CONTROL_STATE_CACHE_INVALIDATE); 1105 1106 /* Now send a third stalling flush to make sure that invalidation is 1107 * complete when the L3 configuration registers are modified. 1108 */ 1109 crocus_emit_pipe_control_flush(batch, "l3 config", 1110 PIPE_CONTROL_DATA_CACHE_FLUSH | 1111 PIPE_CONTROL_CS_STALL); 1112 1113#if GFX_VER == 8 1114 assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]); 1115 crocus_emit_reg(batch, GENX(L3CNTLREG), reg) { 1116 reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0; 1117 reg.URBAllocation = cfg->n[INTEL_L3P_URB]; 1118 reg.ROAllocation = cfg->n[INTEL_L3P_RO]; 1119 reg.DCAllocation = cfg->n[INTEL_L3P_DC]; 1120 reg.AllAllocation = cfg->n[INTEL_L3P_ALL]; 1121 } 1122#else 1123 assert(!cfg->n[INTEL_L3P_ALL]); 1124 1125 /* When enabled SLM only uses a portion of the L3 on half of the banks, 1126 * the matching space on the remaining banks has to be allocated to a 1127 * client (URB for all validated configurations) set to the 1128 * lower-bandwidth 2-bank address hashing mode. 
1129 */ 1130 const bool urb_low_bw = has_slm && devinfo->platform != INTEL_PLATFORM_BYT; 1131 assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]); 1132 1133 /* Minimum number of ways that can be allocated to the URB. */ 1134 const unsigned n0_urb = (devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0); 1135 assert(cfg->n[INTEL_L3P_URB] >= n0_urb); 1136 1137 uint32_t l3sqcr1, l3cr2, l3cr3; 1138 1139 crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) { 1140 reg.ConvertDC_UC = !has_dc; 1141 reg.ConvertIS_UC = !has_is; 1142 reg.ConvertC_UC = !has_c; 1143 reg.ConvertT_UC = !has_t; 1144#if GFX_VERx10 == 75 1145 reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT; 1146#else 1147 reg.L3SQGeneralPriorityCreditInitialization = 1148 devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT; 1149#endif 1150 reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT; 1151 }; 1152 1153 crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) { 1154 reg.SLMEnable = has_slm; 1155 reg.URBLowBandwidth = urb_low_bw; 1156 reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb; 1157#if !(GFX_VERx10 == 75) 1158 reg.ALLAllocation = cfg->n[INTEL_L3P_ALL]; 1159#endif 1160 reg.ROAllocation = cfg->n[INTEL_L3P_RO]; 1161 reg.DCAllocation = cfg->n[INTEL_L3P_DC]; 1162 }; 1163 1164 crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) { 1165 reg.ISAllocation = cfg->n[INTEL_L3P_IS]; 1166 reg.ISLowBandwidth = 0; 1167 reg.CAllocation = cfg->n[INTEL_L3P_C]; 1168 reg.CLowBandwidth = 0; 1169 reg.TAllocation = cfg->n[INTEL_L3P_T]; 1170 reg.TLowBandwidth = 0; 1171 }; 1172 1173 /* Set up the L3 partitioning. 
*/ 1174 crocus_emit_lri(batch, L3SQCREG1, l3sqcr1); 1175 crocus_emit_lri(batch, L3CNTLREG2, l3cr2); 1176 crocus_emit_lri(batch, L3CNTLREG3, l3cr3); 1177 1178#if GFX_VERSIONx10 == 75 1179 /* TODO: Fail screen creation if command parser version < 4 */ 1180 uint32_t scratch1, chicken3; 1181 crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) { 1182 reg.L3AtomicDisable = !has_dc; 1183 } 1184 crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) { 1185 reg.L3AtomicDisableMask = true; 1186 reg.L3AtomicDisable = !has_dc; 1187 } 1188 crocus_emit_lri(batch, SCRATCH1, scratch1); 1189 crocus_emit_lri(batch, CHICKEN3, chicken3); 1190#endif 1191#endif 1192} 1193 1194static void 1195emit_l3_state(struct crocus_batch *batch, bool compute) 1196{ 1197 const struct intel_l3_config *const cfg = 1198 compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d; 1199 1200 setup_l3_config(batch, cfg); 1201 if (INTEL_DEBUG(DEBUG_L3)) { 1202 intel_dump_l3_config(cfg, stderr); 1203 } 1204} 1205 1206/** 1207 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set. 1208 */ 1209static void 1210gen7_emit_cs_stall_flush(struct crocus_batch *batch) 1211{ 1212 crocus_emit_pipe_control_write(batch, 1213 "workaround", 1214 PIPE_CONTROL_CS_STALL 1215 | PIPE_CONTROL_WRITE_IMMEDIATE, 1216 batch->ice->workaround_bo, 1217 batch->ice->workaround_offset, 0); 1218} 1219#endif 1220 1221static void 1222emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline) 1223{ 1224#if GFX_VER == 8 1225 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT: 1226 * 1227 * Software must clear the COLOR_CALC_STATE Valid field in 1228 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT 1229 * with Pipeline Select set to GPGPU. 1230 * 1231 * The internal hardware docs recommend the same workaround for Gfx9 1232 * hardware too. 
 */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *    "Project: DEVSNB+
    *
    *     Software must ensure all the write caches are flushed through a
    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *     command to invalidate read only caches prior to programming
    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: PRE-DEVSNB
    *
    *   Software must ensure the current pipeline is flushed via an
    *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   if (pipeline == _3D) {
      /* IVB-only: after switching back to 3D, drain with a CS-stall flush
       * and a degenerate (point-list) 3DPRIMITIVE. */
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}

/**
 * The following diagram shows how we partition the URB:
 *
 *        16kB or 32kB               Rest of the URB space
 *   __________-__________   _________________-_________________
 *  /                     \ /                                   \
 * +-------------------------------------------------------------+
 * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
 * |       Constants       |               Entries               |
 * +-------------------------------------------------------------+
 *
 * Notably, push constants must be stored at the beginning of the URB
 * space, while entries can be stored anywhere.  Ivybridge and Haswell
 * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
 * doubles this (32kB).
 *
 * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
 * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
 * sized in increments of 2kB.
 *
 * Currently we split the constant buffer space evenly among whatever stages
 * are active.  This is probably not ideal, but simple.
 *
 * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
 * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
 * Haswell GT3 has 512kB of URB space.
 *
 * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
 * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
 */
#if GFX_VER >= 7
static void
crocus_alloc_push_constants(struct crocus_batch *batch)
{
   const unsigned push_constant_kb =
      batch->screen->devinfo.max_constant_urb_size_kb;
   /* Divide evenly among the five render stages (VS/HS/DS/GS/FS). */
   unsigned size_per_stage = push_constant_kb / 5;

   /* For now, we set a static partitioning of the push constant area,
    * assuming that all stages could be in use.
    *
    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
    *       see if that improves performance by offering more space to
    *       the VS/FS when those aren't in use.  Also, try dynamically
    *       enabling/disabling it like i965 does.  This would be more
    *       stalls and may not actually help; we don't know yet.
    */
   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         /* Sub-opcodes 18..22 select the per-stage ALLOC packet. */
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = size_per_stage * i;
         /* The final stage (FS) absorbs any remainder of the division. */
         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
      }
   }

   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
    *
    *    A PIPE_CONTROL command with the CS Stall bit set must be programmed
    *    in the ring after this instruction.
    *
    * No such restriction exists for Haswell or Baytrail.
    */
   if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
      gen7_emit_cs_stall_flush(batch);
}
#endif

/**
 * Upload the initial GPU state for a render context.
 *
 * This sets some invariant state that needs to be programmed a particular
 * way, but we never actually change.
 */
static void
crocus_init_render_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, _3D);

   /* Program a null system instruction pointer. */
   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);

#if GFX_VER >= 7
   emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
   crocus_emit_reg(batch, GENX(INSTPM), reg) {
      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
   /* Use the legacy AA line coverage computation. */
   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif

   /* No polygon stippling offsets are necessary. */
   /* TODO: may need to set an offset for origin-UL framebuffers */
   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);

#if GFX_VER >= 7
   crocus_alloc_push_constants(batch);
#endif

#if GFX_VER == 8
   /* Set the initial MSAA sample positions. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
      INTEL_SAMPLE_POS_1X(pat._1xSample);
      INTEL_SAMPLE_POS_2X(pat._2xSample);
      INTEL_SAMPLE_POS_4X(pat._4xSample);
      INTEL_SAMPLE_POS_8X(pat._8xSample);
   }

   /* Disable chromakeying (it's for media) */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);

   /* We want regular rendering, not special HiZ operations. */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}

#if GFX_VER >= 7
/**
 * Upload the initial GPU state for a compute context: select the GPGPU
 * pipeline and program the compute L3 configuration.
 */
static void
crocus_init_compute_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, GPGPU);

#if GFX_VER >= 7
   emit_l3_state(batch, true);
#endif
}
#endif

/**
 * Generation-specific context state (ice->state.genx->...).
 *
 * Most state can go in crocus_context directly, but these encode hardware
 * packets which vary by generation.
 */
struct crocus_genx_state {
   struct {
#if GFX_VER >= 7
      struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
#endif
   } shaders[MESA_SHADER_STAGES];

#if GFX_VER == 8
   /* Whether the PMA stall-avoidance workaround is currently enabled. */
   bool pma_fix_enabled;
#endif
};

/**
 * The pipe->set_blend_color() driver hook.
 *
 * This corresponds to our COLOR_CALC_STATE.
 */
static void
crocus_set_blend_color(struct pipe_context *ctx,
                       const struct pipe_blend_color *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
   memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
#if GFX_VER <= 5
   ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
#else
   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
}

/**
 * Gallium CSO for blend state (see pipe_blend_state).
 */
struct crocus_blend_state {
#if GFX_VER == 8
   /** Partial 3DSTATE_PS_BLEND */
   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
#endif

   /** copy of BLEND_STATE */
   struct pipe_blend_state cso;

   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
   uint8_t blend_enables;

   /** Bitfield of whether color writes are enabled for RT[i] */
   uint8_t color_write_enables;

   /** Does RT[0] use dual color blending?
 */
   bool dual_color_blending;
};

/**
 * Replace SRC1 alpha blend factors with ONE/ZERO when alpha-to-one is
 * enabled (src1 alpha is forced to 1.0 in that mode).
 */
static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
{
   if (alpha_to_one) {
      if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
         return PIPE_BLENDFACTOR_ONE;

      if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
         return PIPE_BLENDFACTOR_ZERO;
   }

   return f;
}

/* Per-RT blend bits live in BLEND_STATE_ENTRY on gen6+, but in
 * COLOR_CALC_STATE on gen4/5. */
#if GFX_VER >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif

/**
 * Pre-gen8 hardware restricts logic ops to UNORM targets; check the first
 * bound color buffer's format (or allow if none is bound).
 */
static bool
can_emit_logic_op(struct crocus_context *ice)
{
   /* all pre gen8 have logicop restricted to unorm */
   enum pipe_format pformat = PIPE_FORMAT_NONE;
   for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
      if (ice->state.framebuffer.cbufs[i]) {
         pformat = ice->state.framebuffer.cbufs[i]->format;
         break;
      }
   }
   return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
}

/**
 * Fill the blend fields of one hardware blend entry from the blend CSO for
 * render target \p idx.
 *
 * Returns whether RGB and alpha use different functions/factors, which
 * feeds IndependentAlphaBlendEnable at pack time.
 */
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
                     struct crocus_blend_state *cso_blend,
                     int idx)
{
   struct crocus_context *ice = batch->ice;
   bool independent_alpha_blend = false;
   const struct pipe_rt_blend_state *rt =
      &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
   const unsigned blend_enabled = rt->blend_enable;

   enum pipe_blendfactor src_rgb =
      fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor src_alpha =
      fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_rgb =
      fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_alpha =
      fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);

   if (rt->rgb_func != rt->alpha_func ||
       src_rgb != src_alpha || dst_rgb != dst_alpha)
      independent_alpha_blend = true;
   if (cso_blend->cso.logicop_enable) {
      if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
         entry->LogicOpEnable = cso_blend->cso.logicop_enable;
         entry->LogicOpFunction = cso_blend->cso.logicop_func;
      }
   } else if (blend_enabled) {
      if (idx == 0) {
         /* RT[0] only blends when the shader matches the dual-source
          * expectation of the CSO. */
         struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
         struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
         entry->ColorBufferBlendEnable =
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      } else
         entry->ColorBufferBlendEnable = 1;

      entry->ColorBlendFunction = rt->rgb_func;
      entry->AlphaBlendFunction = rt->alpha_func;
      entry->SourceBlendFactor = (int) src_rgb;
      entry->SourceAlphaBlendFactor = (int) src_alpha;
      entry->DestinationBlendFactor = (int) dst_rgb;
      entry->DestinationAlphaBlendFactor = (int) dst_alpha;
   }
#if GFX_VER <= 5
   /*
    * Gen4/GM45/ILK can't handle have ColorBufferBlendEnable == 0
    * when a dual src blend shader is in use. Setup dummy blending.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
   if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
      entry->ColorBufferBlendEnable = 1;
      entry->ColorBlendFunction = PIPE_BLEND_ADD;
      entry->AlphaBlendFunction = PIPE_BLEND_ADD;
      entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
      entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
   }
#endif
   return independent_alpha_blend;
}

/**
 * The pipe->create_blend_state() driver hook.
 *
 * Translates a pipe_blend_state into crocus_blend_state.
 */
static void *
crocus_create_blend_state(struct pipe_context *ctx,
                          const struct pipe_blend_state *state)
{
   struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));

   cso->blend_enables = 0;
   cso->color_write_enables = 0;
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);

   cso->cso = *state;
   cso->dual_color_blending = util_blend_state_is_dual(state, 0);

#if GFX_VER == 8
   bool indep_alpha_blend = false;
#endif
   for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
      const struct pipe_rt_blend_state *rt =
         &state->rt[state->independent_blend_enable ?
 i : 0];
      if (rt->blend_enable)
         cso->blend_enables |= 1u << i;
      if (rt->colormask)
         cso->color_write_enables |= 1u << i;
#if GFX_VER == 8
      enum pipe_blendfactor src_rgb =
         fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
      enum pipe_blendfactor src_alpha =
         fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
      enum pipe_blendfactor dst_rgb =
         fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
      enum pipe_blendfactor dst_alpha =
         fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);

      if (rt->rgb_func != rt->alpha_func ||
          src_rgb != src_alpha || dst_rgb != dst_alpha)
         indep_alpha_blend = true;
#endif
   }

#if GFX_VER == 8
   crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
      /* pb.HasWriteableRT is filled in at draw time.
       * pb.AlphaTestEnable is filled in at draw time.
       *
       * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
       * setting it when dual color blending without an appropriate shader.
       */

      pb.AlphaToCoverageEnable = state->alpha_to_coverage;
      pb.IndependentAlphaBlendEnable = indep_alpha_blend;

      /* The casts prevent warnings about implicit enum type conversions. */
      pb.SourceBlendFactor =
         (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
      pb.SourceAlphaBlendFactor =
         (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
      pb.DestinationBlendFactor =
         (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
      pb.DestinationAlphaBlendFactor =
         (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
   }
#endif
   return cso;
}

/**
 * The pipe->bind_blend_state() driver hook.
 *
 * Bind a blending CSO and flag related dirty bits.
 */
static void
crocus_bind_blend_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_blend_state *cso = state;

   ice->state.cso_blend = cso;
   ice->state.blend_enables = cso ? cso->blend_enables : 0;

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
   ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER >= 7
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
#if GFX_VER == 8
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif
   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
}

/**
 * Return true if the FS writes to any color outputs which are not disabled
 * via color masking.
 */
static bool
has_writeable_rt(const struct crocus_blend_state *cso_blend,
                 const struct shader_info *fs_info)
{
   if (!fs_info)
      return false;

   unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;

   /* Writing gl_FragColor counts as writing every render target. */
   if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
      rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;

   return cso_blend->color_write_enables & rt_outputs;
}

/**
 * Gallium CSO for depth, stencil, and alpha testing state.
 */
struct crocus_depth_stencil_alpha_state {
   struct pipe_depth_stencil_alpha_state cso;

   /* Cached derived flags used by resolve tracking and the PMA fix. */
   bool depth_writes_enabled;
   bool stencil_writes_enabled;
};

/**
 * The pipe->create_depth_stencil_alpha_state() driver hook.
 *
 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
 * testing state since we need pieces of it in a variety of places.
 */
static void *
crocus_create_zsa_state(struct pipe_context *ctx,
                        const struct pipe_depth_stencil_alpha_state *state)
{
   struct crocus_depth_stencil_alpha_state *cso =
      malloc(sizeof(struct crocus_depth_stencil_alpha_state));

   bool two_sided_stencil = state->stencil[1].enabled;
   cso->cso = *state;

   /* Cache derived write flags; the back face only counts when two-sided
    * stencil is enabled. */
   cso->depth_writes_enabled = state->depth_writemask;
   cso->stencil_writes_enabled =
      state->stencil[0].writemask != 0 ||
      (two_sided_stencil && state->stencil[1].writemask != 0);

   /* The state tracker needs to optimize away EQUAL writes for us. */
   assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));

   return cso;
}

/**
 * The pipe->bind_depth_stencil_alpha_state() driver hook.
 *
 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
 */
static void
crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
   struct crocus_depth_stencil_alpha_state *new_cso = state;

   if (new_cso) {
      /* cso_changed() compares old_cso vs. new_cso fields to flag only the
       * packets that actually need re-emission. */
      if (cso_changed(cso.alpha_ref_value))
         ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;

      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;

      if (cso_changed(cso.alpha_func))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER == 8
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif

      if (cso_changed(depth_writes_enabled))
         ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

      ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
      ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;

#if GFX_VER <= 5
      ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
   }

   ice->state.cso_zsa = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
#endif
#if GFX_VER == 8
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
}

#if GFX_VER == 8
/**
 * Evaluate whether the Gfx8 depth PMA (pixel mask array) stall-avoidance
 * optimization should be enabled for the current bound state.
 */
static bool
want_pma_fix(struct crocus_context *ice)
{
   UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct crocus_blend_state *cso_blend = ice->state.cso_blend;

   /* In very specific combinations of state, we can instruct Gfx8-9 hardware
    * to avoid stalling at the pixel mask array.
 The state equations are
    * documented in these places:
    *
    *  - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    *  - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    * (Technically the stencil PMA treats ForceKillPix differently,
    *  but I think this is a documentation oversight, and we don't
    *  ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * Also, we never use the normal drawing path for HiZ ops; these are true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   if (!cso_fb->zsbuf)
      return false;

   struct crocus_resource *zres, *sres;
   crocus_get_depth_stencil_resources(devinfo,
                                      cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
                     cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->cso.depth_enabled)
      return false;

   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
          (killpixels && (cso_zsa->depth_writes_enabled ||
                          (sres && cso_zsa->stencil_writes_enabled)));
}
#endif
void
genX(crocus_update_pma_fix)(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            bool enable)
{
#if
GFX_VER == 8 1922 struct crocus_genx_state *genx = ice->state.genx; 1923 1924 if (genx->pma_fix_enabled == enable) 1925 return; 1926 1927 genx->pma_fix_enabled = enable; 1928 1929 /* According to the Broadwell PIPE_CONTROL documentation, software should 1930 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set 1931 * prior to the LRI. If stencil buffer writes are enabled, then a Render * Cache Flush is also necessary. 1932 * 1933 * The Gfx9 docs say to use a depth stall rather than a command streamer 1934 * stall. However, the hardware seems to violently disagree. A full 1935 * command streamer stall seems to be needed in both cases. 1936 */ 1937 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)", 1938 PIPE_CONTROL_CS_STALL | 1939 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 1940 PIPE_CONTROL_RENDER_TARGET_FLUSH); 1941 1942 crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) { 1943 reg.NPPMAFixEnable = enable; 1944 reg.NPEarlyZFailsDisable = enable; 1945 reg.NPPMAFixEnableMask = true; 1946 reg.NPEarlyZFailsDisableMask = true; 1947 } 1948 1949 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache 1950 * Flush bits is often necessary. We do it regardless because it's easier. 1951 * The render cache flush is also necessary if stencil writes are enabled. 1952 * 1953 * Again, the Gfx9 docs give a different set of flushes but the Broadwell 1954 * flushes seem to work just as well. 
1955 */ 1956 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)", 1957 PIPE_CONTROL_DEPTH_STALL | 1958 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 1959 PIPE_CONTROL_RENDER_TARGET_FLUSH); 1960#endif 1961} 1962 1963static float 1964get_line_width(const struct pipe_rasterizer_state *state) 1965{ 1966 float line_width = state->line_width; 1967 1968 /* From the OpenGL 4.4 spec: 1969 * 1970 * "The actual width of non-antialiased lines is determined by rounding 1971 * the supplied width to the nearest integer, then clamping it to the 1972 * implementation-dependent maximum non-antialiased line width." 1973 */ 1974 if (!state->multisample && !state->line_smooth) 1975 line_width = roundf(state->line_width); 1976 1977 if (!state->multisample && state->line_smooth && line_width < 1.5f) { 1978 /* For 1 pixel line thickness or less, the general anti-aliasing 1979 * algorithm gives up, and a garbage line is generated. Setting a 1980 * Line Width of 0.0 specifies the rasterization of the "thinnest" 1981 * (one-pixel-wide), non-antialiased lines. 1982 * 1983 * Lines rendered with zero Line Width are rasterized using the 1984 * "Grid Intersection Quantization" rules as specified by the 1985 * "Zero-Width (Cosmetic) Line Rasterization" section of the docs. 1986 */ 1987 /* hack around this for gfx4/5 fps counters in hud. */ 1988 line_width = GFX_VER < 6 ? 1.5f : 0.0f; 1989 } 1990 return line_width; 1991} 1992 1993/** 1994 * The pipe->create_rasterizer_state() driver hook. 
1995 */ 1996static void * 1997crocus_create_rasterizer_state(struct pipe_context *ctx, 1998 const struct pipe_rasterizer_state *state) 1999{ 2000 struct crocus_rasterizer_state *cso = 2001 malloc(sizeof(struct crocus_rasterizer_state)); 2002 2003 cso->fill_mode_point_or_line = 2004 state->fill_front == PIPE_POLYGON_MODE_LINE || 2005 state->fill_front == PIPE_POLYGON_MODE_POINT || 2006 state->fill_back == PIPE_POLYGON_MODE_LINE || 2007 state->fill_back == PIPE_POLYGON_MODE_POINT; 2008 2009 if (state->clip_plane_enable != 0) 2010 cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1; 2011 else 2012 cso->num_clip_plane_consts = 0; 2013 2014 cso->cso = *state; 2015 2016#if GFX_VER >= 6 2017 float line_width = get_line_width(state); 2018 2019 crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) { 2020 sf.StatisticsEnable = true; 2021 sf.AALineDistanceMode = AALINEDISTANCE_TRUE; 2022 sf.LineEndCapAntialiasingRegionWidth = 2023 state->line_smooth ? _10pixels : _05pixels; 2024 sf.LastPixelEnable = state->line_last_pixel; 2025#if GFX_VER <= 7 2026 sf.AntialiasingEnable = state->line_smooth; 2027#endif 2028#if GFX_VER == 8 2029 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; 2030 if (screen->devinfo.platform == INTEL_PLATFORM_CHV) 2031 sf.CHVLineWidth = line_width; 2032 else 2033 sf.LineWidth = line_width; 2034#else 2035 sf.LineWidth = line_width; 2036#endif 2037 sf.PointWidthSource = state->point_size_per_vertex ? 
Vertex : State; 2038 sf.PointWidth = state->point_size; 2039 2040 if (state->flatshade_first) { 2041 sf.TriangleFanProvokingVertexSelect = 1; 2042 } else { 2043 sf.TriangleStripListProvokingVertexSelect = 2; 2044 sf.TriangleFanProvokingVertexSelect = 2; 2045 sf.LineStripListProvokingVertexSelect = 1; 2046 } 2047 2048#if GFX_VER == 6 2049 sf.AttributeSwizzleEnable = true; 2050 if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) 2051 sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT; 2052 else 2053 sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT; 2054#endif 2055 2056#if GFX_VER <= 7 2057 sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way... 2058 2059#if GFX_VER >= 6 2060 sf.GlobalDepthOffsetEnableSolid = state->offset_tri; 2061 sf.GlobalDepthOffsetEnableWireframe = state->offset_line; 2062 sf.GlobalDepthOffsetEnablePoint = state->offset_point; 2063 sf.GlobalDepthOffsetConstant = state->offset_units * 2; 2064 sf.GlobalDepthOffsetScale = state->offset_scale; 2065 sf.GlobalDepthOffsetClamp = state->offset_clamp; 2066 2067 sf.FrontFaceFillMode = translate_fill_mode(state->fill_front); 2068 sf.BackFaceFillMode = translate_fill_mode(state->fill_back); 2069#endif 2070 2071 sf.CullMode = translate_cull_mode(state->cull_face); 2072 sf.ScissorRectangleEnable = true; 2073 2074#if GFX_VERx10 == 75 2075 sf.LineStippleEnable = state->line_stipple_enable; 2076#endif 2077#endif 2078 } 2079#endif 2080 2081#if GFX_VER == 8 2082 crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) { 2083 rr.FrontWinding = state->front_ccw ? 
CounterClockwise : Clockwise; 2084 rr.CullMode = translate_cull_mode(state->cull_face); 2085 rr.FrontFaceFillMode = translate_fill_mode(state->fill_front); 2086 rr.BackFaceFillMode = translate_fill_mode(state->fill_back); 2087 rr.DXMultisampleRasterizationEnable = state->multisample; 2088 rr.GlobalDepthOffsetEnableSolid = state->offset_tri; 2089 rr.GlobalDepthOffsetEnableWireframe = state->offset_line; 2090 rr.GlobalDepthOffsetEnablePoint = state->offset_point; 2091 rr.GlobalDepthOffsetConstant = state->offset_units * 2; 2092 rr.GlobalDepthOffsetScale = state->offset_scale; 2093 rr.GlobalDepthOffsetClamp = state->offset_clamp; 2094 rr.SmoothPointEnable = state->point_smooth; 2095 rr.AntialiasingEnable = state->line_smooth; 2096 rr.ScissorRectangleEnable = state->scissor; 2097 rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far); 2098 } 2099#endif 2100 2101#if GFX_VER >= 6 2102 crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) { 2103 /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from 2104 * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB. 2105 */ 2106#if GFX_VER >= 7 2107 cl.EarlyCullEnable = true; 2108#endif 2109 2110#if GFX_VER == 7 2111 cl.FrontWinding = state->front_ccw ? 1 : 0; 2112 cl.CullMode = translate_cull_mode(state->cull_face); 2113#endif 2114 cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable; 2115#if GFX_VER < 8 2116 cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far); 2117#endif 2118 cl.APIMode = state->clip_halfz ? 
APIMODE_D3D : APIMODE_OGL; 2119 cl.GuardbandClipTestEnable = true; 2120 cl.ClipEnable = true; 2121 cl.MinimumPointWidth = 0.125; 2122 cl.MaximumPointWidth = 255.875; 2123 2124#if GFX_VER == 8 2125 cl.ForceUserClipDistanceClipTestEnableBitmask = true; 2126#endif 2127 2128 if (state->flatshade_first) { 2129 cl.TriangleFanProvokingVertexSelect = 1; 2130 } else { 2131 cl.TriangleStripListProvokingVertexSelect = 2; 2132 cl.TriangleFanProvokingVertexSelect = 2; 2133 cl.LineStripListProvokingVertexSelect = 1; 2134 } 2135 } 2136#endif 2137 2138 /* Remap from 0..255 back to 1..256 */ 2139 const unsigned line_stipple_factor = state->line_stipple_factor + 1; 2140 2141 crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) { 2142 if (state->line_stipple_enable) { 2143 line.LineStipplePattern = state->line_stipple_pattern; 2144 line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor; 2145 line.LineStippleRepeatCount = line_stipple_factor; 2146 } 2147 } 2148 2149 return cso; 2150} 2151 2152/** 2153 * The pipe->bind_rasterizer_state() driver hook. 2154 * 2155 * Bind a rasterizer CSO and flag related dirty bits. 
 */
static void
crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
   struct crocus_rasterizer_state *new_cso = state;

   if (new_cso) {
      /* The cso_changed*() macros compare fields between old_cso and
       * new_cso so we only flag the state that actually differs.
       */
      /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
      if (cso_changed_memcmp(line_stipple))
         ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
#if GFX_VER >= 6
      if (cso_changed(cso.half_pixel_center))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
      if (cso_changed(cso.multisample))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#else
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
#endif

      if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
         ice->state.dirty |= CROCUS_DIRTY_WM;

#if GFX_VER >= 6
      if (cso_changed(cso.rasterizer_discard))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;

      if (cso_changed(cso.flatshade_first))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#endif

      if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
          cso_changed(cso.clip_halfz))
         ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;

#if GFX_VER >= 7
      if (cso_changed(cso.sprite_coord_enable) ||
          cso_changed(cso.sprite_coord_mode) ||
          cso_changed(cso.light_twoside))
         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
#endif
#if GFX_VER <= 5
      if (cso_changed(cso.clip_plane_enable))
         ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
   }

   /* Raster and clip state always need re-emitting on a bind change. */
   ice->state.cso_rast = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_RASTER;
   ice->state.dirty |= CROCUS_DIRTY_CLIP;
#if GFX_VER <= 5
   ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
   ice->state.dirty |= CROCUS_DIRTY_WM;
#endif
#if GFX_VER <= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif
   /* Shaders may compile differently based on rasterizer state (NOS). */
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
}

/**
 * Return true if the given wrap mode requires the border color to exist.
 *
 * (We can skip uploading it if the sampler isn't going to use it.)
 */
static bool
wrap_mode_needs_border_color(unsigned wrap_mode)
{
#if GFX_VER == 8
   return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
#else
   return wrap_mode == TCM_CLAMP_BORDER;
#endif
}

/**
 * Gallium CSO for sampler state.
 */
struct crocus_sampler_state {
   struct pipe_sampler_state pstate;    /* copy of the Gallium template */
   union pipe_color_union border_color; /* copy of pstate's border color */
   bool needs_border_color;             /* any wrap mode samples the border? */
   /* Hardware TCM_* address-control modes from translate_wrap(). */
   unsigned wrap_s;
   unsigned wrap_t;
   unsigned wrap_r;
   /* Possibly-overridden filter/LOD (see create_sampler_state). */
   unsigned mag_img_filter;
   float min_lod;
};

/**
 * The pipe->create_sampler_state() driver hook.
 *
 * We fill out SAMPLER_STATE (except for the border color pointer), and
 * store that on the CPU.  It doesn't make sense to upload it to a GPU
 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
 * all bound sampler states to be in contiguous memory.
2256 */ 2257static void * 2258crocus_create_sampler_state(struct pipe_context *ctx, 2259 const struct pipe_sampler_state *state) 2260{ 2261 struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state); 2262 2263 if (!cso) 2264 return NULL; 2265 2266 STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST); 2267 STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR); 2268 2269 bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST || 2270 state->mag_img_filter == PIPE_TEX_FILTER_NEAREST; 2271 cso->wrap_s = translate_wrap(state->wrap_s, either_nearest); 2272 cso->wrap_t = translate_wrap(state->wrap_t, either_nearest); 2273 cso->wrap_r = translate_wrap(state->wrap_r, either_nearest); 2274 2275 cso->pstate = *state; 2276 2277 memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color)); 2278 2279 cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) || 2280 wrap_mode_needs_border_color(cso->wrap_t) || 2281 wrap_mode_needs_border_color(cso->wrap_r); 2282 2283 cso->min_lod = state->min_lod; 2284 cso->mag_img_filter = state->mag_img_filter; 2285 2286 // XXX: explain this code ported from ilo...I don't get it at all... 2287 if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE && 2288 state->min_lod > 0.0f) { 2289 cso->min_lod = 0.0f; 2290 cso->mag_img_filter = state->min_img_filter; 2291 } 2292 2293 return cso; 2294} 2295 2296/** 2297 * The pipe->bind_sampler_states() driver hook. 
 */
static void
crocus_bind_sampler_states(struct pipe_context *ctx,
                           enum pipe_shader_type p_stage,
                           unsigned start, unsigned count,
                           void **states)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);

   bool dirty = false;

   /* Only flag dirty state if some sampler binding actually changed. */
   for (int i = 0; i < count; i++) {
      if (shs->samplers[start + i] != states[i]) {
         shs->samplers[start + i] = states[i];
         dirty = true;
      }
   }

   if (dirty) {
#if GFX_VER <= 5
      /* On gfx4/5 the sampler state feeds the fixed-function WM/VS units. */
      if (p_stage == PIPE_SHADER_FRAGMENT)
         ice->state.dirty |= CROCUS_DIRTY_WM;
      else if (p_stage == PIPE_SHADER_VERTEX)
         ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
#endif
      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
      /* Shaders may compile differently based on bound textures (NOS). */
      ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
   }
}

/* Wrap-mode overrides applied at upload time to work around hardware
 * quirks for particular texture targets (see crocus_upload_sampler_states).
 */
enum samp_workaround {
   SAMP_NORMAL,
   SAMP_CUBE_CLAMP,
   SAMP_CUBE_CUBE,
   SAMP_T_WRAP,
};

/**
 * Pack one hardware SAMPLER_STATE structure into \p map, applying any
 * requested wrap-mode workaround and the per-generation LOD clamps.
 *
 * \param border_color_offset  state-buffer offset of the border color
 *                             (only meaningful if needs_border_color)
 * \param first_level          base mip level (used for gfx6 BaseMipLevel)
 * \param map                  CPU pointer to the destination dwords
 */
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
                            struct crocus_sampler_state *cso,
                            uint32_t border_color_offset,
                            enum samp_workaround samp_workaround,
                            uint32_t first_level,
                            void *map)
{
   struct pipe_sampler_state *state = &cso->pstate;
   uint32_t wrap_s, wrap_t, wrap_r;

   wrap_s = cso->wrap_s;
   wrap_t = cso->wrap_t;
   wrap_r = cso->wrap_r;

   /* Apply the target-specific wrap-mode override, if any. */
   switch (samp_workaround) {
   case SAMP_CUBE_CLAMP:
      wrap_s = TCM_CLAMP;
      wrap_t = TCM_CLAMP;
      wrap_r = TCM_CLAMP;
      break;
   case SAMP_CUBE_CUBE:
      wrap_s = TCM_CUBE;
      wrap_t = TCM_CUBE;
      wrap_r = TCM_CUBE;
      break;
   case SAMP_T_WRAP:
      wrap_t = TCM_WRAP;
      break;
   default:
      break;
   }

   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
      samp.TCXAddressControlMode = wrap_s;
      samp.TCYAddressControlMode = wrap_t;
      samp.TCZAddressControlMode = wrap_r;

#if GFX_VER >= 6
      samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
#endif
      samp.MinModeFilter = state->min_img_filter;
      samp.MagModeFilter = cso->mag_img_filter;
      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
      samp.MaximumAnisotropy = RATIO21;

      /* Gallium encodes anisotropy as a max ratio (>= 2 means enabled);
       * the hardware field is (ratio - 2) / 2, capped at 16:1.
       */
      if (state->max_anisotropy >= 2) {
         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
            samp.AnisotropicAlgorithm = EWAApproximation;
#endif
         }

         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;

         samp.MaximumAnisotropy =
            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
      }

      /* Set address rounding bits if not using nearest filtering. */
      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMinFilterRoundingEnable = true;
         samp.VAddressMinFilterRoundingEnable = true;
         samp.RAddressMinFilterRoundingEnable = true;
      }

      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMagFilterRoundingEnable = true;
         samp.VAddressMagFilterRoundingEnable = true;
         samp.RAddressMagFilterRoundingEnable = true;
      }

      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
         samp.ShadowFunction = translate_shadow_func(state->compare_func);

      /* Hardware LOD clamp range differs by generation. */
      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;

#if GFX_VER == 8
      samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
      samp.LODPreClampEnable = true;
#endif
      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);

#if GFX_VER == 6
      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif

#if GFX_VER < 6
      /* Pre-gfx6 takes a relocation; gfx6+ takes a state-buffer offset. */
      samp.BorderColorPointer =
         ro_bo(batch->state.bo, border_color_offset);
#else
      samp.BorderColorPointer = border_color_offset;
#endif
   }
}

static void
crocus_upload_border_color(struct crocus_batch *batch,
                           struct crocus_sampler_state *cso,
                           struct crocus_sampler_view *tex,
                           uint32_t *bc_offset)
{
   /* We may need to swizzle the border color for format faking.
    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
    * This means we need to move the border color's A channel into
    * the R or G channels so that those read swizzles will move it
    * back into A.
 */
   enum pipe_format internal_format = PIPE_FORMAT_NONE;
   union pipe_color_union *color = &cso->border_color;
   union pipe_color_union tmp;
   if (tex) {
      internal_format = tex->res->internal_format;

      if (util_format_is_alpha(internal_format)) {
         /* A -> faked as R: move A into all slots read back as A. */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      } else if (util_format_is_luminance_alpha(internal_format) &&
                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
         /* LA -> faked as RG: replicate L, keep A. */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      }
   }
   bool is_integer_format = util_format_is_pure_integer(internal_format);
   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
   /* Alignment requirement varies: 64B on gfx8, 512B for HSW integer
    * border colors, 32B otherwise.
    */
   const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);

   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };

#define ASSIGN(dst, src) \
   do {                  \
      dst = src;         \
   } while (0)

#define ASSIGNu16(dst, src) \
   do {                     \
      dst = (uint16_t)src;  \
   } while (0)

#define ASSIGNu8(dst, src) \
   do {                    \
      dst = (uint8_t)src;  \
   } while (0)

/* Apply `macro` to all four channels of the named border-color field set. */
#define BORDER_COLOR_ATTR(macro, _color_type, src)            \
   macro(state.BorderColor ## _color_type ## Red, src[0]);    \
   macro(state.BorderColor ## _color_type ## Green, src[1]);  \
   macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
   macro(state.BorderColor ## _color_type ## Alpha, src[3]);

#if GFX_VER >= 8
   /* On Broadwell, the border color is represented as four 32-bit floats,
    * integers, or unsigned values, interpreted according to the surface
    * format.  This matches the sampler->BorderColor union exactly; just
    * memcpy the values.
    */
   BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
   if (is_integer_format) {
      const struct util_format_description *format_desc =
         util_format_description(internal_format);

      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
       * "If any color channel is missing from the surface format,
       * corresponding border color should be programmed as zero and if
       * alpha channel is missing, corresponding Alpha border color should
       * be programmed as 1."
       */
      unsigned c[4] = { 0, 0, 0, 1 };
      for (int i = 0; i < 4; i++) {
         if (format_desc->channel[i].size)
            c[i] = color->ui[i];
      }

      switch (format_desc->channel[0].size) {
      case 8:
         /* Copy RGBA in order. */
         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
         break;
      case 10:
         /* R10G10B10A2_UINT is treated like a 16-bit format. */
      case 16:
         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
         break;
      case 32:
         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
            /* Careful inspection of the tables reveals that for RG32 formats,
             * the green channel needs to go where blue normally belongs.
             */
            state.BorderColor32bitRed = c[0];
            state.BorderColor32bitBlue = c[1];
            state.BorderColor32bitAlpha = 1;
         } else {
            /* Copy RGBA in order. */
            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
         }
         break;
      default:
         assert(!"Invalid number of bits per channel in integer format.");
         break;
      }
   } else {
      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
   }
#elif GFX_VER == 5 || GFX_VER == 6
   /* Gfx5/6 want the border color pre-converted to every representation. */
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);

#define MESA_FLOAT_TO_HALF(dst, src) \
   dst = _mesa_float_to_half(src);

   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);

#undef MESA_FLOAT_TO_HALF

   state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8;
   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
   state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8;
   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;

   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);

#elif GFX_VER == 4
   BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif

#undef ASSIGN
#undef BORDER_COLOR_ATTR

   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}

/**
 * Upload the sampler states into a contiguous area of GPU memory,
 * for 3DSTATE_SAMPLER_STATE_POINTERS_*.
 *
 * Also fill out the border color state pointers.
 */
static void
crocus_upload_sampler_states(struct crocus_context *ice,
                             struct crocus_batch *batch, gl_shader_stage stage)
{
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   const struct shader_info *info = crocus_get_shader_info(ice, stage);

   /* We assume the state tracker will call pipe->bind_sampler_states()
    * if the program's number of textures changes.
    */
   unsigned count = info ?
      BITSET_LAST_BIT(info->textures_used) : 0;

   if (!count)
      return;

   /* Assemble the SAMPLER_STATEs into a contiguous table that lives
    * in the dynamic state memory zone, so we can point to it via the
    * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
    */
   unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
   uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);

   if (unlikely(!map))
      return;

   for (int i = 0; i < count; i++) {
      struct crocus_sampler_state *state = shs->samplers[i];
      struct crocus_sampler_view *tex = shs->textures[i];

      /* Unbound slots get a zeroed-out SAMPLER_STATE. */
      if (!state || !tex) {
         memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
      } else {
         unsigned border_color_offset = 0;
         if (state->needs_border_color) {
            crocus_upload_border_color(batch, state, tex, &border_color_offset);
         }

         enum samp_workaround wa = SAMP_NORMAL;
         /* There's a bug in 1D texture sampling - it actually pays
          * attention to the wrap_t value, though it should not.
          * Override the wrap_t value here to GL_REPEAT to keep
          * any nonexistent border pixels from floating in.
          */
         if (tex->base.target == PIPE_TEXTURE_1D)
            wa = SAMP_T_WRAP;
         else if (tex->base.target == PIPE_TEXTURE_CUBE ||
                  tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
            /* Cube maps must use the same wrap mode for all three coordinate
             * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
             *
             * Ivybridge and Baytrail seem to have problems with CUBE mode and
             * integer formats.  Fall back to CLAMP for now.
             */
            if (state->pstate.seamless_cube_map &&
                !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
               wa = SAMP_CUBE_CUBE;
            else
               wa = SAMP_CUBE_CLAMP;
         }

         uint32_t first_level = 0;
         if (tex->base.target != PIPE_BUFFER)
            first_level = tex->base.u.tex.first_level;

         crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
      }

      /* Advance to the next SAMPLER_STATE slot in the table. */
      map += GENX(SAMPLER_STATE_length);
   }
}

/**
 * The pipe->create_sampler_view() driver hook.
 */
static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context *ctx,
                           struct pipe_resource *tex,
                           const struct pipe_sampler_view *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));

   if (!isv)
      return NULL;

   /* initialize base object */
   isv->base = *tmpl;
   isv->base.context = ctx;
   isv->base.texture = NULL;
   pipe_reference_init(&isv->base.reference, 1);
   pipe_resource_reference(&isv->base.texture, tex);

   /* For depth/stencil formats, point the view at the underlying depth
    * or stencil resource it will actually sample from.
    */
   if (util_format_is_depth_or_stencil(tmpl->format)) {
      struct crocus_resource *zres, *sres;
      const struct util_format_description *desc =
         util_format_description(tmpl->format);

      crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);

      tex = util_format_has_depth(desc) ?
         &zres->base.b : &sres->base.b;

      /* NOTE(review): on gfx7 stencil sampling appears to go through the
       * resource's shadow copy — confirm against crocus_resource.
       */
      if (tex->format == PIPE_FORMAT_S8_UINT)
         if (GFX_VER == 7 && sres->shadow)
            tex = &sres->shadow->base.b;
   }

   isv->res = (struct crocus_resource *) tex;

   isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;

   if (isv->base.target == PIPE_TEXTURE_CUBE ||
       isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
      usage |= ISL_SURF_USAGE_CUBE_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   /* Combine the view's swizzle with the format's fake-format swizzle. */
   enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
   crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);

   /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
   if (GFX_VER < 6 &&
       (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
        tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
      isv->swizzle[0] = tmpl->swizzle_g;
      isv->swizzle[1] = tmpl->swizzle_g;
      isv->swizzle[2] = tmpl->swizzle_g;
      isv->swizzle[3] = tmpl->swizzle_g;
   }

   isv->clear_color = isv->res->aux.clear_color;

   isv->view = (struct isl_view) {
      .format = fmt.fmt,
#if GFX_VERx10 >= 75
      .swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
      },
#else
      /* swizzling handled in shader code */
      .swizzle = ISL_SWIZZLE_IDENTITY,
#endif
      .usage = usage,
   };

   /* Fill out SURFACE_STATE for this view. */
   if (tmpl->target != PIPE_BUFFER) {
      isv->view.base_level = tmpl->u.tex.first_level;
      isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;

      /* Hardware older than skylake ignores this value */
      assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);

      // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
      isv->view.base_array_layer = tmpl->u.tex.first_layer;
      isv->view.array_len =
         tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
   }
#if GFX_VER >= 6
   /* just create a second view struct for texture gather just in case */
   isv->gather_view = isv->view;

#if GFX_VER == 7
   /* Gfx7 RG32 gather needs the special _LD format variant. */
   if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
       fmt.fmt == ISL_FORMAT_R32G32_SINT ||
       fmt.fmt == ISL_FORMAT_R32G32_UINT) {
      isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
#if GFX_VERx10 >= 75
      isv->gather_view.swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
      };
#endif
   }
#endif
#if GFX_VER == 6
   /* Sandybridge's gather4 message is broken for integer formats.
    * To work around this, we pretend the surface is UNORM for
    * 8 or 16-bit formats, and emit shader instructions to recover
    * the real INT/UINT value.  For 32-bit formats, we pretend
    * the surface is FLOAT, and simply reinterpret the resulting
    * bits.
    */
   switch (fmt.fmt) {
   case ISL_FORMAT_R8_SINT:
   case ISL_FORMAT_R8_UINT:
      isv->gather_view.format = ISL_FORMAT_R8_UNORM;
      break;

   case ISL_FORMAT_R16_SINT:
   case ISL_FORMAT_R16_UINT:
      isv->gather_view.format = ISL_FORMAT_R16_UNORM;
      break;

   case ISL_FORMAT_R32_SINT:
   case ISL_FORMAT_R32_UINT:
      isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
      break;

   default:
      break;
   }
#endif
#endif
   /* Fill out SURFACE_STATE for this view. */
   if (tmpl->target != PIPE_BUFFER) {
      /* Finish any pending aux import before the resource is sampled. */
      if (crocus_resource_unfinished_aux_import(isv->res))
         crocus_resource_finish_aux_import(&screen->base, isv->res);

   }

   return &isv->base;
}

/* Release the view's reference on its texture and free the view. */
static void
crocus_sampler_view_destroy(struct pipe_context *ctx,
                            struct pipe_sampler_view *state)
{
   struct crocus_sampler_view *isv = (void *) state;
   pipe_resource_reference(&state->texture, NULL);
   free(isv);
}

/**
 * The pipe->create_surface() driver hook.
 *
 * In Gallium nomenclature, "surfaces" are a view of a resource that
 * can be bound as a render target or depth/stencil buffer.
 */
static struct pipe_surface *
crocus_create_surface(struct pipe_context *ctx,
                      struct pipe_resource *tex,
                      const struct pipe_surface *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;

   /* Pick the ISL usage from the template: storage > depth/stencil > RT. */
   isl_surf_usage_flags_t usage = 0;
   if (tmpl->writable)
      usage = ISL_SURF_USAGE_STORAGE_BIT;
   else if (util_format_is_depth_or_stencil(tmpl->format))
      usage = ISL_SURF_USAGE_DEPTH_BIT;
   else
      usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
       !isl_format_supports_rendering(devinfo, fmt.fmt)) {
      /* Framebuffer validation will reject this invalid case, but it
       * hasn't had the opportunity yet.  In the meantime, we need to
       * avoid hitting ISL asserts about unsupported formats below.
       */
      return NULL;
   }

   struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
   /* Note: &surf->base is pure address arithmetic; surf is NULL-checked
    * below before any member is actually written.
    */
   struct pipe_surface *psurf = &surf->base;
   struct crocus_resource *res = (struct crocus_resource *) tex;

   if (!surf)
      return NULL;

   pipe_reference_init(&psurf->reference, 1);
   pipe_resource_reference(&psurf->texture, tex);
   psurf->context = ctx;
   psurf->format = tmpl->format;
   psurf->width = tex->width0;
   psurf->height = tex->height0;
   psurf->texture = tex;
   psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
   psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
   psurf->u.tex.level = tmpl->u.tex.level;

   uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;

   struct isl_view *view = &surf->view;
   *view = (struct isl_view) {
      .format = fmt.fmt,
      .base_level = tmpl->u.tex.level,
      .levels = 1,
      .base_array_layer = tmpl->u.tex.first_layer,
      .array_len = array_len,
      .swizzle = ISL_SWIZZLE_IDENTITY,
      .usage = usage,
   };

#if GFX_VER >= 6
   /* Secondary view with TEXTURE usage, for sampling from the render
    * target (e.g. framebuffer fetch / blit paths).
    */
   struct isl_view *read_view = &surf->read_view;
   *read_view = (struct isl_view) {
      .format = fmt.fmt,
      .base_level = tmpl->u.tex.level,
      .levels = 1,
      .base_array_layer = tmpl->u.tex.first_layer,
      .array_len = array_len,
      .swizzle = ISL_SWIZZLE_IDENTITY,
      .usage = ISL_SURF_USAGE_TEXTURE_BIT,
   };
#endif

   surf->clear_color = res->aux.clear_color;

   /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
   if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
                          ISL_SURF_USAGE_STENCIL_BIT))
      return psurf;

   if (!isl_format_is_compressed(res->surf.format)) {
      if (crocus_resource_unfinished_aux_import(res))
         crocus_resource_finish_aux_import(&screen->base, res);

      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
      uint64_t temp_offset;
      uint32_t temp_x, temp_y;

      /* Compute the intra-tile offset of the selected level/layer so we can
       * detect render targets that don't start on a tile boundary.
       */
      isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
                                          res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
                                          res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
                                          &temp_offset, &temp_x, &temp_y);
      if (!devinfo->has_surface_tile_offset &&
          (temp_x || temp_y)) {
         /* Original gfx4 hardware couldn't draw to a non-tile-aligned
          * destination.
          */
         /* move to temp */
         struct pipe_resource wa_templ = (struct pipe_resource) {
            .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
            .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
            .depth0 = 1,
            .array_size = 1,
            .format = res->base.b.format,
            .target = PIPE_TEXTURE_2D,
            .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
         };
         /* NOTE(review): resource_create may fail; align_res is dereferenced
          * below without a NULL check — confirm callers can't hit OOM here.
          */
         surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
         view->base_level = 0;
         view->base_array_layer = 0;
         view->array_len = 1;
         struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
         memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
      }
      return psurf;
   }

   /* The resource has a compressed format, which is not renderable, but we
    * have a renderable view format.  We must be attempting to upload blocks
    * of compressed data via an uncompressed view.
    *
    * In this case, we can assume there are no auxiliary buffers, a single
    * miplevel, and that the resource is single-sampled.  Gallium may try
    * and create an uncompressed view with multiple layers, however.
    */
   assert(!isl_format_is_compressed(fmt.fmt));
   assert(res->surf.samples == 1);
   assert(view->levels == 1);

   /* TODO: compressed pbo uploads aren't working here.
    * Everything after this return is intentionally dead code until that
    * path is fixed — kept so the implementation isn't lost.
    */
   return NULL;

   uint64_t offset_B = 0;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (view->base_level > 0) {
      /* We can't rely on the hardware's miplevel selection with such
       * a substantial lie about the format, so we select a single image
       * using the Tile X/Y Offset fields.  In this case, we can't handle
       * multiple array slices.
       *
       * On Broadwell, HALIGN and VALIGN are specified in pixels and are
       * hard-coded to align to exactly the block size of the compressed
       * texture.  This means that, when reinterpreted as a non-compressed
       * texture, the tile offsets may be anything and we can't rely on
       * X/Y Offset.
       *
       * Return NULL to force the state tracker to take fallback paths.
       */
      // TODO: check if the gen7 check is right, originally gen8
      if (view->array_len > 1 || GFX_VER == 7)
         return NULL;

      const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
      isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
                              view->base_level,
                              is_3d ? 0 : view->base_array_layer,
                              is_3d ? view->base_array_layer : 0,
                              &surf->surf,
                              &offset_B, &tile_x_sa, &tile_y_sa);

      /* We use address and tile offsets to access a single level/layer
       * as a subimage, so reset level/layer so it doesn't offset again.
       */
      view->base_array_layer = 0;
      view->base_level = 0;
   } else {
      /* Level 0 doesn't require tile offsets, and the hardware can find
       * array slices using QPitch even with the format override, so we
       * can allow layers in this case.  Copy the original ISL surface.
       */
      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
   }

   /* Scale down the image dimensions by the block size. */
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(res->surf.format);
   surf->surf.format = fmt.fmt;
   surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
   surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
   tile_x_sa /= fmtl->bw;
   tile_y_sa /= fmtl->bh;

   psurf->width = surf->surf.logical_level0_px.width;
   psurf->height = surf->surf.logical_level0_px.height;

   return psurf;
}

#if GFX_VER >= 7
/**
 * Initialize a brw_image_param to safe defaults (all zeros, with the
 * swizzle shifts disabled).
 */
static void
fill_default_image_param(struct brw_image_param *param)
{
   memset(param, 0, sizeof(*param));
   /* Set the swizzling shifts to all-ones to effectively disable swizzling --
    * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
    * detailed explanation of these parameters.
    */
   param->swizzling[0] = 0xff;
   param->swizzling[1] = 0xff;
}

/**
 * Fill out a brw_image_param for a buffer image: size in elements and a
 * stride of one element (cpp bytes).
 */
static void
fill_buffer_image_param(struct brw_image_param *param,
                        enum pipe_format pfmt,
                        unsigned size)
{
   const unsigned cpp = util_format_get_blocksize(pfmt);

   fill_default_image_param(param);
   param->size[0] = size / cpp;
   param->stride[0] = cpp;
}

#endif

/**
 * The pipe->set_shader_images() driver hook.
 */
static void
crocus_set_shader_images(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start_slot, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         const struct pipe_image_view *p_images)
{
#if GFX_VER >= 7
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_genx_state *genx = ice->state.genx;
   struct brw_image_param *image_params = genx->shaders[stage].image_param;

   /* Clear the bound bit for every slot we're about to (re)write. */
   shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);

   for (unsigned i = 0; i < count; i++) {
      struct crocus_image_view *iv = &shs->image[start_slot + i];

      if (p_images && p_images[i].resource) {
         const struct pipe_image_view *img = &p_images[i];
         struct crocus_resource *res = (void *) img->resource;

         util_copy_image_view(&iv->base, img);

         shs->bound_image_views |= 1 << (start_slot + i);

         res->bind_history |= PIPE_BIND_SHADER_IMAGE;
         res->bind_stages |= 1 << stage;

         isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
         struct crocus_format_info fmt =
            crocus_format_for_usage(devinfo, img->format, usage);

         struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
         if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
            /* On Gen8, try to use typed surfaces reads (which support a
             * limited number of formats), and if not possible, fall back
             * to untyped reads.
             *
             * NOTE(review): comment says Gen8 but this file builds for
             * GFX_VER >= 7 — presumably inherited from iris; confirm.
             */
            if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
               fmt.fmt = ISL_FORMAT_RAW;
            else
               fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
         }

         if (res->base.b.target != PIPE_BUFFER) {
            struct isl_view view = {
               .format = fmt.fmt,
               .base_level = img->u.tex.level,
               .levels = 1,
               .base_array_layer = img->u.tex.first_layer,
               .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
               .swizzle = swiz,
               .usage = usage,
            };

            iv->view = view;

            isl_surf_fill_image_param(&screen->isl_dev,
                                      &image_params[start_slot + i],
                                      &res->surf, &view);
         } else {
            /* Buffer image: no mip/layer selection, just format+swizzle. */
            struct isl_view view = {
               .format = fmt.fmt,
               .swizzle = swiz,
               .usage = usage,
            };
            iv->view = view;

            util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
                           img->u.buf.offset + img->u.buf.size);
            fill_buffer_image_param(&image_params[start_slot + i],
                                    img->format, img->u.buf.size);
         }
      } else {
         /* Unbind: drop the reference and reset the image params. */
         pipe_resource_reference(&iv->base.resource, NULL);
         fill_default_image_param(&image_params[start_slot + i]);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   /* Broadwell also needs brw_image_params re-uploaded */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
   shs->sysvals_need_upload = true;
#endif
}


/**
 * The pipe->set_sampler_views() driver hook.
 */
static void
crocus_set_sampler_views(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         bool take_ownership,
                         struct pipe_sampler_view **views)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   /* Clear the bound bit for every slot being rewritten; re-set below
    * for slots that end up with a non-NULL view.
    */
   shs->bound_sampler_views &= ~u_bit_consecutive(start, count);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_sampler_view *pview = views ? views[i] : NULL;

      if (take_ownership) {
         /* Caller transfers its reference: release ours, then adopt the
          * incoming pointer without taking an extra reference.
          */
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[start + i], NULL);
         shs->textures[start + i] = (struct crocus_sampler_view *)pview;
      } else {
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[start + i], pview);
      }

      struct crocus_sampler_view *view = (void *) pview;
      if (view) {
         view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
         view->res->bind_stages |= 1 << stage;

         shs->bound_sampler_views |= 1 << (start + i);
      }
   }
#if GFX_VER == 6
   /* first level parameters to crocus_upload_sampler_state is gfx6 only */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
#endif
   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
}

/**
 * The pipe->set_tess_state() driver hook.
 */
static void
crocus_set_tess_state(struct pipe_context *ctx,
                      const float default_outer_level[4],
                      const float default_inner_level[2])
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];

   /* Stash the default tess levels; they're fed to shaders as system
    * values (see upload_sysvals), hence the TCS constants re-upload.
    */
   memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
   memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
   shs->sysvals_need_upload = true;
}

/**
 * The pipe->set_patch_vertices() driver hook: records the number of
 * vertices per input patch for tessellation.
 */
static void
crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   ice->state.patch_vertices = patch_vertices;
}

/**
 * The pipe->surface_destroy() driver hook: releases the texture reference
 * and the gfx4 tile-alignment workaround resource, if any.
 */
static void
crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
{
   struct crocus_surface *surf = (void *) p_surf;
   pipe_resource_reference(&p_surf->texture, NULL);

   pipe_resource_reference(&surf->align_res, NULL);
   free(surf);
}

/**
 * The pipe->set_clip_state() driver hook: stores user clip planes, which
 * reach shaders as system values, so VS/GS/TES constants are re-uploaded.
 */
static void
crocus_set_clip_state(struct pipe_context *ctx,
                      const struct pipe_clip_state *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
   struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
   struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];

   memcpy(&ice->state.clip_planes, state, sizeof(*state));

#if GFX_VER <= 5
   /* On gen4/5 constants live in the CURBE, not push buffers. */
   ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
                             CROCUS_STAGE_DIRTY_CONSTANTS_TES;
   shs->sysvals_need_upload = true;
   gshs->sysvals_need_upload = true;
   tshs->sysvals_need_upload = true;
}
/**
 * The pipe->set_polygon_stipple() driver hook.
 */
static void
crocus_set_polygon_stipple(struct pipe_context *ctx,
                           const struct pipe_poly_stipple *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   memcpy(&ice->state.poly_stipple, state, sizeof(*state));
   ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
}

/**
 * The pipe->set_sample_mask() driver hook.
 */
static void
crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   /* We support at most 8x MSAA here, so we keep 8 bits of sample mask
    * (the & 0xff below).  st/mesa may pass us 0xffffffff though, meaning
    * "enable all samples".
    */
   ice->state.sample_mask = sample_mask & 0xff;
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
}

/**
 * Compute the effective scissor rectangle for viewport @idx: the viewport
 * extent clamped to the framebuffer, intersected with the API scissor when
 * the rasterizer has scissoring enabled.  Maximums are inclusive.
 */
static void
crocus_fill_scissor_rect(struct crocus_context *ice,
                         int idx,
                         struct pipe_scissor_state *ss)
{
   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
   const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
   /* Derive the viewport footprint from scale/translate (center ± extent). */
   struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
      .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
      .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
      .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
      .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
   };
   if (cso_state->scissor) {
      struct pipe_scissor_state *s = &ice->state.scissors[idx];
      scissor.minx = MAX2(scissor.minx, s->minx);
      scissor.miny = MAX2(scissor.miny, s->miny);
      scissor.maxx = MIN2(scissor.maxx, s->maxx);
      scissor.maxy = MIN2(scissor.maxy, s->maxy);
   }
   *ss = scissor;
}

/**
 * The pipe->set_scissor_states() driver hook.
3301 * 3302 * This corresponds to our SCISSOR_RECT state structures. It's an 3303 * exact match, so we just store them, and memcpy them out later. 3304 */ 3305static void 3306crocus_set_scissor_states(struct pipe_context *ctx, 3307 unsigned start_slot, 3308 unsigned num_scissors, 3309 const struct pipe_scissor_state *rects) 3310{ 3311 struct crocus_context *ice = (struct crocus_context *) ctx; 3312 3313 for (unsigned i = 0; i < num_scissors; i++) { 3314 if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) { 3315 /* If the scissor was out of bounds and got clamped to 0 width/height 3316 * at the bounds, the subtraction of 1 from maximums could produce a 3317 * negative number and thus not clip anything. Instead, just provide 3318 * a min > max scissor inside the bounds, which produces the expected 3319 * no rendering. 3320 */ 3321 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) { 3322 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0, 3323 }; 3324 } else { 3325 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) { 3326 .minx = rects[i].minx, .miny = rects[i].miny, 3327 .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1, 3328 }; 3329 } 3330 } 3331 3332#if GFX_VER < 6 3333 ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */ 3334#else 3335 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT; 3336#endif 3337 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT; 3338 3339} 3340 3341/** 3342 * The pipe->set_stencil_ref() driver hook. 3343 * 3344 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time. 
 */
static void
crocus_set_stencil_ref(struct pipe_context *ctx,
                       const struct pipe_stencil_ref ref)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   ice->state.stencil_ref = ref;
   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
}

#if GFX_VER == 8
/**
 * Return one edge of the viewport along @axis: translate ± |extent|,
 * with the sign of @sign selecting which edge.
 *
 * NOTE(review): GFX_VER == 8 looks like it is never built for crocus
 * (gen4-7.5) — presumably carried over from iris; confirm.
 */
static float
viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
{
   return copysignf(state->scale[axis], sign) + state->translate[axis];
}
#endif

/**
 * The pipe->set_viewport_states() driver hook.
 *
 * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
 * the guardband yet, as we need the framebuffer dimensions, but we can
 * at least fill out the rest.
 */
static void
crocus_set_viewport_states(struct pipe_context *ctx,
                           unsigned start_slot,
                           unsigned count,
                           const struct pipe_viewport_state *states)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);

   ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
   ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VER >= 6
   /* Scissors are derived from the viewport extents; see
    * crocus_fill_scissor_rect().
    */
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif

   /* With depth clipping disabled, CC_VIEWPORT supplies the depth clamp
    * range, which depends on the viewport.
    */
   if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
                               !ice->state.cso_rast->cso.depth_clip_far))
      ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
}

/**
 * The pipe->set_framebuffer_state() driver hook.
 *
 * Sets the current draw FBO, including color render targets, depth,
 * and stencil buffers.
 */
static void
crocus_set_framebuffer_state(struct pipe_context *ctx,
                             const struct pipe_framebuffer_state *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
#if 0
   struct isl_device *isl_dev = &screen->isl_dev;
   struct crocus_resource *zres;
   struct crocus_resource *stencil_res;
#endif

   unsigned samples = util_framebuffer_get_num_samples(state);
   unsigned layers = util_framebuffer_get_num_layers(state);

   /* All comparisons below are old (cso) vs. new (state/derived) values;
    * cso isn't overwritten until util_copy_framebuffer_state() at the end.
    */
#if GFX_VER >= 6
   if (cso->samples != samples) {
      ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VERx10 == 75
      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
   }
#endif

#if GFX_VER >= 6 && GFX_VER < 8
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif

   /* Layered rendering toggling affects CLIP programming. */
   if ((cso->layers == 0) != (layers == 0)) {
      ice->state.dirty |= CROCUS_DIRTY_CLIP;
   }

   if (cso->width != state->width || cso->height != state->height) {
      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
      ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
#if GFX_VER >= 6
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
   }

   if (cso->zsbuf || state->zsbuf) {
      ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;

      /* update SF's depth buffer format */
      if (GFX_VER == 7 && cso->zsbuf)
         ice->state.dirty |= CROCUS_DIRTY_RASTER;
   }

   /* wm thread dispatch enable */
   ice->state.dirty |= CROCUS_DIRTY_WM;
   util_copy_framebuffer_state(cso, state);
   cso->samples = samples;
   cso->layers = layers;

   /* Cache the HiZ aux usage of the new depth buffer (if any). */
   if (cso->zsbuf) {
      struct crocus_resource *zres;
      struct crocus_resource *stencil_res;
      enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
      crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
                                         &stencil_res);
      if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
         aux_usage = zres->aux.usage;
      }
      ice->state.hiz_usage = aux_usage;
   }

   /* Render target change */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;

   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
}

/**
 * The pipe->set_constant_buffer() driver hook.
 *
 * This uploads any constant data in user buffers, and references
 * any UBO resources containing constant data.
 */
static void
crocus_set_constant_buffer(struct pipe_context *ctx,
                           enum pipe_shader_type p_stage, unsigned index,
                           bool take_ownership,
                           const struct pipe_constant_buffer *input)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct pipe_constant_buffer *cbuf = &shs->constbufs[index];

   util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);

   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
      shs->bound_cbufs |= 1u << index;

      if (input->user_buffer) {
         /* User-memory constants: copy them into a GPU-visible upload
          * buffer; cbuf->buffer then points at the uploaded copy.
          */
         void *map = NULL;
         pipe_resource_reference(&cbuf->buffer, NULL);
         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

         if (!cbuf->buffer) {
            /* Allocation was unsuccessful - just unbind */
            crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
            return;
         }

         assert(map);
         memcpy(map, input->user_buffer, input->buffer_size);
      }
      /* Clamp the bound size to what actually fits in the BO. */
      cbuf->buffer_size =
         MIN2(input->buffer_size,
              crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);

      struct crocus_resource *res = (void *) cbuf->buffer;
      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
      res->bind_stages |= 1 << stage;
   } else {
      shs->bound_cbufs &= ~(1u << index);
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
}

/**
 * Upload the shader's system values (clip planes, default tess levels,
 * image params, workgroup size, ...) into its last constant buffer slot.
 */
static void
upload_sysvals(struct crocus_context *ice,
               gl_shader_stage stage)
{
   UNUSED struct crocus_genx_state *genx = ice->state.genx;
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   if (!shader || shader->num_system_values == 0)
      return;

   assert(shader->num_cbufs > 0);

   /* By convention the sysvals occupy the shader's last cbuf binding. */
   unsigned sysval_cbuf_index = shader->num_cbufs - 1;
   struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
   unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
   uint32_t *map = NULL;

   assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
   u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
                  &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

   for (int i = 0; i < shader->num_system_values; i++) {
      uint32_t sysval = shader->system_values[i];
      uint32_t value = 0;

      if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
#if GFX_VER >= 7
         /* Image sysvals index dword @offset of the stage's image_param. */
         unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
         unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
         struct brw_image_param *param =
            &genx->shaders[stage].image_param[img];

         assert(offset < sizeof(struct brw_image_param));
         value = ((uint32_t *) param)[offset];
#endif
      } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
         value = 0;
      } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
         int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
         int comp = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
         value = fui(ice->state.clip_planes.ucp[plane][comp]);
      } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
         /* NOTE(review): this reads ice->state.vertices_per_patch while
          * crocus_set_patch_vertices() writes ice->state.patch_vertices —
          * confirm both fields exist and which one is authoritative.
          */
         if (stage == MESA_SHADER_TESS_CTRL) {
            value = ice->state.vertices_per_patch;
         } else {
            assert(stage == MESA_SHADER_TESS_EVAL);
            const struct shader_info *tcs_info =
               crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
            if (tcs_info)
               value = tcs_info->tess.tcs_vertices_out;
            else
               value = ice->state.vertices_per_patch;
         }
      } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
                 sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
         unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
         value = fui(ice->state.default_outer_level[i]);
      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
         value = fui(ice->state.default_inner_level[0]);
      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
         value = fui(ice->state.default_inner_level[1]);
      } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
                 sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
         unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
         value = ice->state.last_block[i];
      } else {
         assert(!"unhandled system value");
      }

      *map++ = value;
   }

   cbuf->buffer_size = upload_size;
   shs->sysvals_need_upload = false;
}

/**
 * The pipe->set_shader_buffers() driver hook.
 *
 * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
 * SURFACE_STATE here, as the buffer offset may change each time.
 */
static void
crocus_set_shader_buffers(struct pipe_context *ctx,
                          enum pipe_shader_type p_stage,
                          unsigned start_slot, unsigned count,
                          const struct pipe_shader_buffer *buffers,
                          unsigned writable_bitmask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   unsigned modified_bits = u_bit_consecutive(start_slot, count);

   /* Reset bound/writable bits for the modified slots; writable bits come
    * from the caller's bitmask (which is relative to start_slot).
    */
   shs->bound_ssbos &= ~modified_bits;
   shs->writable_ssbos &= ~modified_bits;
   shs->writable_ssbos |= writable_bitmask << start_slot;

   for (unsigned i = 0; i < count; i++) {
      if (buffers && buffers[i].buffer) {
         struct crocus_resource *res = (void *) buffers[i].buffer;
         struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
         pipe_resource_reference(&ssbo->buffer, &res->base.b);
         ssbo->buffer_offset = buffers[i].buffer_offset;
         /* Clamp the bound size to what actually fits in the BO. */
         ssbo->buffer_size =
            MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);

         shs->bound_ssbos |= 1 << (start_slot + i);

         res->bind_history |= PIPE_BIND_SHADER_BUFFER;
         res->bind_stages |= 1 << stage;

         /* Mark the bound range as possibly written by the GPU. */
         util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
                        ssbo->buffer_offset + ssbo->buffer_size);
      } else {
         pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
}

/**
 * Generic CSO destroy hook for state objects that are a single allocation.
 */
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
   free(state);
}

/**
 * The pipe->set_vertex_buffers() driver hook.
 *
 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
 */
static void
crocus_set_vertex_buffers(struct pipe_context *ctx,
                          unsigned start_slot, unsigned count,
                          unsigned unbind_num_trailing_slots,
                          bool take_ownership,
                          const struct pipe_vertex_buffer *buffers)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
   /* Pre-Haswell (except Baytrail) needs the VB end address padded by two
    * bytes; the expression evaluates to 2 or 0.
    */
   const unsigned padding =
      (GFX_VERx10 < 75 && screen->devinfo.platform != INTEL_PLATFORM_BYT) * 2;
   ice->state.bound_vertex_buffers &=
      ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);

   util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
                                buffers, start_slot, count, unbind_num_trailing_slots,
                                take_ownership);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_vertex_buffer *state =
         &ice->state.vertex_buffers[start_slot + i];

      if (!state->is_user_buffer && state->buffer.resource) {
         struct crocus_resource *res = (void *)state->buffer.resource;
         res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
      }

      /* Cache each buffer's end address (plus hardware padding) for the
       * 3DSTATE_VERTEX_BUFFERS packet.
       */
      uint32_t end = 0;
      if (state->buffer.resource)
         end = state->buffer.resource->width0 + padding;
      ice->state.vb_end[start_slot + i] = end;
   }
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
}

#if GFX_VERx10 < 75
/**
 * Pre-Haswell can't fetch 10_10_10_2 vertex formats natively; return the
 * shader workaround flags (sign-extend, normalize/scale, BGRA swizzle)
 * needed to fix up the raw fetched values for @format.
 */
static uint8_t get_wa_flags(enum isl_format format)
{
   uint8_t wa_flags = 0;

   switch (format) {
   case ISL_FORMAT_R10G10B10A2_USCALED:
      wa_flags = BRW_ATTRIB_WA_SCALE;
      break;
   case ISL_FORMAT_R10G10B10A2_SSCALED:
      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
      break;
   case ISL_FORMAT_R10G10B10A2_UNORM:
      wa_flags = BRW_ATTRIB_WA_NORMALIZE;
      break;
   case ISL_FORMAT_R10G10B10A2_SNORM:
      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
      break;
   case ISL_FORMAT_R10G10B10A2_SINT:
      wa_flags = BRW_ATTRIB_WA_SIGN;
      break;
   case ISL_FORMAT_B10G10R10A2_USCALED:
      wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
      break;
   case ISL_FORMAT_B10G10R10A2_SSCALED:
      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
      break;
   case ISL_FORMAT_B10G10R10A2_UNORM:
      wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
      break;
   case ISL_FORMAT_B10G10R10A2_SNORM:
      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
      break;
   case ISL_FORMAT_B10G10R10A2_SINT:
      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
      break;
   case ISL_FORMAT_B10G10R10A2_UINT:
      wa_flags = BRW_ATTRIB_WA_BGRA;
      break;
   default:
      break;
   }
   return wa_flags;
}
#endif

/**
 * Gallium CSO for vertex elements.
 */
struct crocus_vertex_element_state {
   /* 33 entries: presumably 32 user elements plus one extra (edge flag /
    * SGV) — see the create_vertex_elements comment below; confirm.
    */
   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   uint32_t step_rate[16];
   uint8_t wa_flags[33];
   unsigned count;
};

/**
 * The pipe->create_vertex_elements() driver hook.
 *
 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
 * and 3DSTATE_VF_INSTANCING commands.  The vertex_elements and vf_instancing
 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
 * needed.  In these cases we will need information available at draw time.
 * We setup edgeflag_ve and edgeflag_vfi as alternatives last
 * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
 * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
 */
static void *
crocus_create_vertex_elements(struct pipe_context *ctx,
                              unsigned count,
                              const struct pipe_vertex_element *state)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   /* NOTE(review): malloc result is not checked before use; OOM here would
    * crash.  Matches the file's CSO-allocation style — confirm intended.
    */
   struct crocus_vertex_element_state *cso =
      malloc(sizeof(struct crocus_vertex_element_state));

   cso->count = count;

   /* Pack the 3DSTATE_VERTEX_ELEMENTS header; at least one element is
    * always emitted (a dummy is created below when count == 0).
    */
   crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
      ve.DWordLength =
         1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
   }

   uint32_t *ve_pack_dest = &cso->vertex_elements[1];
#if GFX_VER == 8
   uint32_t *vfi_pack_dest = cso->vf_instancing;
#endif

   if (count == 0) {
      /* Hardware requires at least one valid element; emit a dummy that
       * stores (0, 0, 0, 1).
       */
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
         ve.Valid = true;
         ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
         ve.Component0Control = VFCOMP_STORE_0;
         ve.Component1Control = VFCOMP_STORE_0;
         ve.Component2Control = VFCOMP_STORE_0;
         ve.Component3Control = VFCOMP_STORE_1_FP;
      }
#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
      }
#endif
   }

   for (int i = 0; i < count; i++) {
      const struct crocus_format_info fmt =
         crocus_format_for_usage(devinfo, state[i].src_format, 0);
      unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
                           VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
      enum isl_format actual_fmt = fmt.fmt;

#if GFX_VERx10 < 75
      /* Pre-Haswell: record the shader-side workaround flags, then
       * substitute formats the hardware can actually fetch.  10_10_10_2
       * variants are fetched as UINT and fixed up in the VS; 3-channel
       * integer formats are fetched as their 4-channel equivalents.
       */
      cso->wa_flags[i] = get_wa_flags(fmt.fmt);

      if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
         actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
      if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
         actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
      if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
         actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
      if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
         actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
      if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
         actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
#endif

      cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;

      /* Store 0 for any channel the source format lacks, and a type-correct
       * 1 for the missing alpha channel.  Deliberate cascading fallthrough:
       * fewer source channels means more stored constants.
       */
      switch (isl_format_get_num_channels(fmt.fmt)) {
      case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
      case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
      case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
      case 3:
         comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
                                                       : VFCOMP_STORE_1_FP;
         break;
      }
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
#if GFX_VER >= 6
         ve.EdgeFlagEnable = false;
#endif
         ve.VertexBufferIndex = state[i].vertex_buffer_index;
         ve.Valid = true;
         ve.SourceElementOffset = state[i].src_offset;
         ve.SourceElementFormat = actual_fmt;
         ve.Component0Control = comp[0];
         ve.Component1Control = comp[1];
         ve.Component2Control = comp[2];
         ve.Component3Control = comp[3];
#if GFX_VER < 5
         ve.DestinationElementOffset = i * 4;
#endif
      }

#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
         vi.VertexElementIndex = i;
         vi.InstancingEnable = state[i].instance_divisor > 0;
         vi.InstanceDataStepRate = state[i].instance_divisor;
      }
#endif
      ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
#if GFX_VER == 8
      vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
#endif
   }

   /* An alternative version of the last VE and VFI is stored so it
    * can be used at draw time in case Vertex Shader uses EdgeFlag
    */
   if (count) {
      const unsigned edgeflag_index = count - 1;
      const struct crocus_format_info fmt =
         crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
#if GFX_VER >= 6
         ve.EdgeFlagEnable = true;
#endif
         ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
         ve.Valid = true;
         ve.SourceElementOffset = state[edgeflag_index].src_offset;
         ve.SourceElementFormat = fmt.fmt;
         /* EdgeFlag only needs the X component; force the rest to zero. */
         ve.Component0Control = VFCOMP_STORE_SRC;
         ve.Component1Control = VFCOMP_STORE_0;
         ve.Component2Control = VFCOMP_STORE_0;
         ve.Component3Control = VFCOMP_STORE_0;
      }
#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
         /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
          * at draw time, as it should change if SGVs are emitted.
          */
         vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
         vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
      }
#endif
   }

   return cso;
}

/**
 * The pipe->bind_vertex_elements_state() driver hook.
 */
static void
crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
#if GFX_VER == 8
   struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
   struct crocus_vertex_element_state *new_cso = state;

   /* 3DSTATE_VF_SGVS depends on the element count, so flag it when the
    * count changes (cso_changed compares old_cso vs. new_cso fields).
    */
   if (new_cso && cso_changed(count))
      ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
#endif
   ice->state.cso_vertex_elements = state;
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
}

#if GFX_VER >= 6
/**
 * Bookkeeping for the Gen6 software primitives-written counter: a byte
 * range [offset_start, offset_end) of pending snapshot pairs in the
 * counter buffer, plus the primitives accumulated from drained pairs.
 */
struct crocus_streamout_counter {
   uint32_t offset_start;
   uint32_t offset_end;

   uint64_t accum;
};

/**
 * Gallium CSO for stream output (transform feedback) targets.
 */
struct crocus_stream_output_target {
   struct pipe_stream_output_target base;

   /** Stride (bytes-per-vertex) during this transform feedback operation */
   uint16_t stride;

   /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
   bool zeroed;

   /* Buffer holding the saved write offset (Gen7+) or the prim-count
    * snapshots (Gen6), plus its offset within that buffer.
    */
   struct crocus_resource *offset_res;
   uint32_t offset_offset;

#if GFX_VER == 6
   /* CPU mapping of offset_res used to read back prim-count snapshots. */
   void *prim_map;
   struct crocus_streamout_counter prev_count;
   struct crocus_streamout_counter count;
#endif
#if GFX_VER == 8
   /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets?
    */
   bool zero_offset;
#endif
};

#if GFX_VER >= 7
/**
 * Read back the streamout write offset (saved in offset_res by the
 * SO_WRITE_OFFSET register store) and convert it from bytes to vertices.
 * This maps the buffer directly, so it may stall on the GPU.
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct pipe_transfer *transfer;
   struct pipe_box box;
   uint32_t result;
   u_box_1d(tgt->offset_offset, 4, &box);
   void *val = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
                                       0, PIPE_MAP_DIRECTLY,
                                       &box, &transfer);
   assert(val);
   result = *(uint32_t *)val;
   so->context->buffer_unmap(so->context, transfer);

   /* Offset is stored in bytes; callers want a vertex count. */
   return result / tgt->stride;
}
#endif

#if GFX_VER == 6
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *count,
                                uint64_t *svbi);

/**
 * Gen6 variant: no SO_WRITE_OFFSET registers exist, so derive the vertex
 * count from the software-maintained primitives-written counter.
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct crocus_context *ice = (void *)so->context;

   uint64_t vert_written;
   compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
   return vert_written;
}
#endif

/**
 * The pipe->create_stream_output_target() driver hook.
 *
 * "Target" here refers to a destination buffer. We translate this into
 * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
 * know which buffer this represents, or whether we ought to zero the
 * write-offsets, or append. Those are handled in the set() hook.
 */
static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context *ctx,
                                   struct pipe_resource *p_res,
                                   unsigned buffer_offset,
                                   unsigned buffer_size)
{
   struct crocus_resource *res = (void *) p_res;
   struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
   if (!cso)
      return NULL;

   res->bind_history |= PIPE_BIND_STREAM_OUTPUT;

   pipe_reference_init(&cso->base.reference, 1);
   pipe_resource_reference(&cso->base.buffer, p_res);
   cso->base.buffer_offset = buffer_offset;
   cso->base.buffer_size = buffer_size;
   cso->base.context = ctx;

   /* The GPU will write the whole bound range; mark it valid up front. */
   util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
#if GFX_VER >= 7
   /* Allocate a dword to save/restore the SO write offset across batches. */
   struct crocus_context *ice = (struct crocus_context *) ctx;
   void *temp;
   u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
                  &cso->offset_offset,
                  (struct pipe_resource **)&cso->offset_res,
                  &temp);
#endif

   return &cso->base;
}

/**
 * The pipe->stream_output_target_destroy() driver hook: drop the buffer
 * and offset-save references, then free the CSO.
 */
static void
crocus_stream_output_target_destroy(struct pipe_context *ctx,
                                    struct pipe_stream_output_target *state)
{
   struct crocus_stream_output_target *cso = (void *) state;

   pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
   pipe_resource_reference(&cso->base.buffer, NULL);

   free(cso);
}

/* MMIO addresses: Gen6 primitives-written counter, and the per-buffer
 * Gen7+ streamout write-offset registers.
 */
#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
#define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4)

#if GFX_VER == 6
/**
 * Drain the pending [offset_start, offset_end) snapshot pairs from the
 * prim-count buffer into counter->accum.  Each pair is (begin, end) values
 * of GEN6_SO_NUM_PRIMS_WRITTEN; the difference is primitives written.
 * Flushes and waits if the GPU still references the snapshot buffer.
 */
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
                         struct crocus_streamout_counter *counter)
{
   uint64_t *prim_counts = tgt->prim_map;

   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
      /* CPU readback below needs the GPU writes complete: flush and wait. */
      struct pipe_fence_handle *out_fence = NULL;
      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
   }

   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
      counter->accum += prim_counts[i + 1] - prim_counts[i];
   }
   /* All snapshots consumed; restart writes at the top of the buffer. */
   tgt->count.offset_start = tgt->count.offset_end = 0;
}

/**
 * Append a GEN6_SO_NUM_PRIMS_WRITTEN snapshot to the target's counter
 * buffer (allocating it on first use, draining it when nearly full).
 */
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
                                  struct crocus_stream_output_target *tgt)
{
   if (!tgt->offset_res) {
      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
                     &tgt->offset_offset,
                     (struct pipe_resource **)&tgt->offset_res,
                     &tgt->prim_map);
      tgt->count.offset_start = tgt->count.offset_end = 0;
   }

   /* Two 8-byte values are written below; drain first if they won't fit. */
   if (tgt->count.offset_end + 16 >= 4096) {
      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
      aggregate_stream_counter(batch, tgt, &tgt->count);
   }

   /* Flush so the register snapshot reflects all prior rendering. */
   crocus_emit_mi_flush(batch);
   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
                               tgt->offset_res->bo,
                               tgt->count.offset_end + tgt->offset_offset, false);
   tgt->count.offset_end += 8;
}

/**
 * Compute the SVBI (streamed vertex count) for @tgt by draining its
 * counter and scaling primitives by the last draw's vertices-per-prim.
 */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *counter,
                                uint64_t *svbi)
{
   //TODO vertices per prim
   aggregate_stream_counter(&ice->batches[0], tgt, counter);

   *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
#endif
/**
 * The pipe->set_stream_output_targets() driver hook.
 *
 * At this point, we know which targets are bound to a particular index,
 * and also whether we want to append or start over. We can finish the
 * 3DSTATE_SO_BUFFER packets we started earlier.
 */
static void
crocus_set_stream_output_targets(struct pipe_context *ctx,
                                 unsigned num_targets,
                                 struct pipe_stream_output_target **targets,
                                 const unsigned *offsets)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
   struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
   const bool active = num_targets > 0;
   if (ice->state.streamout_active != active) {
      ice->state.streamout_active = active;
#if GFX_VER >= 7
      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#else
      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif

      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
       * it's a non-pipelined command. If we're switching streamout on, we
       * may have missed emitting it earlier, so do so now. (We're already
       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
       */
      if (active) {
#if GFX_VER >= 7
         ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
#endif
      } else {
         /* Streamout is being turned off: make anything the GPU wrote to
          * the old targets visible to subsequent consumers.
          */
         uint32_t flush = 0;
         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];
            if (tgt) {
               struct crocus_resource *res = (void *) tgt->base.buffer;

               flush |= crocus_flush_bits_for_history(res);
               crocus_dirty_for_history(ice, res);
            }
         }
         crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
                                        "make streamout results visible", flush);
      }
   }

   /* Swap in the new targets, keeping the old ones alive in old_tgt[]
    * until the save/restore logic below is done with them.
    */
   ice->state.so_targets = num_targets;
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
      pipe_so_target_reference(&ice->state.so_target[i],
                               i < num_targets ? targets[i] : NULL);
   }

#if GFX_VER == 6
   bool stored_num_prims = false;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (!tgt)
            continue;
         if (offsets[i] == 0) {
            // This means that we're supposed to ignore anything written to
            // the buffer before. We can do this by just clearing out the
            // count of writes to the prim count buffer.
            tgt->count.offset_start = tgt->count.offset_end;
            tgt->count.accum = 0;
            ice->state.svbi = 0;
         } else {
            /* Appending: recover the vertex count written so far. */
            if (tgt->offset_res) {
               compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
               tgt->count.offset_start = tgt->count.offset_end;
            }
         }

         /* One "begin" snapshot covers all targets for this draw range. */
         if (!stored_num_prims) {
            crocus_stream_store_prims_written(batch, tgt);
            stored_num_prims = true;
         }
      } else {
         /* Unbinding: snapshot the final count so a later resume works. */
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt) {
            if (!stored_num_prims) {
               crocus_stream_store_prims_written(batch, tgt);
               stored_num_prims = true;
            }

            if (tgt->offset_res) {
               tgt->prev_count = tgt->count;
            }
         }
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         /* offset 0 means restart from the beginning; otherwise reload the
          * saved write offset so the GPU appends where it left off.
          */
         if (offsets[i] == 0) {
#if GFX_VER == 8
            if (tgt)
               tgt->zero_offset = true;
#endif
            crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
         }
         else if (tgt)
            crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                       tgt->offset_res->bo,
                                       tgt->offset_offset);
      } else {
         /* Unbinding: save the current write offset for a later resume. */
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt)
            crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                        tgt->offset_res->bo,
                                        tgt->offset_offset, false);
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
#endif
   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
   if (!active)
      return;
#if GFX_VER >= 7
   ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#elif GFX_VER == 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
#endif
}

#endif

#if GFX_VER >= 7
/**
 * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
 * 3DSTATE_STREAMOUT packets.
 *
 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
 * hardware to record. We can create it entirely based on the shader, with
 * no dynamic state dependencies.
 *
 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
 * state-based settings. We capture the shader-related ones here, and merge
 * the rest in at draw time.
 *
 * Returns a ralloc'd buffer holding 3DSTATE_STREAMOUT followed by
 * 3DSTATE_SO_DECL_LIST; the caller owns (and eventually ralloc_free()s) it.
 */
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
                           const struct brw_vue_map *vue_map)
{
   struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
   int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted. The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      const int buffer = output->output_buffer;
      const int varying = output->register_index;
      const unsigned stream_id = output->stream;
      assert(stream_id < PIPE_MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array. Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying. Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->dst_offset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->output_buffer,
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->dst_offset + output->num_components;

      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->output_buffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->num_components) - 1) << output->start_component,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* 3 header dwords plus one dword pair per SO_DECL_ENTRY row. */
   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);

   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      /* We always read the whole vertex. This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      sol.Stream0VertexReadOffset = urb_entry_read_offset;
      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
      sol.Stream1VertexReadOffset = urb_entry_read_offset;
      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
      sol.Stream2VertexReadOffset = urb_entry_read_offset;
      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
      sol.Stream3VertexReadOffset = urb_entry_read_offset;
      sol.Stream3VertexReadLength = urb_entry_read_length - 1;

      // TODO: Double-check that stride == 0 means no buffer. Probably this
      // needs to go elsewhere, where the buffer enable stuff is actually
      // known.
#if GFX_VER < 8
      sol.SOBufferEnable0 = !!info->stride[0];
      sol.SOBufferEnable1 = !!info->stride[1];
      sol.SOBufferEnable2 = !!info->stride[2];
      sol.SOBufferEnable3 = !!info->stride[3];
#else
      /* Set buffer pitches; 0 means unbound.
       */
      sol.Buffer0SurfacePitch = 4 * info->stride[0];
      sol.Buffer1SurfacePitch = 4 * info->stride[1];
      sol.Buffer2SurfacePitch = 4 * info->stride[2];
      sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
   }

   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
      list.DWordLength = 3 + 2 * max_decls - 2;
      list.StreamtoBufferSelects0 = buffer_mask[0];
      list.StreamtoBufferSelects1 = buffer_mask[1];
      list.StreamtoBufferSelects2 = buffer_mask[2];
      list.StreamtoBufferSelects3 = buffer_mask[3];
      list.NumEntries0 = decls[0];
      list.NumEntries1 = decls[1];
      list.NumEntries2 = decls[2];
      list.NumEntries3 = decls[3];
   }

   for (int i = 0; i < max_decls; i++) {
      /* Each entry row packs one SO_DECL per stream into a dword pair. */
      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
         entry.Stream0Decl = so_decl[0][i];
         entry.Stream1Decl = so_decl[1][i];
         entry.Stream2Decl = so_decl[2][i];
         entry.Stream3Decl = so_decl[3][i];
      }
   }

   return map;
}
#endif

#if GFX_VER == 6
/**
 * Emit 3DSTATE_GS_SVB_INDEX packets: index 0 carries the resumed streamed
 * vertex count and the max index (to avoid overrunning the smallest bound
 * buffer); indices 1-3 are initialized to safe defaults.
 */
static void
crocus_emit_so_svbi(struct crocus_context *ice)
{
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];

   unsigned max_vertex = 0xffffffff;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      struct crocus_stream_output_target *tgt =
         (void *) ice->state.so_target[i];
      if (tgt)
         max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
   }

   crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
      svbi.IndexNumber = 0;
      svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
      svbi.MaximumIndex = max_vertex;
   }

   /* initialize the rest of the SVBI's to reasonable values so that we don't
    * run out of room writing the regular data.
    */
   for (int i = 1; i < 4; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
         svbi.IndexNumber = i;
         svbi.StreamedVertexBufferIndex = 0;
         svbi.MaximumIndex = 0xffffffff;
      }
   }
}

#endif


#if GFX_VER >= 6
/**
 * Determine whether the current draw ultimately rasterizes points:
 * point fill modes, a point-emitting GS, a point-mode TES, or (failing
 * those) a POINTS primitive type.
 */
static bool
crocus_is_drawing_points(const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

   if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
       cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
      return true;

   if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
      const struct brw_gs_prog_data *gs_prog_data =
         (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
      return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
   } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
      const struct brw_tes_prog_data *tes_data =
         (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
      return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
   } else {
      return ice->state.prim_mode == PIPE_PRIM_POINTS;
   }
}
#endif

#if GFX_VER >= 6
/**
 * Fill in one SF_OUTPUT_ATTRIBUTE_DETAIL for FS input @fs_attr, resolving
 * its VUE slot, constant overrides, and two-sided-color swizzling.
 * Updates *max_source_attr with the highest VUE slot the SF must read.
 */
static void
get_attr_override(
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
   const struct brw_vue_map *vue_map,
   int urb_entry_read_offset, int fs_attr,
   bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header. We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it. This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined. Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot. In every other case, the
       * attribute override we supply doesn't matter. So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr. If swizzling, the SF will read this slot + 1.
    */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}

/**
 * Build the SF/SBE attribute override table for all FS inputs, and compute
 * the point-sprite enable mask plus the URB read offset/length the SF needs.
 */
static void
calculate_attr_overrides(
   const struct crocus_context *ice,
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
   uint32_t *point_sprite_enables,
   uint32_t *urb_entry_read_length,
   uint32_t *urb_entry_read_offset)
{
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   uint32_t max_source_attr = 0;
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   int first_slot =
      brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);

   /* Each URB offset packs two varying slots */
   assert(first_slot % 2 == 0);
   *urb_entry_read_offset = first_slot / 2;
   *point_sprite_enables = 0;

   for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
      const int input_index = wm_prog_data->urb_setup[fs_attr];

      if (input_index < 0)
         continue;

      /* Point-sprite-replaced inputs get their coordinates from the
       * hardware, so they need an enable bit instead of an override.
       */
      bool point_sprite = false;
      if (crocus_is_drawing_points(ice)) {
         if (fs_attr >= VARYING_SLOT_TEX0 &&
             fs_attr <= VARYING_SLOT_TEX7 &&
             cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
            point_sprite = true;

         if (fs_attr == VARYING_SLOT_PNTC)
            point_sprite = true;

         if (point_sprite)
            *point_sprite_enables |= 1U << input_index;
      }

      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
      if (!point_sprite) {
         get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
                           cso_rast->cso.light_twoside, &max_source_attr);
      }

      /* The hardware can only do the overrides on 16 overrides at a
       * time, and the other up to 16 have to be lined up so that the
       * input index = the output index. We'll need to do some
       * tweaking to make sure that's the case.
       */
      if (input_index < 16)
         attr_overrides[input_index] = attribute;
      else
         assert(attribute.SourceAttribute == input_index);
   }

   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
    *
    * "This field should be set to the minimum length required to read the
    * maximum source attribute. The maximum source attribute is indicated
    * by the maximum value of the enabled Attribute # Source Attribute if
    * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
    * enable is not set.
    * read_length = ceiling((max_source_attr + 1) / 2)
    *
    * [errata] Corruption/Hang possible if length programmed larger than
    * recommended"
    *
    * Similar text exists for Ivy Bridge.
    */
   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
}
#endif

#if GFX_VER >= 7
/**
 * Emit 3DSTATE_SBE (and, on Gen8, 3DSTATE_SBE_SWIZ) describing how the
 * setup backend feeds FS inputs; on Gen7 the overrides live inside the
 * SBE packet itself (via the attr_overrides macro alias below).
 */
static void
crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
#if GFX_VER >= 8
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
#else
#define attr_overrides sbe.Attribute
#endif

   uint32_t urb_entry_read_length;
   uint32_t urb_entry_read_offset;
   uint32_t point_sprite_enables;

   crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.AttributeSwizzleEnable = true;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;

      calculate_attr_overrides(ice,
                               attr_overrides,
                               &point_sprite_enables,
                               &urb_entry_read_length,
                               &urb_entry_read_offset);
      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.VertexURBEntryReadLength = urb_entry_read_length;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
#if GFX_VER >= 8
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;
#endif
   }
#if GFX_VER >= 8
   crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
      for (int i = 0; i < 16; i++)
         sbes.Attribute[i] = attr_overrides[i];
   }
#endif
}
#endif

/* ------------------------------------------------------------------- */

/**
 * Populate VS program key fields based on the current state.
4682 */ 4683static void 4684crocus_populate_vs_key(const struct crocus_context *ice, 4685 const struct shader_info *info, 4686 gl_shader_stage last_stage, 4687 struct brw_vs_prog_key *key) 4688{ 4689 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; 4690 4691 if (info->clip_distance_array_size == 0 && 4692 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && 4693 last_stage == MESA_SHADER_VERTEX) 4694 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; 4695 4696 if (last_stage == MESA_SHADER_VERTEX && 4697 info->outputs_written & (VARYING_BIT_PSIZ)) 4698 key->clamp_pointsize = 1; 4699 4700#if GFX_VER <= 5 4701 key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL || 4702 cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL); 4703 key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff; 4704#endif 4705 4706 key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color; 4707 4708#if GFX_VERx10 < 75 4709 uint64_t inputs_read = info->inputs_read; 4710 int ve_idx = 0; 4711 while (inputs_read) { 4712 int i = u_bit_scan64(&inputs_read); 4713 key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx]; 4714 ve_idx++; 4715 } 4716#endif 4717} 4718 4719/** 4720 * Populate TCS program key fields based on the current state. 4721 */ 4722static void 4723crocus_populate_tcs_key(const struct crocus_context *ice, 4724 struct brw_tcs_prog_key *key) 4725{ 4726} 4727 4728/** 4729 * Populate TES program key fields based on the current state. 
4730 */ 4731static void 4732crocus_populate_tes_key(const struct crocus_context *ice, 4733 const struct shader_info *info, 4734 gl_shader_stage last_stage, 4735 struct brw_tes_prog_key *key) 4736{ 4737 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; 4738 4739 if (info->clip_distance_array_size == 0 && 4740 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && 4741 last_stage == MESA_SHADER_TESS_EVAL) 4742 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; 4743 4744 if (last_stage == MESA_SHADER_TESS_EVAL && 4745 info->outputs_written & (VARYING_BIT_PSIZ)) 4746 key->clamp_pointsize = 1; 4747} 4748 4749/** 4750 * Populate GS program key fields based on the current state. 4751 */ 4752static void 4753crocus_populate_gs_key(const struct crocus_context *ice, 4754 const struct shader_info *info, 4755 gl_shader_stage last_stage, 4756 struct brw_gs_prog_key *key) 4757{ 4758 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; 4759 4760 if (info->clip_distance_array_size == 0 && 4761 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && 4762 last_stage == MESA_SHADER_GEOMETRY) 4763 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; 4764 4765 if (last_stage == MESA_SHADER_GEOMETRY && 4766 info->outputs_written & (VARYING_BIT_PSIZ)) 4767 key->clamp_pointsize = 1; 4768} 4769 4770/** 4771 * Populate FS program key fields based on the current state. 
4772 */ 4773static void 4774crocus_populate_fs_key(const struct crocus_context *ice, 4775 const struct shader_info *info, 4776 struct brw_wm_prog_key *key) 4777{ 4778 struct crocus_screen *screen = (void *) ice->ctx.screen; 4779 const struct pipe_framebuffer_state *fb = &ice->state.framebuffer; 4780 const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa; 4781 const struct crocus_rasterizer_state *rast = ice->state.cso_rast; 4782 const struct crocus_blend_state *blend = ice->state.cso_blend; 4783 4784#if GFX_VER < 6 4785 uint32_t lookup = 0; 4786 4787 if (info->fs.uses_discard || zsa->cso.alpha_enabled) 4788 lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT; 4789 4790 if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 4791 lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT; 4792 4793 if (fb->zsbuf && zsa->cso.depth_enabled) { 4794 lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT; 4795 4796 if (zsa->cso.depth_writemask) 4797 lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT; 4798 4799 } 4800 if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) { 4801 lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT; 4802 if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask) 4803 lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT; 4804 } 4805 key->iz_lookup = lookup; 4806 key->stats_wm = ice->state.stats_wm; 4807#endif 4808 4809 uint32_t line_aa = BRW_WM_AA_NEVER; 4810 if (rast->cso.line_smooth) { 4811 int reduced_prim = ice->state.reduced_prim_mode; 4812 if (reduced_prim == PIPE_PRIM_LINES) 4813 line_aa = BRW_WM_AA_ALWAYS; 4814 else if (reduced_prim == PIPE_PRIM_TRIANGLES) { 4815 if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) { 4816 line_aa = BRW_WM_AA_SOMETIMES; 4817 4818 if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE || 4819 rast->cso.cull_face == PIPE_FACE_BACK) 4820 line_aa = BRW_WM_AA_ALWAYS; 4821 } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) { 4822 line_aa = BRW_WM_AA_SOMETIMES; 4823 4824 if (rast->cso.cull_face == PIPE_FACE_FRONT) 4825 line_aa = 
BRW_WM_AA_ALWAYS; 4826 } 4827 } 4828 } 4829 key->line_aa = line_aa; 4830 4831 key->nr_color_regions = fb->nr_cbufs; 4832 4833 key->clamp_fragment_color = rast->cso.clamp_fragment_color; 4834 4835 key->alpha_to_coverage = blend->cso.alpha_to_coverage; 4836 4837 key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled; 4838 4839 key->flat_shade = rast->cso.flatshade && 4840 (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1)); 4841 4842 key->persample_interp = rast->cso.force_persample_interp; 4843 key->multisample_fbo = rast->cso.multisample && fb->samples > 1; 4844 4845 key->ignore_sample_mask_out = !key->multisample_fbo; 4846 key->coherent_fb_fetch = false; // TODO: needed? 4847 4848 key->force_dual_color_blend = 4849 screen->driconf.dual_color_blend_by_location && 4850 (blend->blend_enables & 1) && blend->dual_color_blending; 4851 4852#if GFX_VER <= 5 4853 if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) { 4854 key->emit_alpha_test = true; 4855 key->alpha_test_func = zsa->cso.alpha_func; 4856 key->alpha_test_ref = zsa->cso.alpha_ref_value; 4857 } 4858#endif 4859} 4860 4861static void 4862crocus_populate_cs_key(const struct crocus_context *ice, 4863 struct brw_cs_prog_key *key) 4864{ 4865} 4866 4867#if GFX_VER == 4 4868#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset); 4869#elif GFX_VER >= 5 4870static uint64_t 4871KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader) 4872{ 4873 return shader->offset; 4874} 4875#endif 4876 4877/* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable 4878 * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit 4879 * this WA on C0 stepping. 4880 * 4881 * TODO: Fill out SamplerCount for prefetching? 
4882 */ 4883 4884#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \ 4885 pkt.KernelStartPointer = KSP(ice, shader); \ 4886 pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \ 4887 pkt.FloatingPointMode = prog_data->use_alt_mode; \ 4888 \ 4889 pkt.DispatchGRFStartRegisterForURBData = \ 4890 prog_data->dispatch_grf_start_reg; \ 4891 pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \ 4892 pkt.prefix##URBEntryReadOffset = 0; \ 4893 \ 4894 pkt.StatisticsEnable = true; \ 4895 pkt.Enable = true; \ 4896 \ 4897 if (prog_data->total_scratch) { \ 4898 struct crocus_bo *bo = \ 4899 crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \ 4900 pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \ 4901 pkt.ScratchSpaceBasePointer = rw_bo(bo, 0); \ 4902 } 4903 4904/* ------------------------------------------------------------------- */ 4905#if GFX_VER >= 6 4906static const uint32_t push_constant_opcodes[] = { 4907 [MESA_SHADER_VERTEX] = 21, 4908 [MESA_SHADER_TESS_CTRL] = 25, /* HS */ 4909 [MESA_SHADER_TESS_EVAL] = 26, /* DS */ 4910 [MESA_SHADER_GEOMETRY] = 22, 4911 [MESA_SHADER_FRAGMENT] = 23, 4912 [MESA_SHADER_COMPUTE] = 0, 4913}; 4914#endif 4915 4916static void 4917emit_sized_null_surface(struct crocus_batch *batch, 4918 unsigned width, unsigned height, 4919 unsigned layers, unsigned levels, 4920 unsigned minimum_array_element, 4921 uint32_t *out_offset) 4922{ 4923 struct isl_device *isl_dev = &batch->screen->isl_dev; 4924 uint32_t *surf = stream_state(batch, isl_dev->ss.size, 4925 isl_dev->ss.align, 4926 out_offset); 4927 //TODO gen 6 multisample crash 4928 isl_null_fill_state(isl_dev, surf, 4929 .size = isl_extent3d(width, height, layers), 4930 .levels = levels, 4931 .minimum_array_element = minimum_array_element); 4932} 4933static void 4934emit_null_surface(struct crocus_batch *batch, 4935 uint32_t *out_offset) 4936{ 4937 emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset); 4938} 4939 4940static void 
4941emit_null_fb_surface(struct crocus_batch *batch, 4942 struct crocus_context *ice, 4943 uint32_t *out_offset) 4944{ 4945 uint32_t width, height, layers, level, layer; 4946 /* If set_framebuffer_state() was never called, fall back to 1x1x1 */ 4947 if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) { 4948 emit_null_surface(batch, out_offset); 4949 return; 4950 } 4951 4952 struct pipe_framebuffer_state *cso = &ice->state.framebuffer; 4953 width = MAX2(cso->width, 1); 4954 height = MAX2(cso->height, 1); 4955 layers = cso->layers ? cso->layers : 1; 4956 level = 0; 4957 layer = 0; 4958 4959 if (cso->nr_cbufs == 0 && cso->zsbuf) { 4960 width = cso->zsbuf->width; 4961 height = cso->zsbuf->height; 4962 level = cso->zsbuf->u.tex.level; 4963 layer = cso->zsbuf->u.tex.first_layer; 4964 } 4965 emit_sized_null_surface(batch, width, height, 4966 layers, level, layer, 4967 out_offset); 4968} 4969 4970static void 4971emit_surface_state(struct crocus_batch *batch, 4972 struct crocus_resource *res, 4973 const struct isl_surf *in_surf, 4974 bool adjust_surf, 4975 struct isl_view *in_view, 4976 bool writeable, 4977 enum isl_aux_usage aux_usage, 4978 bool blend_enable, 4979 uint32_t write_disables, 4980 uint32_t *surf_state, 4981 uint32_t addr_offset) 4982{ 4983 struct isl_device *isl_dev = &batch->screen->isl_dev; 4984 uint32_t reloc = RELOC_32BIT; 4985 uint64_t offset_B = res->offset; 4986 uint32_t tile_x_sa = 0, tile_y_sa = 0; 4987 4988 if (writeable) 4989 reloc |= RELOC_WRITE; 4990 4991 struct isl_surf surf = *in_surf; 4992 struct isl_view view = *in_view; 4993 if (adjust_surf) { 4994 if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) { 4995 isl_surf_get_image_surf(isl_dev, in_surf, 4996 view.base_level, 0, 4997 view.base_array_layer, 4998 &surf, &offset_B, 4999 &tile_x_sa, &tile_y_sa); 5000 view.base_array_layer = 0; 5001 view.base_level = 0; 5002 } else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) { 5003 
         /* Gen4 can't render to a cube face directly: rebase to the face. */
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, view.base_array_layer,
                                 0,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
         surf.dim = ISL_SURF_DIM_2D;
   }

   union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
   struct crocus_bo *aux_bo = NULL;
   uint32_t aux_offset = 0;
   struct isl_surf *aux_surf = NULL;
   if (aux_usage != ISL_AUX_USAGE_NONE) {
      aux_surf = &res->aux.surf;
      aux_offset = res->aux.offset;
      aux_bo = res->aux.bo;

      clear_color = crocus_resource_get_clear_color(res);
   }

   isl_surf_fill_state(isl_dev, surf_state,
                       .surf = &surf,
                       .view = &view,
                       .address = crocus_state_reloc(batch,
                                                     addr_offset + isl_dev->ss.addr_offset,
                                                     res->bo, offset_B, reloc),
                       .aux_surf = aux_surf,
                       .aux_usage = aux_usage,
                       .aux_address = aux_offset,
                       .mocs = crocus_mocs(res->bo, isl_dev),
                       .clear_color = clear_color,
                       .use_clear_address = false,
                       .clear_address = 0,
                       .x_offset_sa = tile_x_sa,
                       .y_offset_sa = tile_y_sa,
#if GFX_VER <= 5
                       .blend_enable = blend_enable,
                       .write_disables = write_disables,
#endif
      );

   if (aux_surf) {
      /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
       * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
       * contain other control information. Since buffer addresses are always
       * on 4k boundaries (and thus have their lower 12 bits zero), we can use
       * an ordinary reloc to do the necessary address translation.
       *
       * FIXME: move to the point of assignment.
       */
      if (GFX_VER == 8) {
         /* Gen8 aux addresses are 64-bit. */
         uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      } else {
         uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      }
   }

}

/**
 * Stream out a writable (render-target) SURFACE_STATE for a pipe surface
 * and return its surface-state offset.
 */
static uint32_t
emit_surface(struct crocus_batch *batch,
             struct crocus_surface *surf,
             enum isl_aux_usage aux_usage,
             bool blend_enable,
             uint32_t write_disables)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
   struct isl_view *view = &surf->view;
   uint32_t offset = 0;
   enum pipe_texture_target target = res->base.b.target;
   bool adjust_surf = false;

   /* Gen4 cube render targets need the single-face rebase in
    * emit_surface_state().
    */
   if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE)
      adjust_surf = true;

   /* Use the realigned copy of the resource when one exists. */
   if (surf->align_res)
      res = (struct crocus_resource *)surf->align_res;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);

   emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
                      aux_usage, blend_enable,
                      write_disables,
                      surf_state, offset);
   return offset;
}

/**
 * Stream out a read-only SURFACE_STATE for fetching from a bound render
 * target (framebuffer fetch / RT reads) and return its offset.
 */
static uint32_t
emit_rt_surface(struct crocus_batch *batch,
                struct crocus_surface *surf,
                enum isl_aux_usage aux_usage)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
   struct isl_view *view = &surf->read_view;
   uint32_t offset = 0;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);

   emit_surface_state(batch, res, &surf->surf, true, view, false,
                      aux_usage, 0, false,
                      surf_state, offset);
   return offset;
}

/**
 * Stream out a RAW buffer SURFACE_STATE exposing the compute grid size
 * (three 32-bit values) for gl_NumWorkGroups; returns its offset.
 */
static uint32_t
emit_grid(struct crocus_context *ice,
          struct crocus_batch *batch)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;
   struct crocus_state_ref *grid_ref = &ice->state.grid_size;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(grid_ref->res),
                                                       grid_ref->offset,
                                                       RELOC_32BIT),
                         /* 3 x uint32 grid dimensions */
                         .size_B = 12,
                         .format = ISL_FORMAT_RAW,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
   return offset;
}

/**
 * Stream out a buffer SURFACE_STATE for a UBO binding; returns its offset.
 */
static uint32_t
emit_ubo_buffer(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct pipe_constant_buffer *buffer)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(buffer->buffer),
                                                       buffer->buffer_offset,
                                                       RELOC_32BIT),
                         .size_B = buffer->buffer_size,
                         /* NOTE(review): format 0 here vs ISL_FORMAT_RAW in
                          * the SSBO path — presumably deliberate for UBO
                          * pulls; confirm against isl format expectations.
                          */
                         .format = 0,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));

   return offset;
}

/**
 * Stream out a RAW buffer SURFACE_STATE for an SSBO binding; the reloc is
 * marked writable when the shader may store to it.  Returns its offset.
 */
static uint32_t
emit_ssbo_buffer(struct crocus_context *ice,
                 struct crocus_batch *batch,
                 struct pipe_shader_buffer *buffer, bool writeable)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;
   uint32_t reloc = RELOC_32BIT;

   if (writeable)
      reloc |= RELOC_WRITE;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(buffer->buffer),
                                                       buffer->buffer_offset,
                                                       reloc),
                         .size_B = buffer->buffer_size,
                         .format = ISL_FORMAT_RAW,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));

   return offset;
}

/**
 * Stream out a SURFACE_STATE for a sampler view (texture or texel buffer);
 * for_gather selects the gather-specific view.  Returns its offset.
 */
static uint32_t
emit_sampler_view(struct crocus_context *ice,
                  struct crocus_batch *batch,
                  bool for_gather,
                  struct crocus_sampler_view *isv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);

   if (isv->base.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
      const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      /* Clamp to the view range, the BO tail, and the HW texel buffer max. */
      unsigned final_size =
         MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          isv->res->bo,
                                                          isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
                            .size_B = final_size,
                            .format = isv->view.format,
                            .swizzle = isv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(isv->res->bo, isl_dev)
         );
   } else {
      enum isl_aux_usage aux_usage =
         crocus_resource_texture_aux_usage(isv->res);

      emit_surface_state(batch, isv->res, &isv->res->surf, false,
                         for_gather ? &isv->gather_view : &isv->view,
                         false, aux_usage, false,
                         0, surf_state, offset);
   }
   return offset;
}

/**
 * Stream out a SURFACE_STATE for a shader image binding (buffer image,
 * RAW-format image, or regular texture image).  Returns its offset.
 */
static uint32_t
emit_image_view(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct crocus_image_view *iv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
   uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
   if (res->base.b.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
      const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      unsigned final_size =
         MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          res->bo,
                                                          res->offset + iv->base.u.buf.offset, reloc),
                            .size_B = final_size,
                            .format = iv->view.format,
                            .swizzle = iv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(res->bo, isl_dev)
         );
   } else {
      if (iv->view.format == ISL_FORMAT_RAW) {
         /* RAW image views are described as a buffer over the whole BO. */
         isl_buffer_fill_state(isl_dev, surf_state,
                               .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                             res->bo,
                                                             res->offset, reloc),
                               .size_B = res->bo->size - res->offset,
                               .format = iv->view.format,
                               .swizzle = iv->view.swizzle,
                               .stride_B = 1,
                               .mocs = crocus_mocs(res->bo, isl_dev),
            );


      } else {
         /* Images never use aux (ISL_AUX_USAGE_NONE == 0). */
         emit_surface_state(batch, res,
                            &res->surf, false, &iv->view,
                            write, 0, false,
                            0, surf_state, offset);
      }
   }

   return offset;
}

#if GFX_VER == 6
/**
 * Gen6 transform feedback: stream out a buffer SURFACE_STATE for stream
 * output `idx`, sized so the GS can bounds-check writes.  Returns the
 * surface-state offset, or 0 if the output is out of range or streamout
 * is inactive.
 */
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
                 struct pipe_stream_output_info *so_info,
                 uint32_t idx)
{
   struct crocus_context *ice = batch->ice;

   if (idx >= so_info->num_outputs || !ice->state.streamout_active)
      return 0;
   const struct pipe_stream_output *output = &so_info->output[idx];
   const int buffer = output->output_buffer;
   assert(output->stream == 0);

   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
   unsigned stride_dwords = so_info->stride[buffer];
   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;

   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
   unsigned num_vector_components = output->num_components;
   unsigned num_elements;
   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
    * too big to map using a single binding table entry?
    */
   // assert((size_dwords - offset_dwords) / stride_dwords
   //        <= BRW_MAX_NUM_BUFFER_ENTRIES);

   if (size_dwords > offset_dwords + num_vector_components) {
      /* There is room for at least 1 transform feedback output in the buffer.
       * Compute the number of additional transform feedback outputs the
       * buffer has room for.
       */
      num_elements =
         (size_dwords - offset_dwords - num_vector_components);
   } else {
      /* There isn't even room for a single transform feedback output in the
       * buffer. We can't configure the binding table entry to prevent output
       * entirely; we'll have to rely on the geometry shader to detect
       * overflow. But to minimize the damage in case of a bug, set up the
       * binding table entry to just allow a single output.
       */
      num_elements = 0;
   }
   num_elements += stride_dwords;

   /* Surface format matches the output's component count (all float32). */
   uint32_t surface_format;
   switch (num_vector_components) {
   case 1:
      surface_format = ISL_FORMAT_R32_FLOAT;
      break;
   case 2:
      surface_format = ISL_FORMAT_R32G32_FLOAT;
      break;
   case 3:
      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
      break;
   case 4:
      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
      break;
   default:
      unreachable("Invalid vector size for transform feedback output");
   }

   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(&buf->base.b),
                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
                         .size_B = num_elements * 4,
                         .stride_B = stride_dwords * 4,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .format = surface_format);
   return offset;
}
#endif

/* Iterate the used binding-table indices of one surface group; expects a
 * `struct crocus_binding_table *bt` in scope at the use site.
 */
#define foreach_surface_used(index, group) \
   for (int index = 0; index < bt->sizes[group]; index++) \
      if (crocus_group_index_to_bti(bt, group, index) != \
          CROCUS_SURFACE_NOT_USED)

/**
 * Stream out all SURFACE_STATEs for one shader stage and record their
 * offsets in shader->surf_offset, in binding-table group order: render
 * targets (+RT reads), CS work groups, Gen6 SOL, textures, gather
 * textures, images, UBOs, SSBOs.  Unused-but-allocated slots get null
 * surfaces.  ff_gs selects the fixed-function GS program (no shader state).
 */
static void
crocus_populate_binding_table(struct crocus_context *ice,
                              struct crocus_batch *batch,
                              gl_shader_stage stage, bool ff_gs)
{
   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
   if (!shader)
      return;

   struct crocus_binding_table *bt = &shader->bt;
   int s = 0;
   uint32_t *surf_offsets = shader->surf_offset;

#if GFX_VER < 8
   const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif

   if (stage == MESA_SHADER_FRAGMENT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
      if (cso_fb->nr_cbufs) {
         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
            uint32_t write_disables = 0;
            bool blend_enable = false;
#if GFX_VER <= 5
            /* Gen4/5 put the colormask and blend enable in the surface
             * state rather than blend state.
             */
            const struct pipe_rt_blend_state *rt =
               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
            struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
            struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
            /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
            blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
            if (cso_fb->cbufs[i]) {
               surf_offsets[s] = emit_surface(batch,
                                              (struct crocus_surface *)cso_fb->cbufs[i],
                                              ice->state.draw_aux_usage[i],
                                              blend_enable,
                                              write_disables);
            } else {
               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
            }
            s++;
         }
      } else {
         /* No color buffers: still emit one framebuffer-sized null RT. */
         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
         s++;
      }

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
         if (cso_fb->cbufs[i]) {
            surf_offsets[s++] = emit_rt_surface(batch,
                                                (struct crocus_surface *)cso_fb->cbufs[i],
                                                ice->state.draw_aux_usage[i]);
         }
      }
   }

   if (stage == MESA_SHADER_COMPUTE) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
         surf_offsets[s] = emit_grid(ice, batch);
         s++;
      }
   }

#if GFX_VER == 6
   if (stage == MESA_SHADER_GEOMETRY) {
      /* Streamout state may come from the GS, or the VS when no GS is
       * bound (VS outputs are captured directly).
       */
      struct pipe_stream_output_info *so_info;
      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
      else
         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
      struct crocus_sampler_view *view = shs->textures[i];
      if (view)
         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

#if GFX_VER < 8
   /* Pre-Gen8 needs separate gather-path surface states. */
   if (info && info->uses_texture_gather) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
         struct crocus_sampler_view *view = shs->textures[i];
         if (view)
            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
         else
            emit_null_surface(batch, &surf_offsets[s]);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
      struct crocus_image_view *view = &shs->image[i];
      if (view->base.resource)
         surf_offsets[s] = emit_image_view(ice, batch, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
      if (shs->constbufs[i].buffer)
         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
      if (shs->ssbo[i].buffer)
         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
                                            !!(shs->writable_ssbos & (1 << i)));
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

}
/* ------------------------------------------------------------------- */
/**
 * Upload a binding table (array of surface-state offsets) to the batch's
 * state buffer; returns its offset, or 0 for an empty table.
 */
static uint32_t
crocus_upload_binding_table(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            uint32_t *table,
                            uint32_t size)

{
   if (size == 0)
      return 0;
   return emit_state(batch, table, size, 32);
}

/**
 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5517 */ 5518 5519static void 5520crocus_update_surface_base_address(struct crocus_batch *batch) 5521{ 5522 if (batch->state_base_address_emitted) 5523 return; 5524 5525 UNUSED uint32_t mocs = batch->screen->isl_dev.mocs.internal; 5526 5527 flush_before_state_base_change(batch); 5528 5529 crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) { 5530 /* Set base addresses */ 5531 sba.GeneralStateBaseAddressModifyEnable = true; 5532 5533#if GFX_VER >= 6 5534 sba.DynamicStateBaseAddressModifyEnable = true; 5535 sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0); 5536#endif 5537 5538 sba.SurfaceStateBaseAddressModifyEnable = true; 5539 sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0); 5540 5541 sba.IndirectObjectBaseAddressModifyEnable = true; 5542 5543#if GFX_VER >= 5 5544 sba.InstructionBaseAddressModifyEnable = true; 5545 sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO! 5546#endif 5547 5548 /* Set buffer sizes on Gen8+ or upper bounds on Gen4-7 */ 5549#if GFX_VER == 8 5550 sba.GeneralStateBufferSize = 0xfffff; 5551 sba.IndirectObjectBufferSize = 0xfffff; 5552 sba.InstructionBufferSize = 0xfffff; 5553 sba.DynamicStateBufferSize = MAX_STATE_SIZE; 5554 5555 sba.GeneralStateBufferSizeModifyEnable = true; 5556 sba.DynamicStateBufferSizeModifyEnable = true; 5557 sba.IndirectObjectBufferSizeModifyEnable = true; 5558 sba.InstructionBuffersizeModifyEnable = true; 5559#else 5560 sba.GeneralStateAccessUpperBoundModifyEnable = true; 5561 sba.IndirectObjectAccessUpperBoundModifyEnable = true; 5562 5563#if GFX_VER >= 5 5564 sba.InstructionAccessUpperBoundModifyEnable = true; 5565#endif 5566 5567#if GFX_VER >= 6 5568 /* Dynamic state upper bound. Although the documentation says that 5569 * programming it to zero will cause it to be ignored, that is a lie. 5570 * If this isn't programmed to a real bound, the sampler border color 5571 * pointer is rejected, causing border color to mysteriously fail. 
5572 */ 5573 sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000); 5574 sba.DynamicStateAccessUpperBoundModifyEnable = true; 5575#else 5576 /* Same idea but using General State Base Address on Gen4-5 */ 5577 sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000); 5578#endif 5579#endif 5580 5581#if GFX_VER >= 6 5582 /* The hardware appears to pay attention to the MOCS fields even 5583 * if you don't set the "Address Modify Enable" bit for the base. 5584 */ 5585 sba.GeneralStateMOCS = mocs; 5586 sba.StatelessDataPortAccessMOCS = mocs; 5587 sba.DynamicStateMOCS = mocs; 5588 sba.IndirectObjectMOCS = mocs; 5589 sba.InstructionMOCS = mocs; 5590 sba.SurfaceStateMOCS = mocs; 5591#endif 5592 } 5593 5594 flush_after_state_base_change(batch); 5595 5596 /* According to section 3.6.1 of VOL1 of the 965 PRM, 5597 * STATE_BASE_ADDRESS updates require a reissue of: 5598 * 5599 * 3DSTATE_PIPELINE_POINTERS 5600 * 3DSTATE_BINDING_TABLE_POINTERS 5601 * MEDIA_STATE_POINTERS 5602 * 5603 * and this continues through Ironlake. The Sandy Bridge PRM, vol 5604 * 1 part 1 says that the folowing packets must be reissued: 5605 * 5606 * 3DSTATE_CC_POINTERS 5607 * 3DSTATE_BINDING_TABLE_POINTERS 5608 * 3DSTATE_SAMPLER_STATE_POINTERS 5609 * 3DSTATE_VIEWPORT_STATE_POINTERS 5610 * MEDIA_STATE_POINTERS 5611 * 5612 * Those are always reissued following SBA updates anyway (new 5613 * batch time), except in the case of the program cache BO 5614 * changing. Having a separate state flag makes the sequence more 5615 * obvious. 
5616 */ 5617#if GFX_VER <= 5 5618 batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS; 5619#elif GFX_VER == 6 5620 batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS; 5621#endif 5622 batch->state_base_address_emitted = true; 5623} 5624 5625static inline void 5626crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, 5627 bool window_space_position, float *zmin, float *zmax) 5628{ 5629 if (window_space_position) { 5630 *zmin = 0.f; 5631 *zmax = 1.f; 5632 return; 5633 } 5634 util_viewport_zmin_zmax(vp, halfz, zmin, zmax); 5635} 5636 5637struct push_bos { 5638 struct { 5639 struct crocus_address addr; 5640 uint32_t length; 5641 } buffers[4]; 5642 int buffer_count; 5643 uint32_t max_length; 5644}; 5645 5646#if GFX_VER >= 6 5647static void 5648setup_constant_buffers(struct crocus_context *ice, 5649 struct crocus_batch *batch, 5650 int stage, 5651 struct push_bos *push_bos) 5652{ 5653 struct crocus_shader_state *shs = &ice->state.shaders[stage]; 5654 struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; 5655 struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; 5656 5657 uint32_t push_range_sum = 0; 5658 5659 int n = 0; 5660 for (int i = 0; i < 4; i++) { 5661 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 5662 5663 if (range->length == 0) 5664 continue; 5665 5666 push_range_sum += range->length; 5667 5668 if (range->length > push_bos->max_length) 5669 push_bos->max_length = range->length; 5670 5671 /* Range block is a binding table index, map back to UBO index. 
*/
      unsigned block_index = crocus_bti_to_group_index(
         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
      assert(block_index != CROCUS_SURFACE_NOT_USED);

      struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
      struct crocus_resource *res = (void *) cbuf->buffer;

      /* Push ranges must start at a 32-byte-aligned offset. */
      assert(cbuf->buffer_offset % 32 == 0);

      push_bos->buffers[n].length = range->length;
      /* No backing buffer?  Point the range at the workaround BO so the
       * hardware reads something harmless instead of faulting. */
      push_bos->buffers[n].addr =
         res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
         : ro_bo(batch->ice->workaround_bo,
                 batch->ice->workaround_offset);
      n++;
   }

   /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
    *
    *    "The sum of all four read length fields must be less than or
    *    equal to the size of 64."
    */
   assert(push_range_sum <= 64);

   push_bos->buffer_count = n;
}

#if GFX_VER == 7
/**
 * Ivybridge workaround: emit a depth-stalling pipe-control write to the
 * workaround BO before certain VS state changes (see callers).
 */
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
   crocus_emit_pipe_control_write(batch,
                                  "vs workaround",
                                  PIPE_CONTROL_WRITE_IMMEDIATE
                                  | PIPE_CONTROL_DEPTH_STALL,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
#endif

/**
 * Emit the 3DSTATE_CONSTANT_XS packet for one stage, pointing the
 * hardware at the push-constant ranges collected by
 * setup_constant_buffers().  The packet is built from the VS template;
 * the subopcode is patched via push_constant_opcodes[] to address the
 * correct stage.
 */
static void
emit_push_constant_packets(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           int stage,
                           const struct push_bos *push_bos)
{
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
   UNUSED uint32_t mocs = crocus_mocs(NULL, &batch->screen->isl_dev);

#if GFX_VER == 7
   if (stage == MESA_SHADER_VERTEX) {
      if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
         gen7_emit_vs_workaround_flush(batch);
   }
#endif
   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
#if GFX_VER != 8
      /* MOCS is MBZ on Gen8 so we skip it there */
      pkt.ConstantBody.MOCS = mocs;
#endif

      if (prog_data) {
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         int n = push_bos->buffer_count;
         assert(n <= 4);
#if GFX_VERx10 >= 75
         const unsigned shift = 4 - n;
#else
         const unsigned shift = 0;
#endif
         for (int i = 0; i < n; i++) {
            pkt.ConstantBody.ReadLength[i + shift] =
               push_bos->buffers[i].length;
            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
         }
      }
#else
      /* Gen6: only a single constant buffer slot is programmed here. */
      if (prog_data) {
         int n = push_bos->buffer_count;
         assert (n <= 1);
         if (n == 1) {
            pkt.Buffer0Valid = true;
            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
         }
      }
#endif
   }
}

#endif

/* The generation-specific structure carrying depth/stencil state:
 * an inline command on Gen8, dynamic DEPTH_STENCIL_STATE on Gen6-7,
 * and part of COLOR_CALC_STATE on Gen4-5. */
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
#endif

/**
 * Translate the bound Gallium depth/stencil/alpha CSO into the
 * generation-appropriate genxml depth-stencil structure: depth
 * test/write plus front- and back-face stencil configuration.
 */
static inline void
set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
{
   struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
   ds->DepthTestEnable = cso->cso.depth_enabled;
   ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
   ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);

   /* Front-face stencil (stencil[0]). */
   ds->StencilFailOp = cso->cso.stencil[0].fail_op;
   ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
   ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
   ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);

   ds->StencilTestMask = cso->cso.stencil[0].valuemask;
   ds->StencilWriteMask = cso->cso.stencil[0].writemask;

   /* Back-face stencil (stencil[1]). */
   ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
   ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
   ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
/* (continuation of set_depth_stencil_bits, begun above) */
   ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);

   ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
   ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
   ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
   ds->StencilTestEnable = cso->cso.stencil[0].enabled;
   /* Stencil writes occur if either face's write mask is non-zero (the
    * back face only counts when two-sided stencil is enabled). */
   ds->StencilBufferWriteEnable =
      cso->cso.stencil[0].writemask != 0 ||
      (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
}

/**
 * Pack one VERTEX_BUFFER_STATE entry at *map and advance the map
 * pointer past it.
 *
 * \param buffer_id     hardware vertex buffer index
 * \param start_offset  byte offset of the first vertex in \p bo
 * \param end_offset    byte offset one past the end of the buffer
 * \param stride        per-vertex pitch in bytes
 * \param step_rate     0 for per-vertex data; non-zero selects
 *                      per-instance stepping at that rate (pre-Gen8)
 */
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
                         unsigned buffer_id,
                         struct crocus_bo *bo,
                         unsigned start_offset,
                         unsigned end_offset,
                         unsigned stride,
                         unsigned step_rate,
                         uint32_t **map)
{
   const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
      vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
      vb.BufferSize = end_offset - start_offset;
#endif
      vb.VertexBufferIndex = buffer_id;
      vb.BufferPitch = stride;
#if GFX_VER >= 7
      vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
      /* Pre-Gen8: access type + step rate instead of a size field, and
       * (Gen5+) an inclusive end address. */
      vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
      vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
      vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
   }
   *map += vb_dwords;
}

#if GFX_VER >= 6
/**
 * Compute the 3DSTATE_SAMPLE_MASK value: the API sample mask limited
 * to the framebuffer's sample count.  Non-MSAA framebuffers always use
 * a mask of 1.
 */
static uint32_t
determine_sample_mask(struct crocus_context *ice)
{
   uint32_t num_samples = ice->state.framebuffer.samples;

   if (num_samples <= 1)
      return 1;

   uint32_t fb_mask = (1 << num_samples) - 1;
   return ice->state.sample_mask & fb_mask;
}
#endif

/**
 * Re-emit any render state marked dirty since the last draw.
 *
 * Each dirty bit below guards the streaming/emission of one piece of
 * hardware state.  (NOTE(review): this function continues well past
 * the end of this excerpt.)
 */
static void
crocus_upload_dirty_render_state(struct crocus_context *ice,
                                 struct crocus_batch *batch,
                                 const struct pipe_draw_info *draw)
{
   uint64_t dirty = ice->state.dirty;
   uint64_t stage_dirty = ice->state.stage_dirty;

   if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
       !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
      return;

   if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
      crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
         vf.StatisticsEnable = true;
      }
   }

#if GFX_VER <= 5
   /* Recompute CURBE offsets when any stage's constants changed; a new
    * layout dirties the dependent state as well. */
   if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
                      CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
      bool ret = calculate_curbe_offsets(batch);
      if (ret) {
         dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
         stage_dirty |= CROCUS_STAGE_DIRTY_VS;
      }
   }

   /* Re-derive the URB fence; a change cascades into more dirty state. */
   if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
       stage_dirty & CROCUS_STAGE_DIRTY_VS) {
      bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
                                            brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
                                            ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
      if (ret) {
         dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
         stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
      }
   }
#endif
   if (dirty &
CROCUS_DIRTY_CC_VIEWPORT) { 5904 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; 5905 uint32_t cc_vp_address; 5906 5907 /* XXX: could avoid streaming for depth_clip [0,1] case. */ 5908 uint32_t *cc_vp_map = 5909 stream_state(batch, 5910 4 * ice->state.num_viewports * 5911 GENX(CC_VIEWPORT_length), 32, &cc_vp_address); 5912 for (int i = 0; i < ice->state.num_viewports; i++) { 5913 float zmin, zmax; 5914 crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz, 5915 ice->state.window_space_position, 5916 &zmin, &zmax); 5917 if (cso_rast->cso.depth_clip_near) 5918 zmin = 0.0; 5919 if (cso_rast->cso.depth_clip_far) 5920 zmax = 1.0; 5921 5922 crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) { 5923 ccv.MinimumDepth = zmin; 5924 ccv.MaximumDepth = zmax; 5925 } 5926 5927 cc_vp_map += GENX(CC_VIEWPORT_length); 5928 } 5929 5930#if GFX_VER >= 7 5931 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) { 5932 ptr.CCViewportPointer = cc_vp_address; 5933 } 5934#elif GFX_VER == 6 5935 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) { 5936 vp.CCViewportStateChange = 1; 5937 vp.PointertoCC_VIEWPORT = cc_vp_address; 5938 } 5939#else 5940 ice->state.cc_vp_address = cc_vp_address; 5941 dirty |= CROCUS_DIRTY_COLOR_CALC_STATE; 5942#endif 5943 } 5944 5945 if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) { 5946 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; 5947#if GFX_VER >= 7 5948 uint32_t sf_cl_vp_address; 5949 uint32_t *vp_map = 5950 stream_state(batch, 5951 4 * ice->state.num_viewports * 5952 GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address); 5953#else 5954 uint32_t *vp_map = 5955 stream_state(batch, 5956 4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length), 5957 32, &ice->state.sf_vp_address); 5958 uint32_t *clip_map = 5959 stream_state(batch, 5960 4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length), 5961 32, &ice->state.clip_vp_address); 5962#endif 5963 5964 for (unsigned i = 0; 
i < ice->state.num_viewports; i++) { 5965 const struct pipe_viewport_state *state = &ice->state.viewports[i]; 5966 float gb_xmin, gb_xmax, gb_ymin, gb_ymax; 5967 5968#if GFX_VER == 8 5969 float vp_xmin = viewport_extent(state, 0, -1.0f); 5970 float vp_xmax = viewport_extent(state, 0, 1.0f); 5971 float vp_ymin = viewport_extent(state, 1, -1.0f); 5972 float vp_ymax = viewport_extent(state, 1, 1.0f); 5973#endif 5974 intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height, 5975 state->scale[0], state->scale[1], 5976 state->translate[0], state->translate[1], 5977 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax); 5978#if GFX_VER >= 7 5979 crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) 5980#else 5981 crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp) 5982#endif 5983 { 5984 vp.ViewportMatrixElementm00 = state->scale[0]; 5985 vp.ViewportMatrixElementm11 = state->scale[1]; 5986 vp.ViewportMatrixElementm22 = state->scale[2]; 5987 vp.ViewportMatrixElementm30 = state->translate[0]; 5988 vp.ViewportMatrixElementm31 = state->translate[1]; 5989 vp.ViewportMatrixElementm32 = state->translate[2]; 5990#if GFX_VER < 6 5991 struct pipe_scissor_state scissor; 5992 crocus_fill_scissor_rect(ice, 0, &scissor); 5993 vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx; 5994 vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx; 5995 vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny; 5996 vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy; 5997#endif 5998 5999#if GFX_VER >= 7 6000 vp.XMinClipGuardband = gb_xmin; 6001 vp.XMaxClipGuardband = gb_xmax; 6002 vp.YMinClipGuardband = gb_ymin; 6003 vp.YMaxClipGuardband = gb_ymax; 6004#endif 6005#if GFX_VER == 8 6006 vp.XMinViewPort = MAX2(vp_xmin, 0); 6007 vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1; 6008 vp.YMinViewPort = MAX2(vp_ymin, 0); 6009 vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1; 6010#endif 6011 } 6012#if GFX_VER < 7 6013 crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) { 6014 
clip.XMinClipGuardband = gb_xmin; 6015 clip.XMaxClipGuardband = gb_xmax; 6016 clip.YMinClipGuardband = gb_ymin; 6017 clip.YMaxClipGuardband = gb_ymax; 6018 } 6019#endif 6020#if GFX_VER >= 7 6021 vp_map += GENX(SF_CLIP_VIEWPORT_length); 6022#else 6023 vp_map += GENX(SF_VIEWPORT_length); 6024 clip_map += GENX(CLIP_VIEWPORT_length); 6025#endif 6026 } 6027#if GFX_VER >= 7 6028 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) { 6029 ptr.SFClipViewportPointer = sf_cl_vp_address; 6030 } 6031#elif GFX_VER == 6 6032 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) { 6033 vp.SFViewportStateChange = 1; 6034 vp.CLIPViewportStateChange = 1; 6035 vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address; 6036 vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address; 6037 } 6038#endif 6039 } 6040 6041#if GFX_VER >= 6 6042 if (dirty & CROCUS_DIRTY_GEN6_URB) { 6043#if GFX_VER == 6 6044 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL 6045 || ice->shaders.ff_gs_prog; 6046 6047 struct brw_vue_prog_data *vue_prog_data = 6048 (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data; 6049 const unsigned vs_size = vue_prog_data->urb_entry_size; 6050 unsigned gs_size = vs_size; 6051 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) { 6052 struct brw_vue_prog_data *gs_vue_prog_data = 6053 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data; 6054 gs_size = gs_vue_prog_data->urb_entry_size; 6055 } 6056 6057 genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size); 6058#endif 6059#if GFX_VER >= 7 6060 const struct intel_device_info *devinfo = &batch->screen->devinfo; 6061 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL; 6062 bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL; 6063 unsigned entry_size[4]; 6064 6065 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { 6066 if (!ice->shaders.prog[i]) { 6067 entry_size[i] = 1; 6068 } else { 6069 struct brw_vue_prog_data *vue_prog_data = 
6070 (void *) ice->shaders.prog[i]->prog_data; 6071 entry_size[i] = vue_prog_data->urb_entry_size; 6072 } 6073 assert(entry_size[i] != 0); 6074 } 6075 6076 /* If we're just switching between programs with the same URB requirements, 6077 * skip the rest of the logic. 6078 */ 6079 bool no_change = false; 6080 if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] && 6081 ice->urb.gs_present == gs_present && 6082 ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] && 6083 ice->urb.tess_present == tess_present && 6084 ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] && 6085 ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) { 6086 no_change = true; 6087 } 6088 6089 if (!no_change) { 6090 ice->urb.vsize = entry_size[MESA_SHADER_VERTEX]; 6091 ice->urb.gs_present = gs_present; 6092 ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY]; 6093 ice->urb.tess_present = tess_present; 6094 ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL]; 6095 ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL]; 6096 6097 unsigned entries[4]; 6098 unsigned start[4]; 6099 bool constrained; 6100 intel_get_urb_config(devinfo, 6101 batch->screen->l3_config_3d, 6102 tess_present, 6103 gs_present, 6104 entry_size, 6105 entries, start, NULL, &constrained); 6106 6107#if GFX_VER == 7 6108 if (devinfo->platform == INTEL_PLATFORM_IVB) 6109 gen7_emit_vs_workaround_flush(batch); 6110#endif 6111 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { 6112 crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) { 6113 urb._3DCommandSubOpcode += i; 6114 urb.VSURBStartingAddress = start[i]; 6115 urb.VSURBEntryAllocationSize = entry_size[i] - 1; 6116 urb.VSNumberofURBEntries = entries[i]; 6117 } 6118 } 6119 } 6120#endif 6121 } 6122 6123 if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) { 6124 struct crocus_blend_state *cso_blend = ice->state.cso_blend; 6125 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; 6126 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa; 6127 6128 
STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2); 6129 int rt_dwords = 6130 MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length); 6131#if GFX_VER >= 8 6132 rt_dwords += GENX(BLEND_STATE_length); 6133#endif 6134 uint32_t blend_offset; 6135 uint32_t *blend_map = 6136 stream_state(batch, 6137 4 * rt_dwords, 64, &blend_offset); 6138 6139#if GFX_VER >= 8 6140 struct GENX(BLEND_STATE) be = { 0 }; 6141 { 6142#else 6143 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) { 6144 struct GENX(BLEND_STATE_ENTRY) entry = { 0 }; 6145#define be entry 6146#endif 6147 6148 be.AlphaTestEnable = cso_zsa->cso.alpha_enabled; 6149 be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func); 6150 be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage; 6151 be.AlphaToOneEnable = cso_blend->cso.alpha_to_one; 6152 be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage; 6153 be.ColorDitherEnable = cso_blend->cso.dither; 6154 6155#if GFX_VER >= 8 6156 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) { 6157 struct GENX(BLEND_STATE_ENTRY) entry = { 0 }; 6158#else 6159 { 6160#endif 6161 const struct pipe_rt_blend_state *rt = 6162 &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? 
i : 0]; 6163 6164 be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) || 6165 be.IndependentAlphaBlendEnable; 6166 6167 if (GFX_VER >= 8 || can_emit_logic_op(ice)) { 6168 entry.LogicOpEnable = cso_blend->cso.logicop_enable; 6169 entry.LogicOpFunction = cso_blend->cso.logicop_func; 6170 } 6171 6172 entry.ColorClampRange = COLORCLAMP_RTFORMAT; 6173 entry.PreBlendColorClampEnable = true; 6174 entry.PostBlendColorClampEnable = true; 6175 6176 entry.WriteDisableRed = !(rt->colormask & PIPE_MASK_R); 6177 entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G); 6178 entry.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B); 6179 entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A); 6180 6181#if GFX_VER >= 8 6182 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry); 6183#else 6184 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry); 6185#endif 6186 } 6187 } 6188#if GFX_VER >= 8 6189 GENX(BLEND_STATE_pack)(NULL, blend_map, &be); 6190#endif 6191#if GFX_VER < 7 6192 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 6193 ptr.PointertoBLEND_STATE = blend_offset; 6194 ptr.BLEND_STATEChange = true; 6195 } 6196#else 6197 crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) { 6198 ptr.BlendStatePointer = blend_offset; 6199#if GFX_VER >= 8 6200 ptr.BlendStatePointerValid = true; 6201#endif 6202 } 6203#endif 6204 } 6205#endif 6206 6207 if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) { 6208 struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa; 6209 UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend; 6210 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref; 6211 uint32_t cc_offset; 6212 void *cc_map = 6213 stream_state(batch, 6214 sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length), 6215 64, &cc_offset); 6216#if GFX_VER <= 5 6217 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; 6218#endif 6219 _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) { 6220 
cc.AlphaTestFormat = ALPHATEST_FLOAT32; 6221 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value; 6222 6223#if GFX_VER <= 5 6224 6225 set_depth_stencil_bits(ice, &cc); 6226 6227 if (cso_blend->cso.logicop_enable) { 6228 if (can_emit_logic_op(ice)) { 6229 cc.LogicOpEnable = cso_blend->cso.logicop_enable; 6230 cc.LogicOpFunction = cso_blend->cso.logicop_func; 6231 } 6232 } 6233 cc.ColorDitherEnable = cso_blend->cso.dither; 6234 6235 cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0); 6236 6237 if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) { 6238 cc.AlphaTestEnable = cso->cso.alpha_enabled; 6239 cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func); 6240 } 6241 cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0; 6242 cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address); 6243#else 6244 cc.AlphaTestFormat = ALPHATEST_FLOAT32; 6245 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value; 6246 6247 cc.BlendConstantColorRed = ice->state.blend_color.color[0]; 6248 cc.BlendConstantColorGreen = ice->state.blend_color.color[1]; 6249 cc.BlendConstantColorBlue = ice->state.blend_color.color[2]; 6250 cc.BlendConstantColorAlpha = ice->state.blend_color.color[3]; 6251#endif 6252 cc.StencilReferenceValue = p_stencil_refs->ref_value[0]; 6253 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1]; 6254 } 6255 ice->shaders.cc_offset = cc_offset; 6256#if GFX_VER >= 6 6257 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 6258 ptr.ColorCalcStatePointer = cc_offset; 6259#if GFX_VER != 7 6260 ptr.ColorCalcStatePointerValid = true; 6261#endif 6262 } 6263#endif 6264 } 6265#if GFX_VER <= 5 6266 if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) { 6267 crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) { 6268 blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0]; 6269 blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1]; 6270 
blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2]; 6271 blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3]; 6272 } 6273 } 6274#endif 6275 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { 6276 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage))) 6277 continue; 6278 6279 struct crocus_shader_state *shs = &ice->state.shaders[stage]; 6280 struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; 6281 6282 if (!shader) 6283 continue; 6284 6285 if (shs->sysvals_need_upload) 6286 upload_sysvals(ice, stage); 6287 6288#if GFX_VER <= 5 6289 dirty |= CROCUS_DIRTY_GEN4_CURBE; 6290#endif 6291#if GFX_VER >= 7 6292 struct push_bos push_bos = {}; 6293 setup_constant_buffers(ice, batch, stage, &push_bos); 6294 6295 emit_push_constant_packets(ice, batch, stage, &push_bos); 6296#endif 6297 } 6298 6299 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { 6300 if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) { 6301 if (ice->shaders.prog[stage]) { 6302#if GFX_VER <= 6 6303 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS; 6304#endif 6305 crocus_populate_binding_table(ice, batch, stage, false); 6306 ice->shaders.prog[stage]->bind_bo_offset = 6307 crocus_upload_binding_table(ice, batch, 6308 ice->shaders.prog[stage]->surf_offset, 6309 ice->shaders.prog[stage]->bt.size_bytes); 6310 6311#if GFX_VER >= 7 6312 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) { 6313 ptr._3DCommandSubOpcode = 38 + stage; 6314 ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset; 6315 } 6316#endif 6317#if GFX_VER == 6 6318 } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) { 6319 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS; 6320 crocus_populate_binding_table(ice, batch, stage, true); 6321 ice->shaders.ff_gs_prog->bind_bo_offset = 6322 crocus_upload_binding_table(ice, batch, 6323 ice->shaders.ff_gs_prog->surf_offset, 6324 ice->shaders.ff_gs_prog->bt.size_bytes); 
6325#endif 6326 } 6327 } 6328 } 6329#if GFX_VER <= 6 6330 if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) { 6331 struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY]; 6332 if (gs == NULL) 6333 gs = ice->shaders.ff_gs_prog; 6334 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) { 6335 ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset; 6336 ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset; 6337#if GFX_VER == 6 6338 ptr.VSBindingTableChange = true; 6339 ptr.PSBindingTableChange = true; 6340 ptr.GSBindingTableChange = gs ? true : false; 6341 ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0; 6342#endif 6343 } 6344 } 6345#endif 6346 6347 bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS; 6348 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { 6349 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) || 6350 !ice->shaders.prog[stage]) 6351 continue; 6352 6353 crocus_upload_sampler_states(ice, batch, stage); 6354 6355 sampler_updates = true; 6356 6357#if GFX_VER >= 7 6358 struct crocus_shader_state *shs = &ice->state.shaders[stage]; 6359 6360 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) { 6361 ptr._3DCommandSubOpcode = 43 + stage; 6362 ptr.PointertoVSSamplerState = shs->sampler_offset; 6363 } 6364#endif 6365 } 6366 6367 if (sampler_updates) { 6368#if GFX_VER == 6 6369 struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX]; 6370 struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY]; 6371 struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT]; 6372 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) { 6373 if (ice->shaders.prog[MESA_SHADER_VERTEX] && 6374 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS || 6375 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) { 6376 
ptr.VSSamplerStateChange = true; 6377 ptr.PointertoVSSamplerState = shs_vs->sampler_offset; 6378 } 6379 if (ice->shaders.prog[MESA_SHADER_GEOMETRY] && 6380 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS || 6381 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) { 6382 ptr.GSSamplerStateChange = true; 6383 ptr.PointertoGSSamplerState = shs_gs->sampler_offset; 6384 } 6385 if (ice->shaders.prog[MESA_SHADER_FRAGMENT] && 6386 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS || 6387 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) { 6388 ptr.PSSamplerStateChange = true; 6389 ptr.PointertoPSSamplerState = shs_fs->sampler_offset; 6390 } 6391 } 6392#endif 6393 } 6394 6395#if GFX_VER >= 6 6396 if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) { 6397 crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) { 6398 ms.PixelLocation = 6399 ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER; 6400 if (ice->state.framebuffer.samples > 0) 6401 ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1; 6402#if GFX_VER == 6 6403 INTEL_SAMPLE_POS_4X(ms.Sample); 6404#elif GFX_VER == 7 6405 switch (ice->state.framebuffer.samples) { 6406 case 1: 6407 INTEL_SAMPLE_POS_1X(ms.Sample); 6408 break; 6409 case 2: 6410 INTEL_SAMPLE_POS_2X(ms.Sample); 6411 break; 6412 case 4: 6413 INTEL_SAMPLE_POS_4X(ms.Sample); 6414 break; 6415 case 8: 6416 INTEL_SAMPLE_POS_8X(ms.Sample); 6417 break; 6418 default: 6419 break; 6420 } 6421#endif 6422 } 6423 } 6424 6425 if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) { 6426 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) { 6427 ms.SampleMask = determine_sample_mask(ice); 6428 } 6429 } 6430#endif 6431 6432#if GFX_VER >= 7 6433 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT]; 6434 if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) { 6435 struct brw_stage_prog_data *prog_data = shader->prog_data; 6436 struct brw_wm_prog_data *wm_prog_data = (void *) 
shader->prog_data; 6437 6438 crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) { 6439 6440 /* Initialize the execution mask with VMask. Otherwise, derivatives are 6441 * incorrect for subspans where some of the pixels are unlit. We believe 6442 * the bit just didn't take effect in previous generations. 6443 */ 6444 ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask; 6445 6446 ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; 6447 ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; 6448 ps._32PixelDispatchEnable = wm_prog_data->dispatch_32; 6449 6450 ps.DispatchGRFStartRegisterForConstantSetupData0 = 6451 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0); 6452 ps.DispatchGRFStartRegisterForConstantSetupData1 = 6453 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1); 6454 ps.DispatchGRFStartRegisterForConstantSetupData2 = 6455 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2); 6456 6457 ps.KernelStartPointer0 = KSP(ice, shader) + 6458 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); 6459 ps.KernelStartPointer1 = KSP(ice, shader) + 6460 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); 6461 ps.KernelStartPointer2 = KSP(ice, shader) + 6462 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); 6463 6464#if GFX_VERx10 == 75 6465 ps.SampleMask = determine_sample_mask(ice); 6466#endif 6467 // XXX: WABTPPrefetchDisable, see above, drop at C0 6468 ps.BindingTableEntryCount = shader->bt.size_bytes / 4; 6469 ps.FloatingPointMode = prog_data->use_alt_mode; 6470#if GFX_VER >= 8 6471 ps.MaximumNumberofThreadsPerPSD = 6472 batch->screen->devinfo.max_threads_per_psd - 2; 6473#else 6474 ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1; 6475#endif 6476 6477 ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0; 6478 6479#if GFX_VER < 8 6480 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; 6481 ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending; 6482 
ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0); 6483#endif 6484 /* From the documentation for this packet: 6485 * "If the PS kernel does not need the Position XY Offsets to 6486 * compute a Position Value, then this field should be programmed 6487 * to POSOFFSET_NONE." 6488 * 6489 * "SW Recommendation: If the PS kernel needs the Position Offsets 6490 * to compute a Position XY value, this field should match Position 6491 * ZW Interpolation Mode to ensure a consistent position.xyzw 6492 * computation." 6493 * 6494 * We only require XY sample offsets. So, this recommendation doesn't 6495 * look useful at the moment. We might need this in future. 6496 */ 6497 ps.PositionXYOffsetSelect = 6498 wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE; 6499 6500 if (wm_prog_data->base.total_scratch) { 6501 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT); 6502 ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11; 6503 ps.ScratchSpaceBasePointer = rw_bo(bo, 0); 6504 } 6505 } 6506#if GFX_VER == 8 6507 const struct shader_info *fs_info = 6508 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT); 6509 crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) { 6510 psx.PixelShaderValid = true; 6511 psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; 6512 psx.PixelShaderKillsPixel = wm_prog_data->uses_kill; 6513 psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0; 6514 psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; 6515 psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; 6516 psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch; 6517 6518 /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */ 6519 if (wm_prog_data->uses_sample_mask) 6520 psx.PixelShaderUsesInputCoverageMask = true; 6521 6522 psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; 6523 6524 /* The stricter cross-primitive coherency guarantees that the hardware 
6525 * gives us with the "Accesses UAV" bit set for at least one shader stage 6526 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command 6527 * are redundant within the current image, atomic counter and SSBO GL 6528 * APIs, which all have very loose ordering and coherency requirements 6529 * and generally rely on the application to insert explicit barriers when 6530 * a shader invocation is expected to see the memory writes performed by 6531 * the invocations of some previous primitive. Regardless of the value 6532 * of "UAV coherency required", the "Accesses UAV" bits will implicitly 6533 * cause an in most cases useless DC flush when the lowermost stage with 6534 * the bit set finishes execution. 6535 * 6536 * It would be nice to disable it, but in some cases we can't because on 6537 * Gfx8+ it also has an influence on rasterization via the PS UAV-only 6538 * signal (which could be set independently from the coherency mechanism 6539 * in the 3DSTATE_WM command on Gfx7), and because in some cases it will 6540 * determine whether the hardware skips execution of the fragment shader 6541 * or not via the ThreadDispatchEnable signal. However if we know that 6542 * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and 6543 * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any 6544 * difference so we may just disable it here. 6545 * 6546 * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't 6547 * take into account KillPixels when no depth or stencil writes are 6548 * enabled. In order for occlusion queries to work correctly with no 6549 * attachments, we need to force-enable here. 
6550 * 6551 */ 6552 if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) && 6553 !(has_writeable_rt(ice->state.cso_blend, fs_info))) 6554 psx.PixelShaderHasUAV = true; 6555 } 6556#endif 6557 } 6558#endif 6559 6560#if GFX_VER >= 7 6561 if (ice->state.streamout_active) { 6562 if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) { 6563 for (int i = 0; i < 4; i++) { 6564 struct crocus_stream_output_target *tgt = 6565 (void *) ice->state.so_target[i]; 6566 6567 if (!tgt) { 6568 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) { 6569 sob.SOBufferIndex = i; 6570 sob.MOCS = crocus_mocs(NULL, &batch->screen->isl_dev); 6571 } 6572 continue; 6573 } 6574 struct crocus_resource *res = (void *) tgt->base.buffer; 6575 uint32_t start = tgt->base.buffer_offset; 6576#if GFX_VER < 8 6577 uint32_t end = ALIGN(start + tgt->base.buffer_size, 4); 6578#endif 6579 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) { 6580 sob.SOBufferIndex = i; 6581 6582 sob.SurfaceBaseAddress = rw_bo(res->bo, start); 6583 sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev); 6584#if GFX_VER < 8 6585 sob.SurfacePitch = tgt->stride; 6586 sob.SurfaceEndAddress = rw_bo(res->bo, end); 6587#else 6588 sob.SOBufferEnable = true; 6589 sob.StreamOffsetWriteEnable = true; 6590 sob.StreamOutputBufferOffsetAddressEnable = true; 6591 6592 sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1; 6593 sob.StreamOutputBufferOffsetAddress = 6594 rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset); 6595 if (tgt->zero_offset) { 6596 sob.StreamOffset = 0; 6597 tgt->zero_offset = false; 6598 } else 6599 sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */ 6600#endif 6601 } 6602 } 6603 } 6604 6605 if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) { 6606 uint32_t *decl_list = 6607 ice->state.streamout + GENX(3DSTATE_STREAMOUT_length); 6608 crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2)); 6609 } 6610 6611 if (dirty & CROCUS_DIRTY_STREAMOUT) { 6612 const 
struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; 6613 6614 uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)]; 6615 crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) { 6616 sol.SOFunctionEnable = true; 6617 sol.SOStatisticsEnable = true; 6618 6619 sol.RenderingDisable = cso_rast->cso.rasterizer_discard && 6620 !ice->state.prims_generated_query_active; 6621 sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING; 6622 } 6623 6624 assert(ice->state.streamout); 6625 6626 crocus_emit_merge(batch, ice->state.streamout, dynamic_sol, 6627 GENX(3DSTATE_STREAMOUT_length)); 6628 } 6629 } else { 6630 if (dirty & CROCUS_DIRTY_STREAMOUT) { 6631 crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol); 6632 } 6633 } 6634#endif 6635#if GFX_VER == 6 6636 if (ice->state.streamout_active) { 6637 if (dirty & CROCUS_DIRTY_GEN6_SVBI) { 6638 crocus_emit_so_svbi(ice); 6639 } 6640 } 6641#endif 6642 6643 if (dirty & CROCUS_DIRTY_CLIP) { 6644#if GFX_VER < 6 6645 const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data; 6646 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso; 6647 6648 uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset); 6649 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; 6650 _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) { 6651 clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog); 6652 clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 6653 clip.SingleProgramFlow = true; 6654 clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1; 6655 6656 clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length; 6657 clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length; 6658 6659 clip.DispatchGRFStartRegisterForURBData = 1; 6660 clip.VertexURBEntryReadOffset = 0; 6661 clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2; 6662 6663 clip.NumberofURBEntries = 
batch->ice->urb.nr_clip_entries; 6664 clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1; 6665 6666 if (batch->ice->urb.nr_clip_entries >= 10) { 6667 /* Half of the URB entries go to each thread, and it has to be an 6668 * even number. 6669 */ 6670 assert(batch->ice->urb.nr_clip_entries % 2 == 0); 6671 6672 /* Although up to 16 concurrent Clip threads are allowed on Ironlake, 6673 * only 2 threads can output VUEs at a time. 6674 */ 6675 clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1; 6676 } else { 6677 assert(batch->ice->urb.nr_clip_entries >= 5); 6678 clip.MaximumNumberofThreads = 1 - 1; 6679 } 6680 clip.VertexPositionSpace = VPOS_NDCSPACE; 6681 clip.UserClipFlagsMustClipEnable = true; 6682 clip.GuardbandClipTestEnable = true; 6683 6684 clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address); 6685 clip.ScreenSpaceViewportXMin = -1.0; 6686 clip.ScreenSpaceViewportXMax = 1.0; 6687 clip.ScreenSpaceViewportYMin = -1.0; 6688 clip.ScreenSpaceViewportYMax = 1.0; 6689 clip.ViewportXYClipTestEnable = true; 6690 clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far); 6691 6692#if GFX_VER == 5 || GFX_VERx10 == 45 6693 clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable; 6694#else 6695 /* Up to 6 actual clip flags, plus the 7th for the negative RHW 6696 * workaround. 6697 */ 6698 clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40; 6699#endif 6700 6701 clip.APIMode = cso_state->clip_halfz ? 
APIMODE_D3D : APIMODE_OGL; 6702 clip.GuardbandClipTestEnable = true; 6703 6704 clip.ClipMode = clip_prog_data->clip_mode; 6705#if GFX_VERx10 == 45 6706 clip.NegativeWClipTestEnable = true; 6707#endif 6708 } 6709 6710#else //if GFX_VER >= 6 6711 struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; 6712 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data ); 6713 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; 6714 bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] || 6715 ice->shaders.prog[MESA_SHADER_TESS_EVAL]; 6716 bool points_or_lines = cso_rast->fill_mode_point_or_line || 6717 (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines 6718 : ice->state.prim_is_points_or_lines); 6719 uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)]; 6720 crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) { 6721 cl.StatisticsEnable = ice->state.statistics_counters_enabled; 6722 if (cso_rast->cso.rasterizer_discard) 6723 cl.ClipMode = CLIPMODE_REJECT_ALL; 6724 else if (ice->state.window_space_position) 6725 cl.ClipMode = CLIPMODE_ACCEPT_ALL; 6726 else 6727 cl.ClipMode = CLIPMODE_NORMAL; 6728 6729 cl.PerspectiveDivideDisable = ice->state.window_space_position; 6730 cl.ViewportXYClipTestEnable = !points_or_lines; 6731 6732 cl.UserClipDistanceCullTestEnableBitmask = 6733 brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask; 6734 6735 cl.NonPerspectiveBarycentricEnable = wm_prog_data->uses_nonperspective_interp_modes; 6736 6737 cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1; 6738 cl.MaximumVPIndex = ice->state.num_viewports - 1; 6739 } 6740 crocus_emit_merge(batch, cso_rast->clip, dynamic_clip, 6741 ARRAY_SIZE(cso_rast->clip)); 6742#endif 6743 } 6744 6745 if (stage_dirty & CROCUS_STAGE_DIRTY_VS) { 6746 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX]; 6747 const struct brw_vue_prog_data *vue_prog_data = 
brw_vue_prog_data(shader->prog_data); 6748 const struct brw_stage_prog_data *prog_data = &vue_prog_data->base; 6749#if GFX_VER == 7 6750 if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB) 6751 gen7_emit_vs_workaround_flush(batch); 6752#endif 6753 6754 6755#if GFX_VER == 6 6756 struct push_bos push_bos = {}; 6757 setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos); 6758 6759 emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos); 6760#endif 6761#if GFX_VER >= 6 6762 crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs) 6763#else 6764 uint32_t *vs_ptr = stream_state(batch, 6765 GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset); 6766 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; 6767 _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs) 6768#endif 6769 { 6770 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX); 6771 6772 vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1; 6773 6774#if GFX_VER < 6 6775 vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1; 6776 vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length; 6777 vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2; 6778 6779 vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0); 6780 vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1; 6781 6782 vs.MaximumNumberofThreads = 6783 CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1; 6784 vs.StatisticsEnable = false; 6785 vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset); 6786#endif 6787#if GFX_VER == 5 6788 /* Force single program flow on Ironlake. We cannot reliably get 6789 * all applications working without it. 
See: 6790 * https://bugs.freedesktop.org/show_bug.cgi?id=29172 6791 * 6792 * The most notable and reliably failing application is the Humus 6793 * demo "CelShading" 6794 */ 6795 vs.SingleProgramFlow = true; 6796 vs.SamplerCount = 0; /* hardware requirement */ 6797 6798#endif 6799#if GFX_VER >= 8 6800 vs.SIMD8DispatchEnable = 6801 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8; 6802 6803 vs.UserClipDistanceCullTestEnableBitmask = 6804 vue_prog_data->cull_distance_mask; 6805#endif 6806 } 6807 6808#if GFX_VER == 6 6809 crocus_emit_pipe_control_flush(batch, 6810 "post VS const", 6811 PIPE_CONTROL_DEPTH_STALL | 6812 PIPE_CONTROL_INSTRUCTION_INVALIDATE | 6813 PIPE_CONTROL_STATE_CACHE_INVALIDATE); 6814#endif 6815 } 6816 6817 if (stage_dirty & CROCUS_STAGE_DIRTY_GS) { 6818 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY]; 6819 bool active = GFX_VER >= 6 && shader; 6820#if GFX_VER == 6 6821 struct push_bos push_bos = {}; 6822 if (shader) 6823 setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos); 6824 6825 emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos); 6826#endif 6827#if GFX_VERx10 == 70 6828 /** 6829 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > 6830 * Geometry > Geometry Shader > State: 6831 * 6832 * "Note: Because of corruption in IVB:GT2, software needs to flush the 6833 * whole fixed function pipeline when the GS enable changes value in 6834 * the 3DSTATE_GS." 6835 * 6836 * The hardware architects have clarified that in this context "flush the 6837 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS 6838 * Stall" bit set. 
6839 */ 6840 if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active) 6841 gen7_emit_cs_stall_flush(batch); 6842#endif 6843#if GFX_VER >= 6 6844 crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs) 6845#else 6846 uint32_t *gs_ptr = stream_state(batch, 6847 GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset); 6848 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; 6849 _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs) 6850#endif 6851 { 6852#if GFX_VER >= 6 6853 if (active) { 6854 const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data); 6855 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data); 6856 const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base; 6857 6858 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY); 6859#if GFX_VER >= 7 6860 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; 6861 gs.OutputTopology = gs_prog_data->output_topology; 6862 gs.ControlDataHeaderSize = 6863 gs_prog_data->control_data_header_size_hwords; 6864 6865 gs.InstanceControl = gs_prog_data->invocations - 1; 6866 gs.DispatchMode = vue_prog_data->dispatch_mode; 6867 6868 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; 6869 6870 gs.ControlDataFormat = gs_prog_data->control_data_format; 6871#endif 6872 6873 /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between 6874 * Ivy Bridge and Haswell. 6875 * 6876 * On Ivy Bridge, setting this bit causes the vertices of a triangle 6877 * strip to be delivered to the geometry shader in an order that does 6878 * not strictly follow the OpenGL spec, but preserves triangle 6879 * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then 6880 * the geometry shader sees triangles: 6881 * 6882 * (1, 2, 3), (2, 4, 3), (3, 4, 5) 6883 * 6884 * (Clearing the bit is even worse, because it fails to preserve 6885 * orientation). 
6886 * 6887 * Triangle strips with adjacency always ordered in a way that preserves 6888 * triangle orientation but does not strictly follow the OpenGL spec, 6889 * regardless of the setting of this bit. 6890 * 6891 * On Haswell, both triangle strips and triangle strips with adjacency 6892 * are always ordered in a way that preserves triangle orientation. 6893 * Setting this bit causes the ordering to strictly follow the OpenGL 6894 * spec. 6895 * 6896 * So in either case we want to set the bit. Unfortunately on Ivy 6897 * Bridge this will get the order close to correct but not perfect. 6898 */ 6899 gs.ReorderMode = TRAILING; 6900 gs.MaximumNumberofThreads = 6901 GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) : 6902 (batch->screen->devinfo.max_gs_threads - 1); 6903#if GFX_VER < 7 6904 gs.SOStatisticsEnable = true; 6905 if (gs_prog_data->num_transform_feedback_bindings) 6906 gs.SVBIPayloadEnable = ice->state.streamout_active; 6907 6908 /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it 6909 * was previously done for gen6. 6910 * 6911 * TODO: test with both disabled to see if the HW is behaving 6912 * as expected, like in gen7. 
6913 */ 6914 gs.SingleProgramFlow = true; 6915 gs.VectorMaskEnable = true; 6916#endif 6917#if GFX_VER >= 8 6918 gs.ExpectedVertexCount = gs_prog_data->vertices_in; 6919 6920 if (gs_prog_data->static_vertex_count != -1) { 6921 gs.StaticOutput = true; 6922 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count; 6923 } 6924 gs.IncludeVertexHandles = vue_prog_data->include_vue_handles; 6925 6926 gs.UserClipDistanceCullTestEnableBitmask = 6927 vue_prog_data->cull_distance_mask; 6928 6929 const int urb_entry_write_offset = 1; 6930 const uint32_t urb_entry_output_length = 6931 DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) - 6932 urb_entry_write_offset; 6933 6934 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset; 6935 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1); 6936#endif 6937 } 6938#endif 6939#if GFX_VER <= 6 6940 if (!active && ice->shaders.ff_gs_prog) { 6941 const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data; 6942 /* In gen6, transform feedback for the VS stage is done with an 6943 * ad-hoc GS program. This function provides the needed 3DSTATE_GS 6944 * for this. 6945 */ 6946 gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog); 6947 gs.SingleProgramFlow = true; 6948 gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1; 6949 gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length; 6950 6951#if GFX_VER <= 5 6952 gs.GRFRegisterCount = 6953 DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1; 6954 /* BRW_NEW_URB_FENCE */ 6955 gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries; 6956 gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1; 6957 gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 
1 : 0; 6958 gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 6959#else 6960 gs.Enable = true; 6961 gs.VectorMaskEnable = true; 6962 gs.SVBIPayloadEnable = true; 6963 gs.SVBIPostIncrementEnable = true; 6964 gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value; 6965 gs.SOStatisticsEnable = true; 6966 gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1; 6967#endif 6968 } 6969#endif 6970 if (!active && !ice->shaders.ff_gs_prog) { 6971#if GFX_VER < 8 6972 gs.DispatchGRFStartRegisterForURBData = 1; 6973#if GFX_VER >= 7 6974 gs.IncludeVertexHandles = true; 6975#endif 6976#endif 6977 } 6978#if GFX_VER >= 6 6979 gs.StatisticsEnable = true; 6980#endif 6981#if GFX_VER == 5 || GFX_VER == 6 6982 gs.RenderingEnabled = true; 6983#endif 6984#if GFX_VER <= 5 6985 gs.MaximumVPIndex = ice->state.num_viewports - 1; 6986#endif 6987 } 6988 ice->state.gs_enabled = active; 6989 } 6990 6991#if GFX_VER >= 7 6992 if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) { 6993 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL]; 6994 6995 if (shader) { 6996 const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data); 6997 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data); 6998 const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base; 6999 7000 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) { 7001 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL); 7002 hs.InstanceCount = tcs_prog_data->instances - 1; 7003 hs.IncludeVertexHandles = true; 7004 hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1; 7005 } 7006 } else { 7007 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs); 7008 } 7009 7010 } 7011 7012 if (stage_dirty & CROCUS_STAGE_DIRTY_TES) { 7013 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL]; 7014 if (shader) { 7015 const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data); 7016 const 
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data); 7017 const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base; 7018 7019 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) { 7020 te.Partitioning = tes_prog_data->partitioning; 7021 te.OutputTopology = tes_prog_data->output_topology; 7022 te.TEDomain = tes_prog_data->domain; 7023 te.TEEnable = true; 7024 te.MaximumTessellationFactorOdd = 63.0; 7025 te.MaximumTessellationFactorNotOdd = 64.0; 7026 }; 7027 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) { 7028 INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL); 7029 7030 ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1; 7031 ds.ComputeWCoordinateEnable = 7032 tes_prog_data->domain == BRW_TESS_DOMAIN_TRI; 7033 7034#if GFX_VER >= 8 7035 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8) 7036 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; 7037 ds.UserClipDistanceCullTestEnableBitmask = 7038 vue_prog_data->cull_distance_mask; 7039#endif 7040 }; 7041 } else { 7042 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te); 7043 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds); 7044 } 7045 } 7046#endif 7047 if (dirty & CROCUS_DIRTY_RASTER) { 7048 7049#if GFX_VER < 6 7050 const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data; 7051 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso; 7052 uint32_t *sf_ptr = stream_state(batch, 7053 GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset); 7054 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; 7055 _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) { 7056 sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog); 7057 sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 7058 sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1; 7059 sf.DispatchGRFStartRegisterForURBData = 3; 7060 sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET; 7061 sf.VertexURBEntryReadLength = 
sf_prog_data->urb_read_length; 7062 sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1; 7063 sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries; 7064 sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT; 7065 7066 sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address); 7067 7068 sf.MaximumNumberofThreads = 7069 MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1; 7070 7071 sf.SpritePointEnable = cso_state->point_quad_rasterization; 7072 sf.DestinationOriginHorizontalBias = 0.5; 7073 sf.DestinationOriginVerticalBias = 0.5; 7074 7075 sf.LineEndCapAntialiasingRegionWidth = 7076 cso_state->line_smooth ? _10pixels : _05pixels; 7077 sf.LastPixelEnable = cso_state->line_last_pixel; 7078 sf.AntialiasingEnable = cso_state->line_smooth; 7079 7080 sf.LineWidth = get_line_width(cso_state); 7081 sf.PointWidth = cso_state->point_size; 7082 sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State; 7083#if GFX_VERx10 >= 45 7084 sf.AALineDistanceMode = AALINEDISTANCE_TRUE; 7085#endif 7086 sf.ViewportTransformEnable = true; 7087 sf.FrontWinding = cso_state->front_ccw ? 
1 : 0; 7088 sf.ScissorRectangleEnable = true; 7089 sf.CullMode = translate_cull_mode(cso_state->cull_face); 7090 7091 if (cso_state->flatshade_first) { 7092 sf.TriangleFanProvokingVertexSelect = 1; 7093 } else { 7094 sf.TriangleStripListProvokingVertexSelect = 2; 7095 sf.TriangleFanProvokingVertexSelect = 2; 7096 sf.LineStripListProvokingVertexSelect = 1; 7097 } 7098 } 7099#else 7100 struct crocus_rasterizer_state *cso = ice->state.cso_rast; 7101 uint32_t dynamic_sf[GENX(3DSTATE_SF_length)]; 7102 crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) { 7103 sf.ViewportTransformEnable = !ice->state.window_space_position; 7104 7105#if GFX_VER == 6 7106 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data); 7107 uint32_t urb_entry_read_length; 7108 uint32_t urb_entry_read_offset; 7109 uint32_t point_sprite_enables; 7110 calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables, 7111 &urb_entry_read_length, 7112 &urb_entry_read_offset); 7113 sf.VertexURBEntryReadLength = urb_entry_read_length; 7114 sf.VertexURBEntryReadOffset = urb_entry_read_offset; 7115 sf.PointSpriteTextureCoordinateEnable = point_sprite_enables; 7116 sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs; 7117 sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; 7118#endif 7119 7120#if GFX_VER >= 6 && GFX_VER < 8 7121 if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample) 7122 sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; 7123#endif 7124#if GFX_VER == 7 7125 if (ice->state.framebuffer.zsbuf) { 7126 struct crocus_resource *zres, *sres; 7127 crocus_get_depth_stencil_resources(&batch->screen->devinfo, 7128 ice->state.framebuffer.zsbuf->texture, 7129 &zres, &sres); 7130 /* ANV thinks that the stencil-ness doesn't matter, this is just 7131 * about handling polygon offset scaling. 7132 */ 7133 sf.DepthBufferSurfaceFormat = zres ? 
isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM; 7134 } 7135#endif 7136 } 7137 crocus_emit_merge(batch, cso->sf, dynamic_sf, 7138 ARRAY_SIZE(dynamic_sf)); 7139#if GFX_VER == 8 7140 crocus_batch_emit(batch, cso->raster, sizeof(cso->raster)); 7141#endif 7142#endif 7143 } 7144 7145 if (dirty & CROCUS_DIRTY_WM) { 7146 struct crocus_rasterizer_state *cso = ice->state.cso_rast; 7147 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data); 7148 UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF; 7149 UNUSED const struct shader_info *fs_info = 7150 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT); 7151 7152#if GFX_VER == 6 7153 struct push_bos push_bos = {}; 7154 setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos); 7155 7156 emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos); 7157#endif 7158#if GFX_VER >= 6 7159 crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm) 7160#else 7161 uint32_t *wm_ptr = stream_state(batch, 7162 GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset); 7163 7164 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; 7165 7166 _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm) 7167#endif 7168 { 7169#if GFX_VER <= 6 7170 wm._8PixelDispatchEnable = wm_prog_data->dispatch_8; 7171 wm._16PixelDispatchEnable = wm_prog_data->dispatch_16; 7172 wm._32PixelDispatchEnable = wm_prog_data->dispatch_32; 7173#endif 7174#if GFX_VER == 4 7175 /* On gen4, we only have one shader kernel */ 7176 if (brw_wm_state_has_ksp(wm, 0)) { 7177 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]); 7178 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); 7179 wm.DispatchGRFStartRegisterForConstantSetupData0 = 7180 wm_prog_data->base.dispatch_grf_start_reg; 7181 } 7182#elif GFX_VER == 5 7183 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + 7184 
brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); 7185 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + 7186 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); 7187 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + 7188 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); 7189 7190 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); 7191 wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1); 7192 wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2); 7193 7194 wm.DispatchGRFStartRegisterForConstantSetupData0 = 7195 wm_prog_data->base.dispatch_grf_start_reg; 7196#elif GFX_VER == 6 7197 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + 7198 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); 7199 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + 7200 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); 7201 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + 7202 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); 7203 7204 wm.DispatchGRFStartRegisterForConstantSetupData0 = 7205 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0); 7206 wm.DispatchGRFStartRegisterForConstantSetupData1 = 7207 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1); 7208 wm.DispatchGRFStartRegisterForConstantSetupData2 = 7209 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2); 7210#endif 7211#if GFX_VER <= 5 7212 wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length; 7213 wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2; 7214 wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2; 7215 wm.SetupURBEntryReadOffset = 0; 7216 wm.EarlyDepthTestEnable = true; 7217 wm.LineAntialiasingRegionWidth = _05pixels; 7218 wm.LineEndCapAntialiasingRegionWidth = _10pixels; 7219 wm.DepthCoefficientURBReadOffset = 1; 7220 7221 if (cso->cso.offset_tri) { 7222 
wm.GlobalDepthOffsetEnable = true; 7223 7224 /* Something weird going on with legacy_global_depth_bias, 7225 * offset_constant, scaling and MRD. This value passes glean 7226 * but gives some odd results elsewere (eg. the 7227 * quad-offset-units test). 7228 */ 7229 wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2; 7230 wm.GlobalDepthOffsetScale = cso->cso.offset_scale; 7231 } 7232 wm.SamplerStatePointer = ro_bo(batch->state.bo, 7233 ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset); 7234#endif 7235 7236 wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ? 7237 ice->state.statistics_counters_enabled : 0; 7238 7239#if GFX_VER >= 6 7240 wm.LineAntialiasingRegionWidth = _10pixels; 7241 wm.LineEndCapAntialiasingRegionWidth = _05pixels; 7242 7243 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT; 7244 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes; 7245#endif 7246#if GFX_VER == 6 7247 wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend && 7248 ice->state.cso_blend->dual_color_blending; 7249 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; 7250 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; 7251 7252 /* From the SNB PRM, volume 2 part 1, page 281: 7253 * "If the PS kernel does not need the Position XY Offsets 7254 * to compute a Position XY value, then this field should be 7255 * programmed to POSOFFSET_NONE." 7256 * 7257 * "SW Recommendation: If the PS kernel needs the Position Offsets 7258 * to compute a Position XY value, this field should match Position 7259 * ZW Interpolation Mode to ensure a consistent position.xyzw 7260 * computation." 7261 * We only require XY sample offsets. So, this recommendation doesn't 7262 * look useful at the moment. We might need this in future. 
7263 */ 7264 if (wm_prog_data->uses_pos_offset) 7265 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE; 7266 else 7267 wm.PositionXYOffsetSelect = POSOFFSET_NONE; 7268#endif 7269 wm.LineStippleEnable = cso->cso.line_stipple_enable; 7270 wm.PolygonStippleEnable = cso->cso.poly_stipple_enable; 7271 7272#if GFX_VER < 7 7273 if (wm_prog_data->base.use_alt_mode) 7274 wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 7275 wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4; 7276 wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1; 7277#endif 7278 7279#if GFX_VER < 8 7280#if GFX_VER >= 6 7281 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; 7282 7283 struct pipe_framebuffer_state *fb = &ice->state.framebuffer; 7284 if (fb->samples > 1) { 7285 if (cso->cso.multisample) 7286 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; 7287 else 7288 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; 7289 7290 if (wm_prog_data->persample_dispatch) 7291 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; 7292 else 7293 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL; 7294 } else { 7295 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; 7296 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; 7297 } 7298#endif 7299 7300 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; 7301 7302 if (wm_prog_data->uses_kill || 7303 ice->state.cso_zsa->cso.alpha_enabled || 7304 ice->state.cso_blend->cso.alpha_to_coverage || 7305 (GFX_VER >= 6 && wm_prog_data->uses_omask)) 7306 wm.PixelShaderKillsPixel = true; 7307 7308 if (has_writeable_rt(ice->state.cso_blend, fs_info) || 7309 writes_depth || wm.PixelShaderKillsPixel || 7310 (GFX_VER >= 6 && wm_prog_data->has_side_effects)) 7311 wm.ThreadDispatchEnable = true; 7312 7313#if GFX_VER >= 7 7314 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; 7315 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; 7316#else 7317 if 
(wm_prog_data->base.total_scratch) { 7318 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, 7319 MESA_SHADER_FRAGMENT); 7320 wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11; 7321 wm.ScratchSpaceBasePointer = rw_bo(bo, 0); 7322 } 7323 7324 wm.PixelShaderComputedDepth = writes_depth; 7325 7326#endif 7327 /* The "UAV access enable" bits are unnecessary on HSW because they only 7328 * seem to have an effect on the HW-assisted coherency mechanism which we 7329 * don't need, and the rasterization-related UAV_ONLY flag and the 7330 * DISPATCH_ENABLE bit can be set independently from it. 7331 * C.f. gen8_upload_ps_extra(). 7332 * 7333 * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | 7334 * _NEW_COLOR 7335 */ 7336#if GFX_VERx10 == 75 7337 if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) && 7338 wm_prog_data->has_side_effects) 7339 wm.PSUAVonly = ON; 7340#endif 7341#endif 7342#if GFX_VER >= 7 7343 /* BRW_NEW_FS_PROG_DATA */ 7344 if (wm_prog_data->early_fragment_tests) 7345 wm.EarlyDepthStencilControl = EDSC_PREPS; 7346 else if (wm_prog_data->has_side_effects) 7347 wm.EarlyDepthStencilControl = EDSC_PSEXEC; 7348#endif 7349#if GFX_VER == 8 7350 /* We could skip this bit if color writes are enabled. 
*/ 7351 if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill) 7352 wm.ForceThreadDispatchEnable = ForceON; 7353#endif 7354 }; 7355 7356#if GFX_VER <= 5 7357 if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) { 7358 crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) { 7359 clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp; 7360 } 7361 ice->state.global_depth_offset_clamp = cso->cso.offset_clamp; 7362 } 7363#endif 7364 } 7365 7366#if GFX_VER >= 7 7367 if (dirty & CROCUS_DIRTY_GEN7_SBE) { 7368 crocus_emit_sbe(batch, ice); 7369 } 7370#endif 7371 7372#if GFX_VER >= 8 7373 if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) { 7374 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT]; 7375 struct crocus_blend_state *cso_blend = ice->state.cso_blend; 7376 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa; 7377 struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data; 7378 const struct shader_info *fs_info = 7379 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT); 7380 uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)]; 7381 crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) { 7382 pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info); 7383 pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled; 7384 pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) && 7385 (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend); 7386 } 7387 crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb, 7388 ARRAY_SIZE(cso_blend->ps_blend)); 7389 } 7390#endif 7391 7392#if GFX_VER >= 6 7393 if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) { 7394 7395#if GFX_VER >= 8 7396 crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) { 7397 set_depth_stencil_bits(ice, &wmds); 7398 } 7399#else 7400 uint32_t ds_offset; 7401 void *ds_map = stream_state(batch, 7402 sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length), 7403 64, &ds_offset); 7404 _crocus_pack_state(batch, 
GENX(DEPTH_STENCIL_STATE), ds_map, ds) { 7405 set_depth_stencil_bits(ice, &ds); 7406 } 7407 7408#if GFX_VER == 6 7409 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 7410 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset; 7411 ptr.DEPTH_STENCIL_STATEChange = true; 7412 } 7413#else 7414 crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) { 7415 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset; 7416 } 7417#endif 7418#endif 7419 } 7420 7421 if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) { 7422 /* Align to 64-byte boundary as per anv. */ 7423 uint32_t scissor_offset; 7424 struct pipe_scissor_state *scissor_map = (void *) 7425 stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports, 7426 64, &scissor_offset); 7427 for (int i = 0; i < ice->state.num_viewports; i++) { 7428 struct pipe_scissor_state scissor; 7429 crocus_fill_scissor_rect(ice, i, &scissor); 7430 scissor_map[i] = scissor; 7431 } 7432 7433 crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) { 7434 ptr.ScissorRectPointer = scissor_offset; 7435 } 7436 } 7437#endif 7438 7439 if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) { 7440 struct isl_device *isl_dev = &batch->screen->isl_dev; 7441#if GFX_VER >= 6 7442 crocus_emit_depth_stall_flushes(batch); 7443#endif 7444 void *batch_ptr; 7445 struct crocus_resource *zres, *sres; 7446 struct pipe_framebuffer_state *cso = &ice->state.framebuffer; 7447 batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size); 7448 7449 struct isl_view view = { 7450 .base_level = 0, 7451 .levels = 1, 7452 .base_array_layer = 0, 7453 .array_len = 1, 7454 .swizzle = ISL_SWIZZLE_IDENTITY, 7455 }; 7456 struct isl_depth_stencil_hiz_emit_info info = { 7457 .view = &view, 7458 .mocs = crocus_mocs(NULL, isl_dev), 7459 }; 7460 7461 if (cso->zsbuf) { 7462 crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres); 7463 struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf; 7464 if 
(zsbuf->align_res) { 7465 zres = (struct crocus_resource *)zsbuf->align_res; 7466 } 7467 view.base_level = cso->zsbuf->u.tex.level; 7468 view.base_array_layer = cso->zsbuf->u.tex.first_layer; 7469 view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1; 7470 7471 if (zres) { 7472 view.usage |= ISL_SURF_USAGE_DEPTH_BIT; 7473 7474 info.depth_surf = &zres->surf; 7475 info.depth_address = crocus_command_reloc(batch, 7476 (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset, 7477 zres->bo, 0, RELOC_32BIT); 7478 7479 info.mocs = crocus_mocs(zres->bo, isl_dev); 7480 view.format = zres->surf.format; 7481 7482 if (crocus_resource_level_has_hiz(zres, view.base_level)) { 7483 info.hiz_usage = zres->aux.usage; 7484 info.hiz_surf = &zres->aux.surf; 7485 uint64_t hiz_offset = 0; 7486 7487#if GFX_VER == 6 7488 /* HiZ surfaces on Sandy Bridge technically don't support 7489 * mip-mapping. However, we can fake it by offsetting to the 7490 * first slice of LOD0 in the HiZ surface. 7491 */ 7492 isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf, 7493 view.base_level, 0, 0, 7494 &hiz_offset, NULL, NULL); 7495#endif 7496 info.hiz_address = crocus_command_reloc(batch, 7497 (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset, 7498 zres->aux.bo, zres->aux.offset + hiz_offset, 7499 RELOC_32BIT); 7500 info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0]; 7501 } 7502 } 7503 7504#if GFX_VER >= 6 7505 if (sres) { 7506 view.usage |= ISL_SURF_USAGE_STENCIL_BIT; 7507 info.stencil_aux_usage = sres->aux.usage; 7508 info.stencil_surf = &sres->surf; 7509 7510 uint64_t stencil_offset = 0; 7511#if GFX_VER == 6 7512 /* Stencil surfaces on Sandy Bridge technically don't support 7513 * mip-mapping. However, we can fake it by offsetting to the 7514 * first slice of LOD0 in the stencil surface. 
7515 */ 7516 isl_surf_get_image_offset_B_tile_sa(&sres->surf, 7517 view.base_level, 0, 0, 7518 &stencil_offset, NULL, NULL); 7519#endif 7520 7521 info.stencil_address = crocus_command_reloc(batch, 7522 (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset, 7523 sres->bo, stencil_offset, RELOC_32BIT); 7524 if (!zres) { 7525 view.format = sres->surf.format; 7526 info.mocs = crocus_mocs(sres->bo, isl_dev); 7527 } 7528 } 7529#endif 7530 } 7531 isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info); 7532 } 7533 7534 /* TODO: Disable emitting this until something uses a stipple. */ 7535 if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) { 7536 crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) { 7537 for (int i = 0; i < 32; i++) { 7538 poly.PatternRow[i] = ice->state.poly_stipple.stipple[i]; 7539 } 7540 } 7541 } 7542 7543 if (dirty & CROCUS_DIRTY_LINE_STIPPLE) { 7544 struct crocus_rasterizer_state *cso = ice->state.cso_rast; 7545 crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple)); 7546 } 7547 7548#if GFX_VER >= 8 7549 if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) { 7550 crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { 7551 topo.PrimitiveTopologyType = 7552 translate_prim_type(draw->mode, ice->state.patch_vertices); 7553 } 7554 } 7555#endif 7556 7557#if GFX_VER <= 5 7558 if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) { 7559 upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? 
true : false, ice->shaders.gs_offset, 7560 ice->shaders.vs_offset, ice->shaders.sf_offset, 7561 ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset); 7562 crocus_upload_urb_fence(batch); 7563 7564 crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) { 7565 cs.NumberofURBEntries = ice->urb.nr_cs_entries; 7566 cs.URBEntryAllocationSize = ice->urb.csize - 1; 7567 } 7568 dirty |= CROCUS_DIRTY_GEN4_CURBE; 7569 } 7570#endif 7571 if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) { 7572 struct pipe_framebuffer_state *fb = &ice->state.framebuffer; 7573 if (fb->width && fb->height) { 7574 crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { 7575 rect.ClippedDrawingRectangleXMax = fb->width - 1; 7576 rect.ClippedDrawingRectangleYMax = fb->height - 1; 7577 } 7578 } 7579 } 7580 7581 if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) { 7582 const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers); 7583 const uint32_t count = user_count + 7584 ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params; 7585 uint32_t dynamic_bound = ice->state.bound_vertex_buffers; 7586 7587 if (count) { 7588 const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length); 7589 7590 uint32_t *map = 7591 crocus_get_command_space(batch, 4 * (1 + vb_dwords * count)); 7592 _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) { 7593 vb.DWordLength = (vb_dwords * count + 1) - 2; 7594 } 7595 map += 1; 7596 7597 uint32_t bound = dynamic_bound; 7598 int i; 7599 while (bound) { 7600 i = u_bit_scan(&bound); 7601 struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i]; 7602 struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource); 7603 uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i]; 7604 7605 emit_vertex_buffer_state(batch, i, bo, 7606 buf->buffer_offset, 7607 ice->state.vb_end[i], 7608 buf->stride, 7609 step_rate, 7610 &map); 7611 } 7612 i = user_count; 7613 if (ice->state.vs_uses_draw_params) { 7614 struct crocus_resource 
*res = (struct crocus_resource *)ice->draw.draw_params.res; 7615 emit_vertex_buffer_state(batch, i++, 7616 res->bo, 7617 ice->draw.draw_params.offset, 7618 ice->draw.draw_params.res->width0, 7619 0, 0, &map); 7620 } 7621 if (ice->state.vs_uses_derived_draw_params) { 7622 struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res; 7623 emit_vertex_buffer_state(batch, i++, 7624 res->bo, 7625 ice->draw.derived_draw_params.offset, 7626 ice->draw.derived_draw_params.res->width0, 7627 0, 0, &map); 7628 } 7629 } 7630 } 7631 7632 if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) { 7633 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements; 7634 const unsigned entries = MAX2(cso->count, 1); 7635 if (!(ice->state.vs_needs_sgvs_element || 7636 ice->state.vs_uses_derived_draw_params || 7637 ice->state.vs_needs_edge_flag)) { 7638 crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) * 7639 (1 + entries * GENX(VERTEX_ELEMENT_STATE_length))); 7640 } else { 7641 uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)]; 7642 const unsigned dyn_count = cso->count + 7643 ice->state.vs_needs_sgvs_element + 7644 ice->state.vs_uses_derived_draw_params; 7645 7646 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), 7647 &dynamic_ves, ve) { 7648 ve.DWordLength = 7649 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2; 7650 } 7651 memcpy(&dynamic_ves[1], &cso->vertex_elements[1], 7652 (cso->count - ice->state.vs_needs_edge_flag) * 7653 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t)); 7654 uint32_t *ve_pack_dest = 7655 &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) * 7656 GENX(VERTEX_ELEMENT_STATE_length)]; 7657 7658 if (ice->state.vs_needs_sgvs_element) { 7659 uint32_t base_ctrl = ice->state.vs_uses_draw_params ? 
7660 VFCOMP_STORE_SRC : VFCOMP_STORE_0; 7661 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) { 7662 ve.Valid = true; 7663 ve.VertexBufferIndex = 7664 util_bitcount64(ice->state.bound_vertex_buffers); 7665 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT; 7666 ve.Component0Control = base_ctrl; 7667 ve.Component1Control = base_ctrl; 7668#if GFX_VER < 8 7669 ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0; 7670 ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0; 7671#else 7672 ve.Component2Control = VFCOMP_STORE_0; 7673 ve.Component3Control = VFCOMP_STORE_0; 7674#endif 7675#if GFX_VER < 5 7676 ve.DestinationElementOffset = cso->count * 4; 7677#endif 7678 } 7679 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length); 7680 } 7681 if (ice->state.vs_uses_derived_draw_params) { 7682 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) { 7683 ve.Valid = true; 7684 ve.VertexBufferIndex = 7685 util_bitcount64(ice->state.bound_vertex_buffers) + 7686 ice->state.vs_uses_draw_params; 7687 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT; 7688 ve.Component0Control = VFCOMP_STORE_SRC; 7689 ve.Component1Control = VFCOMP_STORE_SRC; 7690 ve.Component2Control = VFCOMP_STORE_0; 7691 ve.Component3Control = VFCOMP_STORE_0; 7692#if GFX_VER < 5 7693 ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4; 7694#endif 7695 } 7696 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length); 7697 } 7698 if (ice->state.vs_needs_edge_flag) { 7699 for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++) 7700 ve_pack_dest[i] = cso->edgeflag_ve[i]; 7701 } 7702 7703 crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) * 7704 (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length))); 7705 } 7706 7707#if GFX_VER == 8 7708 if (!ice->state.vs_needs_edge_flag) { 7709 crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) * 7710 entries * GENX(3DSTATE_VF_INSTANCING_length)); 7711 } else 
{ 7712 assert(cso->count > 0); 7713 const unsigned edgeflag_index = cso->count - 1; 7714 uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)]; 7715 memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index * 7716 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t)); 7717 7718 uint32_t *vfi_pack_dest = &dynamic_vfi[0] + 7719 edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length); 7720 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) { 7721 vi.VertexElementIndex = edgeflag_index + 7722 ice->state.vs_needs_sgvs_element + 7723 ice->state.vs_uses_derived_draw_params; 7724 } 7725 for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++) 7726 vfi_pack_dest[i] |= cso->edgeflag_vfi[i]; 7727 7728 crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) * 7729 entries * GENX(3DSTATE_VF_INSTANCING_length)); 7730 } 7731#endif 7732 } 7733 7734#if GFX_VER == 8 7735 if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) { 7736 const struct brw_vs_prog_data *vs_prog_data = (void *) 7737 ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data; 7738 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements; 7739 7740 crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) { 7741 if (vs_prog_data->uses_vertexid) { 7742 sgv.VertexIDEnable = true; 7743 sgv.VertexIDComponentNumber = 2; 7744 sgv.VertexIDElementOffset = 7745 cso->count - ice->state.vs_needs_edge_flag; 7746 } 7747 7748 if (vs_prog_data->uses_instanceid) { 7749 sgv.InstanceIDEnable = true; 7750 sgv.InstanceIDComponentNumber = 3; 7751 sgv.InstanceIDElementOffset = 7752 cso->count - ice->state.vs_needs_edge_flag; 7753 } 7754 } 7755 } 7756#endif 7757#if GFX_VERx10 >= 75 7758 if (dirty & CROCUS_DIRTY_GEN75_VF) { 7759 crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) { 7760 if (draw->primitive_restart) { 7761 vf.IndexedDrawCutIndexEnable = true; 7762 vf.CutIndex = draw->restart_index; 7763 } 7764 } 7765 } 7766#endif 7767 7768#if GFX_VER == 8 7769 if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) { 7770 bool enable = 
want_pma_fix(ice);
      genX(crocus_update_pma_fix)(ice, batch, enable);
   }
#endif

#if GFX_VER <= 5
   if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
      gen4_upload_curbe(batch);
   }
#endif
}

/**
 * Upload all render state for a draw call and emit the 3DPRIMITIVE command.
 *
 * Order of operations:
 *  1. Flush dirty context state via crocus_upload_dirty_render_state().
 *  2. If the draw is indexed, (re-)emit 3DSTATE_INDEX_BUFFER — uploading
 *     user-pointer indices to a GPU buffer first if necessary.
 *  3. For indirect draws (GFX_VER >= 7 only), load the 3DPRIMITIVE parameter
 *     registers from the indirect buffer or a stream-output counter, and set
 *     up MI_PREDICATE for multi-draw-indirect draw-count limiting.
 *  4. Emit 3DPRIMITIVE.
 */
static void
crocus_upload_render_state(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           const struct pipe_draw_info *draw,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
                           const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif

   /* no_wrap is held true while dirty state is uploaded (cleared again
    * below) — presumably this keeps the batch from wrapping mid-upload;
    * confirm against crocus_batch.
    */
   batch->no_wrap = true;
   batch->contains_draw = true;

   crocus_update_surface_base_address(batch);

   crocus_upload_dirty_render_state(ice, batch, draw);

   batch->no_wrap = false;
   if (draw->index_size > 0) {
      unsigned offset;
      unsigned size;
      bool emit_index = false;

      if (draw->has_user_indices) {
         /* Copy the user index data into an upload buffer.  offset is
          * rebased by -start_offset so that sc->start still indexes the
          * copied range correctly.
          */
         unsigned start_offset = draw->index_size * sc->start;
         u_upload_data(ice->ctx.stream_uploader, 0,
                       sc->count * draw->index_size, 4,
                       (char *)draw->index.user + start_offset,
                       &offset, &ice->state.index_buffer.res);
         offset -= start_offset;
         size = start_offset + sc->count * draw->index_size;
         emit_index = true;
      } else {
         struct crocus_resource *res = (void *) draw->index.resource;

         /* Only re-emit when the bound index buffer resource changed. */
         if (ice->state.index_buffer.res != draw->index.resource) {
            res->bind_history |= PIPE_BIND_INDEX_BUFFER;
            pipe_resource_reference(&ice->state.index_buffer.res,
                                    draw->index.resource);
            emit_index = true;
         }
         offset = 0;
         size = draw->index.resource->width0;
      }

      /* Even with the same resource, re-emit if size/format (or, pre-HSW,
       * the primitive-restart flag, which lives in 3DSTATE_INDEX_BUFFER
       * there) changed.
       */
      if (!emit_index &&
          (ice->state.index_buffer.size != size ||
           ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
           || ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
           )
          )
         emit_index = true;

      if (emit_index) {
         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);

         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
            ib.CutIndexEnable = draw->primitive_restart;
#endif
            /* index_size is 1/2/4 bytes; >> 1 maps that to 0/1/2. */
            ib.IndexFormat = draw->index_size >> 1;
            ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
            ib.BufferSize = bo->size - offset;
#else
            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
#if GFX_VER >= 6
            ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
         }
         /* Cache what we emitted so the checks above can skip re-emission. */
         ice->state.index_buffer.size = size;
         ice->state.index_buffer.offset = offset;
         ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
         ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
      }
   }

/* MMIO registers from which 3DPRIMITIVE takes its parameters when
 * IndirectParameterEnable is set; loaded via MI_LOAD_REGISTER_* below.
 * NOTE(review): _3DPRIM_END_OFFSET is defined but unused here.
 */
#define _3DPRIM_END_OFFSET 0x2420
#define _3DPRIM_START_VERTEX 0x2430
#define _3DPRIM_VERTEX_COUNT 0x2434
#define _3DPRIM_INSTANCE_COUNT 0x2438
#define _3DPRIM_START_INSTANCE 0x243C
#define _3DPRIM_BASE_VERTEX 0x2440

#if GFX_VER >= 7
   if (indirect && !indirect->count_from_stream_output) {
      if (indirect->indirect_draw_count) {
         /* Multi-draw indirect with a GPU-side draw count: predicate each
          * sub-draw on (drawid < draw_count).
          */
         use_predicate = true;

         struct crocus_bo *draw_count_bo =
            crocus_resource_bo(indirect->indirect_draw_count);
         unsigned draw_count_offset =
            indirect->indirect_draw_count_offset;

         crocus_emit_pipe_control_flush(batch,
                                        "ensure indirect draw buffer is flushed",
                                        PIPE_CONTROL_FLUSH_ENABLE);
         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
            /* Conditional rendering is also active: AND the draw-count
             * comparison with the conditional-render predicate stashed in
             * CS_GPR(15).  Only possible with MI math (HSW+).
             */
#if GFX_VERx10 >= 75
            struct mi_builder b;
            mi_builder_init(&b, &batch->screen->devinfo, batch);

            /* comparison = draw id < draw count */
            struct mi_value comparison =
               mi_ult(&b, mi_imm(drawid_offset),
                      mi_mem32(ro_bo(draw_count_bo,
                                     draw_count_offset)));
#if GFX_VER == 8
            /* predicate = comparison & conditional rendering predicate */
            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
                     mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
            /* predicate = comparison & conditional rendering predicate */
            struct mi_value pred = mi_iand(&b, comparison,
                                           mi_reg32(CS_GPR(15)));

            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                                    MI_PREDICATE_COMBINEOP_SET |
                                    MI_PREDICATE_COMPAREOP_SRCS_EQUAL;

            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
         } else {
            uint32_t mi_predicate;

            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
            /* Upload the current draw count from the draw parameters buffer
             * to MI_PREDICATE_SRC0.
             */
            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
                                       draw_count_bo, draw_count_offset);
            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);

            if (drawid_offset == 0) {
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                              MI_PREDICATE_COMBINEOP_SET |
                              MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            } else {
               /* While draw_index < draw_count the predicate's result will be
                * (draw_index == draw_count) ^ TRUE = TRUE
                * When draw_index == draw_count the result is
                * (TRUE) ^ TRUE = FALSE
                * After this all results will be:
                * (FALSE) ^ FALSE = FALSE
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
                              MI_PREDICATE_COMBINEOP_XOR |
                              MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            }
            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
         }
      }

/* NOTE(review): this inner #if is redundant — we are already inside a
 * GFX_VER >= 7 block.
 */
#if GFX_VER >= 7
      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
      assert(bo);

      /* Load the 3DPRIMITIVE parameter registers from the indirect buffer.
       * The draw-parameters layout differs for indexed vs. non-indexed
       * draws (indexed draws carry an extra BaseVertex field).
       */
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
      }
      if (draw->index_size) {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
         }
      } else {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         /* Non-indexed draws have no BaseVertex field; zero the register. */
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
            lri.DataDWord = 0;
         }
      }
#endif
   } else if (indirect && indirect->count_from_stream_output) {
#if GFX_VERx10 >= 75
      /* Vertex count comes from a stream-output counter: read the byte
       * offset written so far, subtract the buffer's starting offset, and
       * divide by the vertex stride with MI math.
       */
      struct crocus_stream_output_target *so =
         (void *) indirect->count_from_stream_output;

      /* XXX: Replace with actual cache tracking */
      crocus_emit_pipe_control_flush(batch,
                                     "draw count from stream output stall",
                                     PIPE_CONTROL_CS_STALL);

      struct mi_builder b;
      mi_builder_init(&b, &batch->screen->devinfo, batch);

      struct crocus_address addr =
         ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
      struct mi_value offset =
         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);

      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
               mi_udiv32_imm(&b, offset, so->stride));

      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
   }
#else
   /* Pre-GFX7 hardware has no indirect-draw support; the state tracker
    * must have unrolled it before we get here.
    */
   assert(!indirect);
#endif

   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
      prim.PredicateEnable = use_predicate;
#endif

      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
      if (indirect) {
         // XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
         prim.IndirectParameterEnable = true;
#endif
      } else {
#if GFX_VER >= 5
         prim.StartInstanceLocation = draw->start_instance;
#endif
         prim.InstanceCount = draw->instance_count;
         prim.VertexCountPerInstance = sc->count;

         prim.StartVertexLocation = sc->start;

         if (draw->index_size) {
            /* += on the zero-initialized pack struct field, i.e. set it. */
            prim.BaseVertexLocation += sc->index_bias;
         }
      }
   }
}

#if GFX_VER >= 7

/**
 * Upload all state for a compute dispatch: sysvals, binding table, sampler
 * states, MEDIA_VFE_STATE, the CURBE push-constant buffer, the interface
 * descriptor, and finally the GPGPU walker parameters.
 * (Only compiled for GFX_VER >= 7 — earlier parts lack compute support.)
 */
static void
crocus_upload_compute_state(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            const struct pipe_grid_info *grid)
{
   const uint64_t stage_dirty = ice->state.stage_dirty;
   struct crocus_screen *screen = batch->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
   struct crocus_compiled_shader *shader =
      ice->shaders.prog[MESA_SHADER_COMPUTE];
   struct brw_stage_prog_data *prog_data = shader->prog_data;
   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);

   crocus_update_surface_base_address(batch);
   if ((stage_dirty &
        CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
      upload_sysvals(ice, MESA_SHADER_COMPUTE);

   /* (Re-)build and upload the compute binding table when bindings change. */
   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
         crocus_upload_binding_table(ice, batch,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
   }

   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);

   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *     the only bits that are changed are scoreboard related: Scoreboard
       *     Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
       *     these scoreboard related states, a MEDIA_STATE_FLUSH is
       *     sufficient."
       */
      crocus_emit_pipe_control_flush(batch,
                                     "workaround: stall before MEDIA_VFE_STATE",
                                     PIPE_CONTROL_CS_STALL);

      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
         if (prog_data->total_scratch) {
            struct crocus_bo *bo =
               crocus_get_scratch_space(ice, prog_data->total_scratch,
                                        MESA_SHADER_COMPUTE);
#if GFX_VER == 8
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#elif GFX_VERx10 == 75
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
#else
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
#endif
            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }

         vfe.MaximumNumberofThreads =
            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
#if GFX_VER == 7
         vfe.GPGPUMode = true;
#endif
#if GFX_VER == 8
         /* NOTE(review): redundant — BypassGatewayControl is already set
          * unconditionally above.
          */
         vfe.BypassGatewayControl = true;
#endif
         vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
         vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;

         /* CURBE size in 256-bit register pairs: per-thread regs times the
          * thread count, plus the cross-thread regs, aligned to 2.
          */
         vfe.CURBEAllocationSize =
            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
                  cs_prog_data->push.cross_thread.regs, 2);
      }
   }

   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      uint32_t curbe_data_offset = 0;
      /* The asserts pin the only push-constant layout this path supports:
       * no cross-thread data and exactly one per-thread dword, which must
       * be the subgroup id.
       */
      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
             cs_prog_data->push.per_thread.dwords == 1 &&
             cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
      const unsigned push_const_size =
         brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
      uint32_t *curbe_data_map =
         stream_state(batch,
                      ALIGN(push_const_size, 64), 64,
                      &curbe_data_offset);
      assert(curbe_data_map);
      /* 0x5a poison fill — presumably so unwritten push-constant bytes are
       * recognizable when debugging; confirm intent.
       */
      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
                                       curbe_data_map);

      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
         curbe.CURBEDataStartAddress = curbe_data_offset;
      }
   }

   /* Re-emit the interface descriptor whenever anything it points at
    * (samplers, binding table, constants, or the shader itself) changed.
    */
   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
                      CROCUS_STAGE_DIRTY_CS)) {
      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      /* Kernel start pointer for the SIMD width chosen by dispatch info. */
      const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
         idd.KernelStartPointer = ksp;
         idd.SamplerStatePointer = shs->sampler_offset;
         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
         idd.BarrierEnable = cs_prog_data->uses_barrier;
         idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
                                                     prog_data->total_shared);
#if GFX_VERx10 >= 75
         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
#endif
      }

      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
         load.InterfaceDescriptorTotalLength =
            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
         load.InterfaceDescriptorDataStartAddress =
            emit_state(batch, desc, sizeof(desc), 64);
      }
   }

/* MMIO registers the GPGPU walker reads its dispatch dimensions from when
 * the dispatch is indirect; loaded via MI_LOAD_REGISTER_MEM below.
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

   if (grid->indirect) {
      struct crocus_state_ref *grid_size = &ice->state.grid_size;
      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
} 8202 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { 8203 lrm.RegisterAddress = GPGPU_DISPATCHDIMZ; 8204 lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8); 8205 } 8206 8207#if GFX_VER == 7 8208 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */ 8209 _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0); 8210 crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0); 8211 8212 /* Load compute_dispatch_indirect_x_size into SRC0 */ 8213 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0); 8214 8215 /* predicate = (compute_dispatch_indirect_x_size == 0); */ 8216 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) { 8217 mip.LoadOperation = LOAD_LOAD; 8218 mip.CombineOperation = COMBINE_SET; 8219 mip.CompareOperation = COMPARE_SRCS_EQUAL; 8220 }; 8221 8222 /* Load compute_dispatch_indirect_y_size into SRC0 */ 8223 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4); 8224 8225 /* predicate = (compute_dispatch_indirect_y_size == 0); */ 8226 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) { 8227 mip.LoadOperation = LOAD_LOAD; 8228 mip.CombineOperation = COMBINE_OR; 8229 mip.CompareOperation = COMPARE_SRCS_EQUAL; 8230 }; 8231 8232 /* Load compute_dispatch_indirect_z_size into SRC0 */ 8233 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8); 8234 8235 /* predicate = (compute_dispatch_indirect_z_size == 0); */ 8236 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) { 8237 mip.LoadOperation = LOAD_LOAD; 8238 mip.CombineOperation = COMBINE_OR; 8239 mip.CompareOperation = COMPARE_SRCS_EQUAL; 8240 }; 8241 8242 /* predicate = !predicate; */ 8243#define COMPARE_FALSE 1 8244 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) { 8245 mip.LoadOperation = LOAD_LOADINV; 8246 mip.CombineOperation = COMBINE_OR; 8247 mip.CompareOperation = COMPARE_FALSE; 8248 } 8249#endif 8250 } 8251 8252 crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) { 8253 ggw.IndirectParameterEnable = grid->indirect != NULL; 8254 
      /* Use the MI_PREDICATE result computed above so that an indirect
       * dispatch with any zero dimension is skipped (the predicate
       * registers are only programmed in the GFX_VER == 7 path above).
       */
      ggw.PredicateEnable = GFX_VER <= 7 && grid->indirect != NULL;
      /* SIMDSize encoding: SIMD8 -> 0, SIMD16 -> 1, SIMD32 -> 2. */
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension = grid->grid[0];
      ggw.ThreadGroupIDYDimension = grid->grid[1];
      ggw.ThreadGroupIDZDimension = grid->grid[2];
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);

   batch->contains_draw = true;
}

#endif /* GFX_VER >= 7 */

/**
 * State module teardown.
 *
 * Drops every reference the context's state tracking holds: draw
 * parameter buffers, stream-output targets, framebuffer surfaces,
 * per-stage constant/image/SSBO/sampler-view references, vertex
 * buffers, and the grid-size and index-buffer resources.  Also frees
 * the per-generation state struct (ice->state.genx).
 */
static void
crocus_destroy_state(struct crocus_context *ice)
{
   pipe_resource_reference(&ice->draw.draw_params.res, NULL);
   pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);

   free(ice->state.genx);

   /* Release the stream-output targets. */
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&ice->state.so_target[i], NULL);
   }

   /* Release framebuffer attachments (color + depth/stencil). */
   for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
      pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
   }
   pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);

   /* Release all per-shader-stage binding references. */
   for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
      struct crocus_shader_state *shs = &ice->state.shaders[stage];
      for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
         pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
      }
      for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
         pipe_resource_reference(&shs->image[i].base.resource, NULL);
      }
      for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
         pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
      }
      for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[i], NULL);
      }
   }

   /* NOTE(review): the literal 16 is assumed to match the size of
    * ice->state.vertex_buffers[] — confirm against the array declaration.
    */
   for (int i = 0; i < 16; i++)
      pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
   pipe_resource_reference(&ice->state.grid_size.res, NULL);

   pipe_resource_reference(&ice->state.index_buffer.res, NULL);
}

/* ------------------------------------------------------------------- */

/**
 * Walk every binding point a PIPE_BUFFER resource is recorded in
 * (res->bind_history / res->bind_stages) and flag the corresponding
 * dirty bits so any state referring to it is re-emitted.
 *
 * NOTE(review): presumably called after the resource's backing BO has
 * been replaced (e.g. invalidated storage) — confirm with the caller.
 */
static void
crocus_rebind_buffer(struct crocus_context *ice,
                     struct crocus_resource *res)
{
   struct pipe_context *ctx = &ice->ctx;

   assert(res->base.b.target == PIPE_BUFFER);

   /* Buffers can't be framebuffer attachments, nor display related,
    * and we don't have upstream Clover support.
    */
   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
                                 PIPE_BIND_RENDER_TARGET |
                                 PIPE_BIND_BLENDABLE |
                                 PIPE_BIND_DISPLAY_TARGET |
                                 PIPE_BIND_CURSOR |
                                 PIPE_BIND_COMPUTE_RESOURCE |
                                 PIPE_BIND_GLOBAL)));

   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
      while (bound_vbs) {
         const int i = u_bit_scan64(&bound_vbs);
         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];

         if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
      }
   }

   /* Dropping the reference forces the index buffer state to be
    * re-emitted (see crocus_batch_reset_dirty, which uses the same trick).
    */
   if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
       ice->state.index_buffer.res) {
      if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
         pipe_resource_reference(&ice->state.index_buffer.res, NULL);
   }
   /* There is no need to handle these:
    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
    */

   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
      /* XXX: be careful about resetting vs appending...
       */
      for (int i = 0; i < 4; i++) {
         if (ice->state.so_target[i] &&
             (ice->state.so_target[i]->buffer == &res->base.b)) {
#if GFX_VER == 6
            /* Gen6: stream output is tracked via the GS stage bindings. */
            ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
            ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
         }
      }
   }

   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
      struct crocus_shader_state *shs = &ice->state.shaders[s];
      enum pipe_shader_type p_stage = stage_to_pipe(s);

      /* Skip stages the buffer was never bound in. */
      if (!(res->bind_stages & (1 << s)))
         continue;

      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
         while (bound_cbufs) {
            const int i = u_bit_scan(&bound_cbufs);
            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];

            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
               /* The per-stage dirty bits are laid out consecutively, so
                * shifting the VS bit by the stage index selects the stage.
                */
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
         uint32_t bound_ssbos = shs->bound_ssbos;
         while (bound_ssbos) {
            const int i = u_bit_scan(&bound_ssbos);
            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];

            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
               /* Re-bind the slot in place so the new BO is picked up,
                * preserving the original offset/size/writability.
                */
               struct pipe_shader_buffer buf = {
                  .buffer = &res->base.b,
                  .buffer_offset = ssbo->buffer_offset,
                  .buffer_size = ssbo->buffer_size,
               };
               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
                                         (shs->writable_ssbos >> i) & 1);
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
         uint32_t bound_sampler_views = shs->bound_sampler_views;
         while (bound_sampler_views) {
            const int i = u_bit_scan(&bound_sampler_views);
            struct crocus_sampler_view *isv = shs->textures[i];
            struct crocus_bo *bo = isv->res->bo;

            if (res->bo == bo) {
               ice->state.stage_dirty |=
                  CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
         uint32_t bound_image_views = shs->bound_image_views;
         while (bound_image_views) {
            const int i = u_bit_scan(&bound_image_views);
            struct crocus_image_view *iv = &shs->image[i];
            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);

            if (res->bo == bo)
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
         }
      }
   }
}

/* ------------------------------------------------------------------- */

/**
 * Translate a PIPE_CONTROL_WRITE_* flag into the hardware's
 * "Post Sync Operation" field value.  Returns 0 (no operation) when
 * none of the write flags is set.  At most one of these flags may be
 * set at a time (see get_post_sync_flags()).
 */
static unsigned
flags_to_post_sync_op(uint32_t flags)
{
   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
      return WriteImmediateData;

   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
      return WritePSDepthCount;

   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
      return WriteTimestamp;

   return 0;
}

/*
 * Do the given flags have a Post Sync or LRI Post Sync operation?
 *
 * Masks the flags down to the post-sync-related bits; the assert
 * enforces the hardware rule that the operations are mutually exclusive.
 */
static enum pipe_control_flags
get_post_sync_flags(enum pipe_control_flags flags)
{
   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
            PIPE_CONTROL_WRITE_DEPTH_COUNT |
            PIPE_CONTROL_WRITE_TIMESTAMP |
            PIPE_CONTROL_LRI_POST_SYNC_OP;

   /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
    * "LRI Post Sync Operation". So more than one bit set would be illegal.
    */
   assert(util_bitcount(flags) <= 1);

   return flags;
}

/* True when this batch is the compute batch (as opposed to render). */
#define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)

/**
 * Emit a series of PIPE_CONTROL commands, taking into account any
 * workarounds necessary to actually accomplish the caller's request.
 *
 * Unless otherwise noted, spec quotations in this function come from:
 *
 *    Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
 *    Restrictions for PIPE_CONTROL.
 *
 * You should not use this function directly. Use the helpers in
 * crocus_pipe_control.c instead, which may split the pipe control further.
 *
 * \param batch   the batch to emit the PIPE_CONTROL into
 * \param reason  debug description of the flush; printed under
 *                INTEL_DEBUG=pc and not otherwise used
 * \param flags   bitmask of PIPE_CONTROL_* operations to perform
 * \param bo      optional target BO for the post-sync write (may be NULL)
 * \param offset  write offset within \p bo
 * \param imm     value written for PIPE_CONTROL_WRITE_IMMEDIATE
 */
static void
crocus_emit_raw_pipe_control(struct crocus_batch *batch,
                             const char *reason,
                             uint32_t flags,
                             struct crocus_bo *bo,
                             uint32_t offset,
                             uint64_t imm)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
   UNUSED enum pipe_control_flags non_lri_post_sync_flags =
      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;

   /* Recursive PIPE_CONTROL workarounds --------------------------------
    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
    *
    * We do these first because we want to look at the original operation,
    * rather than any workarounds we set.
    */

   /* "Flush Types" workarounds ---------------------------------------------
    * We do these now because they may add post-sync operations or CS stalls.
    */

   if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
      /* Hardware workaround: SNB B-Spec says:
       *
       *   "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
       *    Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
       *    required."
       */
      crocus_emit_post_sync_nonzero_flush(batch);
   }

#if GFX_VER == 8
   if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
      /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
       *
       * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
       *  'Write PS Depth Count' or 'Write Timestamp'."
       *
       * If the caller didn't provide a post-sync write target, redirect a
       * dummy immediate write to the context's workaround BO.
       */
      if (!bo) {
         flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
         post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
         non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
         bo = batch->ice->workaround_bo;
         offset = batch->ice->workaround_offset;
      }
   }
#endif

#if GFX_VERx10 < 75
   if (flags & PIPE_CONTROL_DEPTH_STALL) {
      /* Project: PRE-HSW / Argument: Depth Stall
       *
       * "The following bits must be clear:
       *  - Render Target Cache Flush Enable ([12] of DW1)
       *  - Depth Cache Flush Enable ([0] of DW1)"
       */
      assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
                        PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
   }
#endif
   if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
       *
       *    "This bit must be DISABLED for operations other than writing
       *     PS_DEPTH_COUNT."
       *
       * This seems like nonsense. An Ivybridge workaround requires us to
       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
       * operation. Gen8+ requires us to emit depth stalls and depth cache
       * flushes together. So, it's hard to imagine this means anything other
       * than "we originally intended this to be used for PS_DEPTH_COUNT".
       *
       * We ignore the supposed restriction and do nothing.
       */
   }

   if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
      /* Project: PRE-HSW / Argument: Depth Cache Flush
       *
       * "Depth Stall must be clear ([13] of DW1)."
       */
      assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
   }

   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
       *
       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
       *     PS_DEPTH_COUNT or TIMESTAMP queries."
       *
       * TODO: Implement end-of-pipe checking.
       */
      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
                                  PIPE_CONTROL_WRITE_TIMESTAMP)));
   }

   if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
      /* From the PIPE_CONTROL instruction table, bit 1:
       *
       *    "This bit is ignored if Depth Stall Enable is set.
       *     Further, the render cache is not flushed even if Write Cache
       *     Flush Enable bit is set."
       *
       * We assert that the caller doesn't do this combination, to try and
       * prevent mistakes. It shouldn't hurt the GPU, though.
       *
       * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
       * and "Render Target Flush" combo is explicitly required for BTI
       * update workarounds.
       */
      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));
   }

   /* PIPE_CONTROL page workarounds ------------------------------------- */

   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
      /* From the PIPE_CONTROL page itself:
       *
       *    "IVB, HSW, BDW
       *     Restriction: Pipe_control with CS-stall bit set must be issued
       *     before a pipe-control command that has the State Cache
       *     Invalidate bit set."
       */
      flags |= PIPE_CONTROL_CS_STALL;
   }

   if ((GFX_VERx10 == 75)) {
      /* From the PIPE_CONTROL page itself:
       *
       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
       *     Prior to programming a PIPECONTROL command with any of the RO
       *     cache invalidation bit set, program a PIPECONTROL flush command
       *     with “CS stall” bit and “HDC Flush” bit set."
       *
       * TODO: Actually implement this. What's an HDC Flush?
       */
   }

   if (flags & PIPE_CONTROL_FLUSH_LLC) {
      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
       *
       *    "Project: ALL
       *     SW must always program Post-Sync Operation to "Write Immediate
       *     Data" when Flush LLC is set."
       *
       * For now, we just require the caller to do it.
       */
      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
   }

   /* "Post-Sync Operation" workarounds -------------------------------- */

   /* Project: All / Argument: Global Snapshot Count Reset [19]
    *
    * "This bit must not be exercised on any product.
    *  Requires stall bit ([20] of DW1) set."
    *
    * We don't use this, so we just assert that it isn't used.  The
    * PIPE_CONTROL instruction page indicates that they intended this
    * as a debug feature and don't think it is useful in production,
    * but it may actually be usable, should we ever want to.
    */
   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);

   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
      /* Project: All / Arguments:
       *
       * - Generic Media State Clear [16]
       * - Indirect State Pointers Disable [16]
       *
       *    "Requires stall bit ([20] of DW1) set."
       *
       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
       * State Clear) says:
       *
       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
       *     programmed prior to programming a PIPECONTROL command with "Media
       *     State Clear" set in GPGPU mode of operation"
       *
       * This is a subset of the earlier rule, so there's nothing to do.
       */
      flags |= PIPE_CONTROL_CS_STALL;
   }

   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
      /* Project: All / Argument: Store Data Index
       *
       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
       *  than '0'."
       *
       * For now, we just assert that the caller does this.  We might want to
       * automatically add a write to the workaround BO...
       */
      assert(non_lri_post_sync_flags != 0);
   }

   if (flags & PIPE_CONTROL_SYNC_GFDT) {
      /* Project: All / Argument: Sync GFDT
       *
       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
       *  than '0' or 0x2520[13] must be set."
       *
       * For now, we just assert that the caller does this.
       */
      assert(non_lri_post_sync_flags != 0);
   }

   if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
      /* Project: SNB, IVB, HSW / Argument: TLB inv
       *
       * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
       *  must be set to something other than '0'."
       *
       * For now, we just assert that the caller does this.
       */
      assert(non_lri_post_sync_flags != 0);
   }

   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
      /* Project: IVB+ / Argument: TLB inv
       *
       *    "Requires stall bit ([20] of DW1) set."
       *
       * Also, from the PIPE_CONTROL instruction table:
       *
       *    "Project: SKL+
       *     Post Sync Operation or CS stall must be set to ensure a TLB
       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
       *     cache to invalidate."
       *
       * This is not a subset of the earlier rule, so there's nothing to do.
       */
      flags |= PIPE_CONTROL_CS_STALL;
   }
#if GFX_VER == 8
   if (IS_COMPUTE_PIPELINE(batch)) {
      if (post_sync_flags ||
          (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
                    PIPE_CONTROL_DEPTH_STALL |
                    PIPE_CONTROL_RENDER_TARGET_FLUSH |
                    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                    PIPE_CONTROL_DATA_CACHE_FLUSH))) {
         /* Project: BDW / Arguments:
          *
          * - LRI Post Sync Operation   [23]
          * - Post Sync Op             [15:14]
          * - Notify En                 [8]
          * - Depth Stall              [13]
          * - Render Target Cache Flush [12]
          * - Depth Cache Flush         [0]
          * - DC Flush Enable           [5]
          *
          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
          *     Workloads."
          *
          * (The docs have separate table rows for each bit, with essentially
          *  the same workaround text.  We've combined them here.)
          */
         flags |= PIPE_CONTROL_CS_STALL;

         /* Also, from the PIPE_CONTROL instruction table, bit 20:
          *
          *    "Project: BDW
          *     This bit must be always set when PIPE_CONTROL command is
          *     programmed by GPGPU and MEDIA workloads, except for the cases
          *     when only Read Only Cache Invalidation bits are set (State
          *     Cache Invalidation Enable, Instruction cache Invalidation
          *     Enable, Texture Cache Invalidation Enable, Constant Cache
          *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
          *     need not implemented when FF_DOP_CG is disable via "Fixed
          *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
          *
          * It sounds like we could avoid CS stalls in some cases, but we
          * don't currently bother.  This list isn't exactly the list above,
          * either...
          */
      }
   }
#endif
   /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
    *
    * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
    *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
    *
    * Note that the kernel does CS stalls between batches, so we only need
    * to count them within a batch.  We currently naively count every 4, and
    * don't skip the ones with only read-cache-invalidate bits set.  This
    * may or may not be a problem...
    */
   if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
      if (flags & PIPE_CONTROL_CS_STALL) {
         /* If we're doing a CS stall, reset the counter and carry on. */
         batch->pipe_controls_since_last_cs_stall = 0;
      }

      /* If this is the fourth pipe control without a CS stall, do one now.
       */
      if (++batch->pipe_controls_since_last_cs_stall == 4) {
         batch->pipe_controls_since_last_cs_stall = 0;
         flags |= PIPE_CONTROL_CS_STALL;
      }
   }

   /* "Stall" workarounds ----------------------------------------------
    * These have to come after the earlier ones because we may have added
    * some additional CS stalls above.
    */

   if (flags & PIPE_CONTROL_CS_STALL) {
      /* Project: PRE-SKL, VLV, CHV
       *
       * "[All Stepping][All SKUs]:
       *
       *  One of the following must also be set:
       *
       *  - Render Target Cache Flush Enable ([12] of DW1)
       *  - Depth Cache Flush Enable ([0] of DW1)
       *  - Stall at Pixel Scoreboard ([1] of DW1)
       *  - Depth Stall ([13] of DW1)
       *  - Post-Sync Operation ([13] of DW1)
       *  - DC Flush Enable ([5] of DW1)"
       *
       * If we don't already have one of those bits set, we choose to add
       * "Stall at Pixel Scoreboard".  Some of the other bits require a
       * CS stall as a workaround (see above), which would send us into
       * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
       * appears to be safe, so we choose that.
       */
      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                               PIPE_CONTROL_WRITE_IMMEDIATE |
                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
                               PIPE_CONTROL_WRITE_TIMESTAMP |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
                               PIPE_CONTROL_DEPTH_STALL |
                               PIPE_CONTROL_DATA_CACHE_FLUSH;
      if (!(flags & wa_bits))
         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
   }

   /* Emit --------------------------------------------------------------- */

   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
      fprintf(stderr,
              "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
              (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
              (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
              (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ?
                 "Scoreboard " : "",
              (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
              (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
              (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
              (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
              (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
              (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
              (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
              (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
              (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
              (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
              (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
              (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
              (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
                 "SnapRes" : "",
              (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
                 "ISPDis" : "",
              (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
              (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
              (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ?
                 "WriteTimestamp " : "",
              imm, reason);
   }

   /* Finally, pack the accumulated flag set into an actual PIPE_CONTROL. */
   crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VER >= 7
      pc.LRIPostSyncOperation = NoLRIOperation;
      pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
      pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
#endif
#if GFX_VER >= 6
      pc.StoreDataIndex = 0;
      pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
      pc.GlobalSnapshotCountReset =
         flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
      pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
      pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
      pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
      pc.RenderTargetCacheFlushEnable =
         flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
      pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
      pc.StateCacheInvalidationEnable =
         flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
      pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
      pc.ConstantCacheInvalidationEnable =
         flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
#else
      pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
#endif
      pc.PostSyncOperation = flags_to_post_sync_op(flags);
      pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
      pc.InstructionCacheInvalidateEnable =
         flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
      pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
#if GFX_VER >= 5 || GFX_VERx10 == 45
      pc.IndirectStatePointersDisable =
         flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
#endif
#if GFX_VER >= 6
      pc.TextureCacheInvalidationEnable =
         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
#elif GFX_VER == 5 || GFX_VERx10 == 45
      pc.TextureCacheFlushEnable =
         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
#endif
      pc.Address = ggtt_bo(bo, offset);
      if (GFX_VER < 7 && bo)
         pc.DestinationAddressType = DAT_GGTT;
      pc.ImmediateData = imm;
   }
}

#if GFX_VER == 6
/**
 * Partition the Gen6 URB between the VS and GS and emit 3DSTATE_URB.
 *
 * \param vs_size    VS URB entry size, in 128-byte units (<= 5, see asserts)
 * \param gs_present whether a GS is enabled; if so it gets half the URB
 * \param gs_size    GS URB entry size, in 128-byte units (<= 5, see asserts)
 */
void
genX(crocus_upload_urb)(struct crocus_batch *batch,
                        unsigned vs_size,
                        bool gs_present,
                        unsigned gs_size)
{
   struct crocus_context *ice = batch->ice;
   int nr_vs_entries, nr_gs_entries;
   int total_urb_size = ice->urb.size * 1024; /* in bytes */
   const struct intel_device_info *devinfo = &batch->screen->devinfo;

   /* Calculate how many entries fit in each stage's section of the URB */
   if (gs_present) {
      nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
      nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
   } else {
      nr_vs_entries = total_urb_size / (vs_size * 128);
      nr_gs_entries = 0;
   }

   /* Then clamp to the maximum allowed by the hardware */
   if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
      nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];

   if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
      nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];

   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM).
    */
   ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
   ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

   assert(ice->urb.nr_vs_entries >=
          devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
   assert(ice->urb.nr_vs_entries % 4 == 0);
   assert(ice->urb.nr_gs_entries % 4 == 0);
   assert(vs_size <= 5);
   assert(gs_size <= 5);

   crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
      urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
      /* The AllocationSize fields hold the size-in-units minus one. */
      urb.VSURBEntryAllocationSize = vs_size - 1;

      urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
      urb.GSURBEntryAllocationSize = gs_size - 1;
   };
   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *   Because of a urb corruption caused by allocating a previous gsunit’s
    *   urb entry to vsunit software is required to send a "GS NULL
    *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
    *   a dummy DRAW call before any case where VS will be taking over GS URB
    *   space.
    *
    * It is not clear exactly what this means ("URB fence" is a command that
    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
    * a workaround.
    */
   if (ice->urb.gs_present && !gs_present)
      crocus_emit_mi_flush(batch);
   ice->urb.gs_present = gs_present;
}
#endif

/* NOTE(review): appears to be the per-generation "GPU state lost" hook;
 * crocus has nothing generation-specific to restore here, so the body is
 * intentionally empty.
 */
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
}

/**
 * Emit MI_REPORT_PERF_COUNT, asking the GPU to write a performance
 * counter report (tagged with \p report_id) at \p offset_in_bytes
 * within \p bo.  The command is only emitted on Gen7+; on earlier
 * generations this function is a no-op.
 */
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
#if GFX_VER >= 7
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
      mi_rpc.ReportID = report_id;
   }
#endif
}

/**
 * From the PRM, Volume 2a:
 *
 *    "Indirect State Pointers Disable
 *
 *    At the completion of the post-sync operation associated with this pipe
 *    control packet, the indirect state pointers in the hardware are
 *    considered invalid; the indirect pointers are not saved in the context.
 *    If any new indirect state commands are executed in the command stream
 *    while the pipe control is pending, the new indirect state commands are
 *    preserved.
 *
 *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
 *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
 *    commands are only considered as Indirect State Pointers. Once ISP is
 *    issued in a context, SW must initialize by programming push constant
 *    commands for all the shaders (at least to zero length) before attempting
 *    any rendering operation for the same context."
 *
 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
 * even though they point to a BO that has been already unreferenced at
 * the end of the previous batch buffer.  This has been fine so far since
 * we are protected by these scratch page (every address not covered by
 * a BO should be pointing to the scratch page).
But on CNL, it is 9018 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_* 9019 * instruction. 9020 * 9021 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the 9022 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a 9023 * context restore, so the mentioned hang doesn't happen. However, 9024 * software must program push constant commands for all stages prior to 9025 * rendering anything, so we flag them as dirty. 9026 * 9027 * Finally, we also make sure to stall at pixel scoreboard to make sure the 9028 * constants have been loaded into the EUs prior to disable the push constants 9029 * so that it doesn't hang a previous 3DPRIMITIVE. 9030 */ 9031#if GFX_VER >= 7 9032static void 9033gen7_emit_isp_disable(struct crocus_batch *batch) 9034{ 9035 crocus_emit_raw_pipe_control(batch, "isp disable", 9036 PIPE_CONTROL_STALL_AT_SCOREBOARD | 9037 PIPE_CONTROL_CS_STALL, 9038 NULL, 0, 0); 9039 crocus_emit_raw_pipe_control(batch, "isp disable", 9040 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE | 9041 PIPE_CONTROL_CS_STALL, 9042 NULL, 0, 0); 9043 9044 struct crocus_context *ice = batch->ice; 9045 ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS | 9046 CROCUS_STAGE_DIRTY_CONSTANTS_TCS | 9047 CROCUS_STAGE_DIRTY_CONSTANTS_TES | 9048 CROCUS_STAGE_DIRTY_CONSTANTS_GS | 9049 CROCUS_STAGE_DIRTY_CONSTANTS_FS); 9050} 9051#endif 9052 9053#if GFX_VER >= 7 9054static void 9055crocus_state_finish_batch(struct crocus_batch *batch) 9056{ 9057#if GFX_VERx10 == 75 9058 if (batch->name == CROCUS_BATCH_RENDER) { 9059 crocus_emit_mi_flush(batch); 9060 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 9061 ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset; 9062 } 9063 9064 crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH | 9065 PIPE_CONTROL_CS_STALL); 9066 } 9067#endif 9068 gen7_emit_isp_disable(batch); 9069} 9070#endif 9071 9072static void 9073crocus_batch_reset_dirty(struct crocus_batch 
*batch) 9074{ 9075 /* unreference any index buffer so it get reemitted. */ 9076 pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL); 9077 9078 /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch 9079 * as the old state batch won't still be available. 9080 */ 9081 batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER | 9082 CROCUS_DIRTY_COLOR_CALC_STATE; 9083 9084 batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS; 9085 9086 batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS; 9087 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS; 9088 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES; 9089 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS; 9090 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS; 9091 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS; 9092 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS; 9093 9094 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS; 9095 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES; 9096 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS; 9097 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS; 9098 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS; 9099 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS; 9100 9101 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS; 9102 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS; 9103 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS; 9104 batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT; 9105 9106#if GFX_VER >= 6 9107 /* SCISSOR_STATE */ 9108 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE; 9109 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT; 9110 batch->ice->state.dirty |= 
CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL; 9111 9112#endif 9113#if GFX_VER <= 5 9114 /* dirty the SF state on gen4/5 */ 9115 batch->ice->state.dirty |= CROCUS_DIRTY_RASTER; 9116 batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE; 9117 batch->ice->state.dirty |= CROCUS_DIRTY_CLIP; 9118 batch->ice->state.dirty |= CROCUS_DIRTY_WM; 9119#endif 9120#if GFX_VER >= 7 9121 /* Streamout dirty */ 9122 batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT; 9123 batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST; 9124 batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS; 9125#endif 9126} 9127 9128#if GFX_VERx10 == 75 9129struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice) 9130{ 9131 return &ice->state.cso_rast->cso; 9132} 9133#endif 9134 9135#if GFX_VER >= 6 9136static void update_so_strides(struct crocus_context *ice, 9137 uint16_t *strides) 9138{ 9139 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { 9140 struct crocus_stream_output_target *so = (void *)ice->state.so_target[i]; 9141 if (so) 9142 so->stride = strides[i] * sizeof(uint32_t); 9143 } 9144} 9145#endif 9146 9147static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp, 9148 int s, 9149 uint32_t *clamp_mask) 9150{ 9151#if GFX_VER < 8 9152 if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST && 9153 samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) { 9154 if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP) 9155 clamp_mask[0] |= (1 << s); 9156 if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP) 9157 clamp_mask[1] |= (1 << s); 9158 if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP) 9159 clamp_mask[2] |= (1 << s); 9160 } 9161#endif 9162} 9163 9164static void 9165crocus_set_frontend_noop(struct pipe_context *ctx, bool enable) 9166{ 9167 struct crocus_context *ice = (struct crocus_context *) ctx; 9168 9169 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) { 9170 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER; 9171 ice->state.stage_dirty |= 
CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; 9172 } 9173 9174 if (ice->batch_count == 1) 9175 return; 9176 9177 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) { 9178 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE; 9179 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE; 9180 } 9181} 9182 9183void 9184genX(crocus_init_screen_state)(struct crocus_screen *screen) 9185{ 9186 assert(screen->devinfo.verx10 == GFX_VERx10); 9187 assert(screen->devinfo.ver == GFX_VER); 9188 screen->vtbl.destroy_state = crocus_destroy_state; 9189 screen->vtbl.init_render_context = crocus_init_render_context; 9190 screen->vtbl.upload_render_state = crocus_upload_render_state; 9191#if GFX_VER >= 7 9192 screen->vtbl.init_compute_context = crocus_init_compute_context; 9193 screen->vtbl.upload_compute_state = crocus_upload_compute_state; 9194#endif 9195 screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control; 9196 screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count; 9197 screen->vtbl.rebind_buffer = crocus_rebind_buffer; 9198#if GFX_VERx10 >= 75 9199 screen->vtbl.load_register_reg32 = crocus_load_register_reg32; 9200 screen->vtbl.load_register_reg64 = crocus_load_register_reg64; 9201 screen->vtbl.load_register_imm32 = crocus_load_register_imm32; 9202 screen->vtbl.load_register_imm64 = crocus_load_register_imm64; 9203 screen->vtbl.store_data_imm32 = crocus_store_data_imm32; 9204 screen->vtbl.store_data_imm64 = crocus_store_data_imm64; 9205#endif 9206#if GFX_VER >= 7 9207 screen->vtbl.load_register_mem32 = crocus_load_register_mem32; 9208 screen->vtbl.load_register_mem64 = crocus_load_register_mem64; 9209 screen->vtbl.copy_mem_mem = crocus_copy_mem_mem; 9210 screen->vtbl.create_so_decl_list = crocus_create_so_decl_list; 9211#endif 9212 screen->vtbl.update_surface_base_address = crocus_update_surface_base_address; 9213#if GFX_VER >= 6 9214 screen->vtbl.store_register_mem32 = crocus_store_register_mem32; 9215 
screen->vtbl.store_register_mem64 = crocus_store_register_mem64; 9216#endif 9217 screen->vtbl.populate_vs_key = crocus_populate_vs_key; 9218 screen->vtbl.populate_tcs_key = crocus_populate_tcs_key; 9219 screen->vtbl.populate_tes_key = crocus_populate_tes_key; 9220 screen->vtbl.populate_gs_key = crocus_populate_gs_key; 9221 screen->vtbl.populate_fs_key = crocus_populate_fs_key; 9222 screen->vtbl.populate_cs_key = crocus_populate_cs_key; 9223 screen->vtbl.lost_genx_state = crocus_lost_genx_state; 9224#if GFX_VER >= 7 9225 screen->vtbl.finish_batch = crocus_state_finish_batch; 9226#endif 9227#if GFX_VER <= 5 9228 screen->vtbl.upload_urb_fence = crocus_upload_urb_fence; 9229 screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence; 9230#endif 9231 screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask; 9232 screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty; 9233 screen->vtbl.translate_prim_type = translate_prim_type; 9234#if GFX_VER >= 6 9235 screen->vtbl.update_so_strides = update_so_strides; 9236 screen->vtbl.get_so_offset = crocus_get_so_offset; 9237#endif 9238 9239 genX(crocus_init_blt)(screen); 9240} 9241 9242void 9243genX(crocus_init_state)(struct crocus_context *ice) 9244{ 9245 struct pipe_context *ctx = &ice->ctx; 9246 9247 ctx->create_blend_state = crocus_create_blend_state; 9248 ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state; 9249 ctx->create_rasterizer_state = crocus_create_rasterizer_state; 9250 ctx->create_sampler_state = crocus_create_sampler_state; 9251 ctx->create_sampler_view = crocus_create_sampler_view; 9252 ctx->create_surface = crocus_create_surface; 9253 ctx->create_vertex_elements_state = crocus_create_vertex_elements; 9254 ctx->bind_blend_state = crocus_bind_blend_state; 9255 ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state; 9256 ctx->bind_sampler_states = crocus_bind_sampler_states; 9257 ctx->bind_rasterizer_state = crocus_bind_rasterizer_state; 9258 ctx->bind_vertex_elements_state = 
crocus_bind_vertex_elements_state; 9259 ctx->delete_blend_state = crocus_delete_state; 9260 ctx->delete_depth_stencil_alpha_state = crocus_delete_state; 9261 ctx->delete_rasterizer_state = crocus_delete_state; 9262 ctx->delete_sampler_state = crocus_delete_state; 9263 ctx->delete_vertex_elements_state = crocus_delete_state; 9264 ctx->set_blend_color = crocus_set_blend_color; 9265 ctx->set_clip_state = crocus_set_clip_state; 9266 ctx->set_constant_buffer = crocus_set_constant_buffer; 9267 ctx->set_shader_buffers = crocus_set_shader_buffers; 9268 ctx->set_shader_images = crocus_set_shader_images; 9269 ctx->set_sampler_views = crocus_set_sampler_views; 9270 ctx->set_tess_state = crocus_set_tess_state; 9271 ctx->set_patch_vertices = crocus_set_patch_vertices; 9272 ctx->set_framebuffer_state = crocus_set_framebuffer_state; 9273 ctx->set_polygon_stipple = crocus_set_polygon_stipple; 9274 ctx->set_sample_mask = crocus_set_sample_mask; 9275 ctx->set_scissor_states = crocus_set_scissor_states; 9276 ctx->set_stencil_ref = crocus_set_stencil_ref; 9277 ctx->set_vertex_buffers = crocus_set_vertex_buffers; 9278 ctx->set_viewport_states = crocus_set_viewport_states; 9279 ctx->sampler_view_destroy = crocus_sampler_view_destroy; 9280 ctx->surface_destroy = crocus_surface_destroy; 9281 ctx->draw_vbo = crocus_draw_vbo; 9282 ctx->launch_grid = crocus_launch_grid; 9283 9284 ctx->set_frontend_noop = crocus_set_frontend_noop; 9285 9286#if GFX_VER >= 6 9287 ctx->create_stream_output_target = crocus_create_stream_output_target; 9288 ctx->stream_output_target_destroy = crocus_stream_output_target_destroy; 9289 ctx->set_stream_output_targets = crocus_set_stream_output_targets; 9290#endif 9291 9292 ice->state.dirty = ~0ull; 9293 ice->state.stage_dirty = ~0ull; 9294 9295 ice->state.statistics_counters_enabled = true; 9296 9297 ice->state.sample_mask = 0xff; 9298 ice->state.num_viewports = 1; 9299 ice->state.prim_mode = PIPE_PRIM_MAX; 9300 ice->state.reduced_prim_mode = PIPE_PRIM_MAX; 9301 
ice->state.genx = calloc(1, sizeof(struct crocus_genx_state)); 9302 ice->draw.derived_params.drawid = -1; 9303 9304 /* Default all scissor rectangles to be empty regions. */ 9305 for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) { 9306 ice->state.scissors[i] = (struct pipe_scissor_state) { 9307 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0, 9308 }; 9309 } 9310} 9311