1/*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23/**
24 * @file crocus_state.c
25 *
26 * ============================= GENXML CODE =============================
27 *              [This file is compiled once per generation.]
28 * =======================================================================
29 *
30 * This is the main state upload code.
31 *
32 * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
33 * complex, or highly reusable state can be created once, and bound and
34 * rebound multiple times.  This is modeled with the pipe->create_*_state()
35 * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
36 * streamed out on the fly, via pipe->set_*_state() hooks.
37 *
38 * OpenGL involves frequently mutating context state, which is mirrored in
39 * core Mesa by highly mutable data structures.  However, most applications
40 * typically draw the same things over and over - from frame to frame, most
41 * of the same objects are still visible and need to be redrawn.  So, rather
42 * than inventing new state all the time, applications usually mutate to swap
43 * between known states that we've seen before.
44 *
45 * Gallium isolates us from this mutation by tracking API state, and
46 * distilling it into a set of Constant State Objects, or CSOs.  Large,
47 * complex, or typically reusable state can be created once, then reused
48 * multiple times.  Drivers can create and store their own associated data.
49 * This create/bind model corresponds to the pipe->create_*_state() and
50 * pipe->bind_*_state() driver hooks.
51 *
52 * Some state is cheap to create, or expected to be highly dynamic.  Rather
53 * than creating and caching piles of CSOs for these, Gallium simply streams
54 * them out, via the pipe->set_*_state() driver hooks.
55 *
56 * To reduce draw time overhead, we try to compute as much state at create
57 * time as possible.  Wherever possible, we translate the Gallium pipe state
58 * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
59 * we can simply memcpy them into a batch buffer.
60 *
61 * No hardware matches the abstraction perfectly, so some commands require
62 * information from multiple CSOs.  In this case, we can store two copies
63 * of the packet (one in each CSO), and simply | together their DWords at
64 * draw time.  Sometimes the second set is trivial (one or two fields), so
65 * we simply pack it at draw time.
66 *
67 * There are two main components in the file below.  First, the CSO hooks
68 * create/bind/track state.  The second are the draw-time upload functions,
69 * crocus_upload_render_state() and crocus_upload_compute_state(), which read
70 * the context state and emit the commands into the actual batch.
71 */
72
73#include <errno.h>
74#include <stdio.h>
75
76#if HAVE_VALGRIND
77#include <memcheck.h>
78#include <valgrind.h>
79#define VG(x) x
80#ifdef DEBUG
81#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
82#endif
83#else
84#define VG(x)
85#endif
86
87#include "drm-uapi/i915_drm.h"
88#include "intel/common/intel_l3_config.h"
89#include "intel/common/intel_sample_positions.h"
90#include "intel/compiler/brw_compiler.h"
91#include "compiler/shader_info.h"
92#include "pipe/p_context.h"
93#include "pipe/p_defines.h"
94#include "pipe/p_screen.h"
95#include "pipe/p_state.h"
96#include "util/format/u_format.h"
97#include "util/half_float.h"
98#include "util/u_dual_blend.h"
99#include "util/u_framebuffer.h"
100#include "util/u_helpers.h"
101#include "util/u_inlines.h"
102#include "util/u_memory.h"
103#include "util/u_prim.h"
104#include "util/u_transfer.h"
105#include "util/u_upload_mgr.h"
106#include "util/u_viewport.h"
107#include "crocus_batch.h"
108#include "crocus_context.h"
109#include "crocus_defines.h"
110#include "crocus_pipe.h"
111#include "crocus_resource.h"
112
113#include "crocus_genx_macros.h"
114#include "intel/common/intel_guardband.h"
115#include "main/macros.h" /* UNCLAMPED_* */
116
117/**
118 * Statically assert that PIPE_* enums match the hardware packets.
119 * (As long as they match, we don't need to translate them.)
120 */
121UNUSED static void pipe_asserts()
122{
123#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
124
125   /* pipe_logicop happens to match the hardware. */
126   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
127   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
128   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
129   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
130   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
131   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
132   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
133   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
134   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
135   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
136   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
137   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
138   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
139   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
140   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
141   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
142
143   /* pipe_blend_func happens to match the hardware. */
144   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
145   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
146   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
147   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
148   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
149   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
150   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
151   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
152   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
153   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
154   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
155   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
156   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
157   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
158   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
159   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
160   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
161   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
162   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
163
164   /* pipe_blend_func happens to match the hardware. */
165   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
166   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
167   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
168   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
169   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
170
171   /* pipe_stencil_op happens to match the hardware. */
172   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
173   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
174   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
175   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
176   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
177   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
178   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
179   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
180
181#if GFX_VER >= 6
182   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
183   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
184   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
185#endif
186#undef PIPE_ASSERT
187}
188
/**
 * Translate a pipe_prim_type to the hardware 3DPRIM topology value.
 *
 * For PIPE_PRIM_PATCHES the hardware encodes the number of control points
 * in the topology itself, so the table stores (_3DPRIM_PATCHLIST_1 - 1)
 * and \p verts_per_patch is added below to select _3DPRIM_PATCHLIST_n.
 */
static unsigned
translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
{
   static const unsigned map[] = {
      [PIPE_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
      [PIPE_PRIM_LINES]                    = _3DPRIM_LINELIST,
      [PIPE_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
      [PIPE_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
      [PIPE_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
      [PIPE_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
      [PIPE_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
      [PIPE_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
      [PIPE_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
      [PIPE_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
#if GFX_VER >= 6
      /* Adjacency primitives (for geometry shaders) exist on Gen6+. */
      [PIPE_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
      [PIPE_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
      [PIPE_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
#endif
#if GFX_VER >= 7
      /* Patches (for tessellation) exist on Gen7+. */
      [PIPE_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
#endif
   };

   return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
}
216
217static unsigned
218translate_compare_func(enum pipe_compare_func pipe_func)
219{
220   static const unsigned map[] = {
221      [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
222      [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
223      [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
224      [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
225      [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
226      [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
227      [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
228      [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
229   };
230   return map[pipe_func];
231}
232
233static unsigned
234translate_shadow_func(enum pipe_compare_func pipe_func)
235{
236   /* Gallium specifies the result of shadow comparisons as:
237    *
238    *    1 if ref <op> texel,
239    *    0 otherwise.
240    *
241    * The hardware does:
242    *
243    *    0 if texel <op> ref,
244    *    1 otherwise.
245    *
246    * So we need to flip the operator and also negate.
247    */
248   static const unsigned map[] = {
249      [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
250      [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
251      [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
252      [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
253      [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
254      [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
255      [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
256      [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
257   };
258   return map[pipe_func];
259}
260
261static unsigned
262translate_cull_mode(unsigned pipe_face)
263{
264   static const unsigned map[4] = {
265      [PIPE_FACE_NONE]           = CULLMODE_NONE,
266      [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
267      [PIPE_FACE_BACK]           = CULLMODE_BACK,
268      [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
269   };
270   return map[pipe_face];
271}
272
273#if GFX_VER >= 6
274static unsigned
275translate_fill_mode(unsigned pipe_polymode)
276{
277   static const unsigned map[4] = {
278      [PIPE_POLYGON_MODE_FILL]           = FILL_MODE_SOLID,
279      [PIPE_POLYGON_MODE_LINE]           = FILL_MODE_WIREFRAME,
280      [PIPE_POLYGON_MODE_POINT]          = FILL_MODE_POINT,
281      [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
282   };
283   return map[pipe_polymode];
284}
285#endif
286
287static unsigned
288translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
289{
290   static const unsigned map[] = {
291      [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
292      [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
293      [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
294   };
295   return map[pipe_mip];
296}
297
/**
 * Translate a pipe_tex_wrap mode to the hardware Texture Coordinate Mode.
 *
 * NOTE(review): \p either_nearest presumably means "min or mag filter is
 * nearest"; only its effect on GL_CLAMP below is visible here — confirm
 * against the sampler-state caller.  Unsupported modes map to -1.
 */
static uint32_t
translate_wrap(unsigned pipe_wrap, bool either_nearest)
{
   static const unsigned map[] = {
      [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
      /* Gen8 has a dedicated HALF_BORDER mode for GL_CLAMP; earlier
       * parts approximate it with CLAMP_BORDER (or CLAMP, see below).
       */
#if GFX_VER == 8
      [PIPE_TEX_WRAP_CLAMP]                  = TCM_HALF_BORDER,
#else
      [PIPE_TEX_WRAP_CLAMP]                  = TCM_CLAMP_BORDER,
#endif
      [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
      [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
      [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,

      /* These are unsupported. */
      [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
   };
#if GFX_VER < 8
   /* Pre-Gen8, use plain CLAMP for GL_CLAMP with nearest filtering —
    * presumably because the border color is never reached in that case;
    * TODO confirm.
    */
   if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
      return TCM_CLAMP;
#endif
   return map[pipe_wrap];
}
323
324/**
325 * Equiv if brw_state_batch
326 */
327static uint32_t *
328stream_state(struct crocus_batch *batch,
329             unsigned size,
330             unsigned alignment,
331             uint32_t *out_offset)
332{
333   uint32_t offset = ALIGN(batch->state.used, alignment);
334
335   if (offset + size >= STATE_SZ && !batch->no_wrap) {
336      crocus_batch_flush(batch);
337      offset = ALIGN(batch->state.used, alignment);
338   } else if (offset + size >= batch->state.bo->size) {
339      const unsigned new_size =
340         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
341              MAX_STATE_SIZE);
342      crocus_grow_buffer(batch, true, batch->state.used, new_size);
343      assert(offset + size < batch->state.bo->size);
344   }
345
346   crocus_record_state_size(batch->state_sizes, offset, size);
347
348   batch->state.used = offset + size;
349   *out_offset = offset;
350
351   return (uint32_t *)batch->state.map + (offset >> 2);
352}
353
354/**
355 * stream_state() + memcpy.
356 */
357static uint32_t
358emit_state(struct crocus_batch *batch, const void *data, unsigned size,
359           unsigned alignment)
360{
361   unsigned offset = 0;
362   uint32_t *map = stream_state(batch, size, alignment, &offset);
363
364   if (map)
365      memcpy(map, data, size);
366
367   return offset;
368}
369
370#if GFX_VER <= 5
/**
 * (Gen4/5) Emit 3DSTATE_PIPELINED_POINTERS, pointing each fixed-function
 * unit (VS/GS/CLIP/SF/WM/CC) at its unit state in the state buffer.
 *
 * The GS pointer is only programmed when \p gs_active is set; CLIP is
 * always enabled.
 */
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata. */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}
394
395#endif
396/**
397 * Did field 'x' change between 'old_cso' and 'new_cso'?
398 *
399 * (If so, we may want to set some dirty flags.)
400 */
401#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
402#define cso_changed_memcmp(x) \
403   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
404
/**
 * Emit the end-of-pipe sync required *before* changing STATE_BASE_ADDRESS.
 * No-op on GFX_VER < 6.
 */
static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   /* The data cache only exists (as a flushable target) on Gen7+. */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}
439
/**
 * Emit the invalidations required *after* changing STATE_BASE_ADDRESS,
 * so stale SURFACE_STATE / binding-table data is not reused.  No-op on
 * GFX_VER < 6.
 */
static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
489
490#if GFX_VER >= 6
/**
 * Emit MI_STORE_REGISTER_MEM to write MMIO register \p reg into \p bo at
 * \p offset.  Predicated stores are only supported on GFX_VERx10 >= 75
 * (Haswell); requesting predication on older parts is a programming error.
 */
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}
507
/**
 * Store a 64-bit MMIO register pair to memory, as two adjacent 32-bit
 * MI_STORE_REGISTER_MEM commands.
 */
static void
crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   for (unsigned half = 0; half < 2; half++) {
      crocus_store_register_mem32(batch, reg + 4 * half,
                                  bo, offset + 4 * half, predicated);
   }
}
516#endif
517
518#if GFX_VER >= 7
/**
 * Emit MI_LOAD_REGISTER_IMM to write the immediate \p val into MMIO
 * register \p reg.
 */
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord      = val;
   }
}
/* Convenience wrapper taking a genxml register name instead of a raw
 * MMIO offset.
 */
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
528
529#if GFX_VERx10 >= 75
/**
 * Emit MI_LOAD_REGISTER_REG to copy MMIO register \p src into \p dst.
 * (Only available on GFX_VERx10 >= 75.)
 */
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
538
/**
 * Copy a 32-bit MMIO register \p src into register \p dst.
 */
static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}
545
/**
 * Copy a 64-bit MMIO register pair \p src into \p dst, one 32-bit half
 * at a time.
 */
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   for (unsigned half = 0; half < 2; half++)
      _crocus_emit_lrr(batch, dst + 4 * half, src + 4 * half);
}
553#endif
554
/**
 * Load a 32-bit immediate \p val into MMIO register \p reg.
 */
static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}
561
/**
 * Load a 64-bit immediate into an MMIO register pair.  MMIO registers are
 * 32 bits wide, so the value is split and written as two LRIs.
 */
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   const uint32_t lo = val & 0xffffffff;
   const uint32_t hi = val >> 32;

   _crocus_emit_lri(batch, reg, lo);
   _crocus_emit_lri(batch, reg + 4, hi);
}
569
570/**
571 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
572 */
573static void
574crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
575                           struct crocus_bo *bo, uint32_t offset)
576{
577   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
578      lrm.RegisterAddress = reg;
579      lrm.MemoryAddress = ro_bo(bo, offset);
580   }
581}
582
583/**
584 * Load a 64-bit value from a buffer into a MMIO register via
585 * two MI_LOAD_REGISTER_MEM commands.
586 */
587static void
588crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
589                           struct crocus_bo *bo, uint32_t offset)
590{
591   crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
592   crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
593}
594
595#if GFX_VERx10 >= 75
/**
 * Emit MI_STORE_DATA_IMM to write the 32-bit immediate \p imm into \p bo
 * at \p offset.
 */
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
/* NOTE(review): this guard is always true here, since the enclosing code
 * is only compiled for GFX_VERx10 >= 75.
 */
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
608
/**
 * Emit MI_STORE_DATA_IMM to write the 64-bit immediate \p imm into \p bo
 * at \p offset.
 */
static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
/* NOTE(review): this guard is always true here, since the enclosing code
 * is only compiled for GFX_VERx10 >= 75.
 */
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
626#endif
627
/**
 * Copy \p bytes (a multiple of 4; dword-aligned offsets) from \p src_bo
 * to \p dst_bo on the GPU, bouncing each dword through a scratch MMIO
 * register (load-register-mem + store-register-mem).
 */
static void
crocus_copy_mem_mem(struct crocus_batch *batch,
                    struct crocus_bo *dst_bo, uint32_t dst_offset,
                    struct crocus_bo *src_bo, uint32_t src_offset,
                    unsigned bytes)
{
   assert(bytes % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(src_offset % 4 == 0);

/* NOTE(review): this macro is never #undef'd, so it remains visible for
 * the rest of the file.
 */
#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
   for (unsigned i = 0; i < bytes; i += 4) {
      crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
                                 src_bo, src_offset + i);
      crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
                                  dst_bo, dst_offset + i, false);
   }
}
646#endif
647
648/**
649 * Gallium CSO for rasterizer state.
650 */
651struct crocus_rasterizer_state {
652   struct pipe_rasterizer_state cso;
653#if GFX_VER >= 6
654   uint32_t sf[GENX(3DSTATE_SF_length)];
655   uint32_t clip[GENX(3DSTATE_CLIP_length)];
656#endif
657#if GFX_VER >= 8
658   uint32_t raster[GENX(3DSTATE_RASTER_length)];
659#endif
660   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
661
662   uint8_t num_clip_plane_consts;
663   bool fill_mode_point_or_line;
664};
665
666#if GFX_VER <= 5
/* Indices into the per-unit URB limits table below. */
#define URB_VS 0
#define URB_GS 1
#define URB_CLP 2
#define URB_SF 3
#define URB_CS 4

/* Per fixed-function-unit URB allocation limits used when partitioning
 * the Gen4/Gen5 URB: minimum/preferred entry counts and minimum/maximum
 * entry sizes (in URB allocation units).
 */
static const struct {
   uint32_t min_nr_entries;
   uint32_t preferred_nr_entries;
   uint32_t min_entry_size;
   uint32_t  max_entry_size;
} limits[URB_CS+1] = {
   { 16, 32, 1, 5 },                        /* vs */
   { 4, 8,  1, 5 },                        /* gs */
   { 5, 10,  1, 5 },                        /* clp */
   { 1, 8,  1, 12 },                        /* sf */
   { 1, 4,  1, 32 }                        /* cs */
};
685
/* Compute the start offset of each URB section from the current entry
 * counts and sizes (note: this mutates ice->urb as a side effect), and
 * return whether the resulting layout fits within the total URB size.
 */
static bool check_urb_layout(struct crocus_context *ice)
{
   ice->urb.vs_start = 0;
   ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
   ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
   ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
   ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;

   return ice->urb.cs_start + ice->urb.nr_cs_entries *
      ice->urb.csize <= ice->urb.size;
}
697
698
/**
 * (Gen4/5) Recompute the URB partition if the requested entry sizes no
 * longer fit the current layout (or, in constrained mode, if they shrink).
 *
 * \param csize  requested constant (CURBE) entry size
 * \param vsize  requested vertex entry size
 * \param sfsize requested SF entry size
 * \return true if the layout changed and URB_FENCE must be re-emitted.
 */
static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
                           unsigned vsize, unsigned sfsize)
{
   struct crocus_context *ice = batch->ice;
   /* Clamp each requested size up to the unit's minimum entry size. */
   if (csize < limits[URB_CS].min_entry_size)
      csize = limits[URB_CS].min_entry_size;

   if (vsize < limits[URB_VS].min_entry_size)
      vsize = limits[URB_VS].min_entry_size;

   if (sfsize < limits[URB_SF].min_entry_size)
      sfsize = limits[URB_SF].min_entry_size;

   /* Recalculate when any size grew, or — when constrained — when any
    * size shrank (to try to escape constrained mode).
    */
   if (ice->urb.vsize < vsize ||
       ice->urb.sfsize < sfsize ||
       ice->urb.csize < csize ||
       (ice->urb.constrained && (ice->urb.vsize > vsize ||
                                 ice->urb.sfsize > sfsize ||
                                 ice->urb.csize > csize))) {


      ice->urb.csize = csize;
      ice->urb.sfsize = sfsize;
      ice->urb.vsize = vsize;

      /* Start from each unit's preferred entry count. */
      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;

      ice->urb.constrained = 0;

      /* Gen5 and G45 have larger URBs; try more generous VS/SF counts
       * first, falling back to the preferred counts if they don't fit.
       */
      if (GFX_VER == 5) {
         ice->urb.nr_vs_entries = 128;
         ice->urb.nr_sf_entries = 48;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
         }
      } else if (GFX_VERx10 == 45) {
         ice->urb.nr_vs_entries = 64;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
         }
      }

      /* Last resort: drop every unit to its minimum entry count. */
      if (!check_urb_layout(ice)) {
         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;

         /* Mark us as operating with constrained nr_entries, so that next
          * time we recalculate we'll resize the fences in the hope of
          * escaping constrained mode and getting back to normal performance.
          */
         ice->urb.constrained = 1;

         if (!check_urb_layout(ice)) {
            /* This is impossible, given the maximal sizes of urb
             * entries and the values for minimum nr of entries
             * provided above.
             */
            fprintf(stderr, "couldn't calculate URB layout!\n");
            exit(1);
         }

         if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
            fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (INTEL_DEBUG(DEBUG_URB))
         fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 ice->urb.vs_start,
                 ice->urb.gs_start,
                 ice->urb.clip_start,
                 ice->urb.sf_start,
                 ice->urb.cs_start,
                 ice->urb.size);
      return true;
   }
   return false;
}
793
/**
 * Emit URB_FENCE with the section boundaries computed by
 * crocus_calculate_urb_fence(), requesting reallocation for every unit.
 */
static void
crocus_upload_urb_fence(struct crocus_batch *batch)
{
   uint32_t urb_fence[3];
   _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
      urb.VSUnitURBReallocationRequest = 1;
      urb.GSUnitURBReallocationRequest = 1;
      urb.CLIPUnitURBReallocationRequest = 1;
      urb.SFUnitURBReallocationRequest = 1;
      urb.VFEUnitURBReallocationRequest = 1;
      urb.CSUnitURBReallocationRequest = 1;

      /* Each fence is the *end* of the unit's section (start of the next). */
      urb.VSFence = batch->ice->urb.gs_start;
      urb.GSFence = batch->ice->urb.clip_start;
      urb.CLIPFence = batch->ice->urb.sf_start;
      urb.SFFence = batch->ice->urb.cs_start;
      urb.CSFence = batch->ice->urb.size;
   }

   /* erratum: URB_FENCE must not cross a 64byte cacheline */
   /* NOTE(review): crocus_batch_bytes_used() appears to count bytes; with
    * dword-aligned commands, (bytes & 15) can only be 0/4/8/12, so the
    * `> 12` padding path may never trigger.  The historical i965 code did
    * this arithmetic in dwords (16 dwords == 64 bytes) — confirm the
    * intended units.
    */
   if ((crocus_batch_bytes_used(batch) & 15) > 12) {
      int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
      do {
         *(uint32_t *)batch->command.map_next = 0;
         batch->command.map_next += sizeof(uint32_t);
      } while (--pad);
   }

   crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
}
824
static bool
calculate_curbe_offsets(struct crocus_batch *batch)
{
   /* Compute the layout of the gen4/5 CURBE (constant URB entry) in
    * 512-bit (16-float) register units: FS constants first, then clip
    * planes, then VS constants.  Returns true when the layout changed
    * and the CURBE contents must be re-uploaded.
    */
   struct crocus_context *ice = batch->ice;

   unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
   unsigned total_regs;

   /* Count 512-bit registers needed for the FS push-constant ranges. */
   nr_fp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_fp_regs += (range->length + 1) / 2;
   }

   if (ice->state.cso_rast->cso.clip_plane_enable) {
      /* Six fixed planes plus one per enabled user clip plane; each plane
       * is 4 floats, rounded up to whole 512-bit (16-float) units.
       */
      unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
      nr_clip_regs = (nr_planes * 4 + 15) / 16;
   }

   /* Count 512-bit registers needed for the VS push-constant ranges. */
   nr_vp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_vp_regs += (range->length + 1) / 2;
   }
   if (nr_vp_regs == 0) {
      /* The pre-gen6 VS requires that some push constants get loaded no
       * matter what, or the GPU would hang.
       */
      nr_vp_regs = 1;
   }
   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;

   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
    * registers, or 1024 floats).  See CS_URB_STATE in the gen4 or gen5
    * (volume 1, part 1) PRMs.
    *
    * Note that in brw_fs.cpp we're only loading up to 16 EU registers of
    * values as push constants before spilling to pull constants, and in
    * brw_vec4.cpp we're loading up to 32 registers of push constants.  An EU
    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
    * regs for clip.
    */
   assert(total_regs <= 32);

   /* Lazy resize: only recompute the layout when a section grew, the clip
    * section changed, or the total shrank below a quarter of a previously
    * large (> 16 units) allocation.
    */
   if (nr_fp_regs > ice->curbe.wm_size ||
       nr_vp_regs > ice->curbe.vs_size ||
       nr_clip_regs != ice->curbe.clip_size ||
       (total_regs < ice->curbe.total_size / 4 &&
        ice->curbe.total_size > 16)) {

      GLuint reg = 0;

      /* Calculate a new layout:
       */
      reg = 0;
      ice->curbe.wm_start = reg;
      ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
      ice->curbe.clip_start = reg;
      ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
      ice->curbe.vs_start = reg;
      ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
      ice->curbe.total_size = reg;

      /* Dead debug dump of the computed layout; flip to 1 when debugging. */
      if (0)
         fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
                 ice->curbe.wm_start,
                 ice->curbe.wm_size,
                 ice->curbe.clip_start,
                 ice->curbe.clip_size,
                 ice->curbe.vs_start,
                 ice->curbe.vs_size );
      return true;
   }
   return false;
}
910
911static void
912upload_shader_consts(struct crocus_context *ice,
913                     gl_shader_stage stage,
914                     uint32_t *map,
915                     unsigned start)
916{
917   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
918   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
919   uint32_t *cmap;
920   bool found = false;
921   unsigned offset = start * 16;
922   int total = 0;
923   for (int i = 0; i < 4; i++) {
924      const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
925
926      if (range->length == 0)
927         continue;
928
929      unsigned block_index = crocus_bti_to_group_index(
930         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
931      unsigned len = range->length * 8 * sizeof(float);
932      unsigned start = range->start * 8 * sizeof(float);
933      struct pipe_transfer *transfer;
934
935      cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
936                                   ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
937                                   PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
938      if (cmap)
939         memcpy(&map[offset + (total * 8)], cmap, len);
940      pipe_buffer_unmap(&ice->ctx, transfer);
941      total += range->length;
942      found = true;
943   }
944
945   if (stage == MESA_SHADER_VERTEX && !found) {
946      /* The pre-gen6 VS requires that some push constants get loaded no
947       * matter what, or the GPU would hang.
948       */
949      unsigned len = 16;
950      memset(&map[offset], 0, len);
951   }
952}
953
/* Plane equations (a, b, c, d) for the six fixed view-volume clip planes,
 * loaded into the clipper section of the CURBE ahead of any user clip
 * planes (see gen4_upload_curbe).
 */
static const float fixed_plane[6][4] = {
   { 0,    0,   -1, 1 },
   { 0,    0,    1, 1 },
   { 0,   -1,    0, 1 },
   { 0,    1,    0, 1 },
   {-1,    0,    0, 1 },
   { 1,    0,    0, 1 }
};
962
static void
gen4_upload_curbe(struct crocus_batch *batch)
{
   /* Upload the CURBE contents (FS constants, clip planes, VS constants —
    * laid out by calculate_curbe_offsets()) and emit the CONSTANT_BUFFER
    * packet pointing at the new buffer.
    */
   struct crocus_context *ice = batch->ice;
   const unsigned sz = ice->curbe.total_size;        /* in 512-bit units */
   const unsigned buf_sz = sz * 16 * sizeof(float);  /* 16 floats per unit */

   if (sz == 0)
      goto emit;

   uint32_t *map;
   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
                  &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);

   /* fragment shader constants */
   if (ice->curbe.wm_size) {
      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
   }

   /* clipper constants */
   if (ice->curbe.clip_size) {
      unsigned offset = ice->curbe.clip_start * 16;  /* dword offset */
      float *fmap = (float *)map;
      unsigned i;
      /* If any planes are going this way, send them all this way:
       */
      for (i = 0; i < 6; i++) {
         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      /* Append each enabled user clip plane after the six fixed ones. */
      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
      struct pipe_clip_state *cp = &ice->state.clip_planes;
      while (mask) {
         const int j = u_bit_scan(&mask);
         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
         i++;
      }
   }

   /* vertex shader constants */
   if (ice->curbe.vs_size) {
      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
   }
   /* Dead debug dump of the whole CURBE; flip to 1 when debugging. */
   if (0) {
      for (int i = 0; i < sz*16; i+=4) {
         float *f = (float *)map;
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 f[i+0], f[i+1], f[i+2], f[i+3]);
      }
   }

emit:
   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
      if (ice->curbe.curbe_res) {
         /* BufferLength is programmed as the size minus one. */
         cb.BufferLength = ice->curbe.total_size - 1;
         cb.Valid = 1;
         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
      }
   }

#if GFX_VER == 4 && GFX_VERx10 != 45
   /* Work around a Broadwater/Crestline depth interpolator bug.  The
    * following sequence will cause GPU hangs:
    *
    * 1. Change state so that all depth related fields in CC_STATE are
    *    disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
    * 2. Emit a CONSTANT_BUFFER packet.
    * 3. Draw via 3DPRIMITIVE.
    *
    * The recommended workaround is to emit a non-pipelined state change after
    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
    *
    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
    * and always emit it when "PS Use Source Depth" is set.  We could be more
    * precise, but the additional complexity is probably not worth it.
    *
    */
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      ice->state.global_depth_offset_clamp = 0;
      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
   }
#endif
}
1055#endif
1056
1057#if GFX_VER >= 7
1058
/* Per-platform default values for the SQ credit initialization fields of
 * L3SQCREG1.  NOTE(review): these appear unused in this file — the packing
 * below uses the genxml SQGPCI_DEFAULT/BYT_SQGPCI_DEFAULT/SQHPCI_DEFAULT
 * constants instead; possibly kept for reference.
 */
#define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
#define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
#define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
1062
1063static void
1064setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
1065{
1066#if GFX_VER == 7
1067   const struct intel_device_info *devinfo = &batch->screen->devinfo;
1068   const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
1069   const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
1070                       cfg->n[INTEL_L3P_ALL];
1071   const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
1072                      cfg->n[INTEL_L3P_ALL];
1073   const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
1074                      cfg->n[INTEL_L3P_ALL];
1075   const bool has_slm = cfg->n[INTEL_L3P_SLM];
1076#endif
1077
1078   /* According to the hardware docs, the L3 partitioning can only be changed
1079    * while the pipeline is completely drained and the caches are flushed,
1080    * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1081    */
1082   crocus_emit_pipe_control_flush(batch, "l3_config",
1083                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
1084                                  PIPE_CONTROL_CS_STALL);
1085
1086   /* ...followed by a second pipelined PIPE_CONTROL that initiates
1087    * invalidation of the relevant caches.  Note that because RO invalidation
1088    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1089    * command is processed by the CS) we cannot combine it with the previous
1090    * stalling flush as the hardware documentation suggests, because that
1091    * would cause the CS to stall on previous rendering *after* RO
1092    * invalidation and wouldn't prevent the RO caches from being polluted by
1093    * concurrent rendering before the stall completes.  This intentionally
1094    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1095    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1096    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1097    * already guarantee that there is no concurrent GPGPU kernel execution
1098    * (see SKL HSD 2132585).
1099    */
1100   crocus_emit_pipe_control_flush(batch, "l3 config",
1101                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1102                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1103                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1104                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE);
1105
1106   /* Now send a third stalling flush to make sure that invalidation is
1107    * complete when the L3 configuration registers are modified.
1108    */
1109   crocus_emit_pipe_control_flush(batch, "l3 config",
1110                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
1111                                  PIPE_CONTROL_CS_STALL);
1112
1113#if GFX_VER == 8
1114   assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
1115   crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
1116      reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
1117      reg.URBAllocation = cfg->n[INTEL_L3P_URB];
1118      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1119      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1120      reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
1121   }
1122#else
1123   assert(!cfg->n[INTEL_L3P_ALL]);
1124
1125   /* When enabled SLM only uses a portion of the L3 on half of the banks,
1126    * the matching space on the remaining banks has to be allocated to a
1127    * client (URB for all validated configurations) set to the
1128    * lower-bandwidth 2-bank address hashing mode.
1129    */
1130   const bool urb_low_bw = has_slm && devinfo->platform != INTEL_PLATFORM_BYT;
1131   assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
1132
1133   /* Minimum number of ways that can be allocated to the URB. */
1134   const unsigned n0_urb = (devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0);
1135   assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
1136
1137   uint32_t l3sqcr1, l3cr2, l3cr3;
1138
1139   crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
1140      reg.ConvertDC_UC = !has_dc;
1141      reg.ConvertIS_UC = !has_is;
1142      reg.ConvertC_UC = !has_c;
1143      reg.ConvertT_UC = !has_t;
1144#if GFX_VERx10 == 75
1145      reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
1146#else
1147      reg.L3SQGeneralPriorityCreditInitialization =
1148         devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
1149#endif
1150      reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
1151   };
1152
1153   crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
1154      reg.SLMEnable = has_slm;
1155      reg.URBLowBandwidth = urb_low_bw;
1156      reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
1157#if !(GFX_VERx10 == 75)
1158      reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
1159#endif
1160      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1161      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1162   };
1163
1164   crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
1165      reg.ISAllocation = cfg->n[INTEL_L3P_IS];
1166      reg.ISLowBandwidth = 0;
1167      reg.CAllocation = cfg->n[INTEL_L3P_C];
1168      reg.CLowBandwidth = 0;
1169      reg.TAllocation = cfg->n[INTEL_L3P_T];
1170      reg.TLowBandwidth = 0;
1171   };
1172
1173   /* Set up the L3 partitioning. */
1174   crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
1175   crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
1176   crocus_emit_lri(batch, L3CNTLREG3, l3cr3);
1177
1178#if GFX_VERSIONx10 == 75
1179   /* TODO: Fail screen creation if command parser version < 4 */
1180   uint32_t scratch1, chicken3;
1181   crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
1182      reg.L3AtomicDisable = !has_dc;
1183   }
1184   crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
1185      reg.L3AtomicDisableMask = true;
1186      reg.L3AtomicDisable = !has_dc;
1187   }
1188   crocus_emit_lri(batch, SCRATCH1, scratch1);
1189   crocus_emit_lri(batch, CHICKEN3, chicken3);
1190#endif
1191#endif
1192}
1193
1194static void
1195emit_l3_state(struct crocus_batch *batch, bool compute)
1196{
1197   const struct intel_l3_config *const cfg =
1198      compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
1199
1200   setup_l3_config(batch, cfg);
1201   if (INTEL_DEBUG(DEBUG_L3)) {
1202      intel_dump_l3_config(cfg, stderr);
1203   }
1204}
1205
1206/**
1207 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
1208 */
1209static void
1210gen7_emit_cs_stall_flush(struct crocus_batch *batch)
1211{
1212   crocus_emit_pipe_control_write(batch,
1213                                  "workaround",
1214                                  PIPE_CONTROL_CS_STALL
1215                                  | PIPE_CONTROL_WRITE_IMMEDIATE,
1216                                  batch->ice->workaround_bo,
1217                                  batch->ice->workaround_offset, 0);
1218}
1219#endif
1220
/**
 * Emit PIPELINE_SELECT to switch between the 3D and GPGPU (compute)
 * pipelines, surrounded by the generation-specific flush workarounds
 * the switch requires.
 */
static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
#if GFX_VER == 8
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *    "Project: DEVSNB+
    *
    *     Software must ensure all the write caches are flushed through a
    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *     command to invalidate read only caches prior to programming
    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: PRE-DEVSNB
    *
    *   Software must ensure the current pipeline is flushed via an
    *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   /* IVB/BYT only: after switching back to 3D, stall the CS and emit a
    * dummy point-list 3DPRIMITIVE.  NOTE(review): presumably this primes
    * the 3D pipeline after a GPGPU->3D transition, matching the i965
    * behavior — confirm against the gen7 workaround docs.
    */
   if (pipeline == _3D) {
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}
1290
1291/**
1292 * The following diagram shows how we partition the URB:
1293 *
1294 *        16kB or 32kB               Rest of the URB space
1295 *   __________-__________   _________________-_________________
1296 *  /                     \ /                                   \
1297 * +-------------------------------------------------------------+
1298 * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
1299 * |       Constants       |               Entries               |
1300 * +-------------------------------------------------------------+
1301 *
1302 * Notably, push constants must be stored at the beginning of the URB
1303 * space, while entries can be stored anywhere.  Ivybridge and Haswell
1304 * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
1305 * doubles this (32kB).
1306 *
1307 * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
1308 * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
1309 * sized in increments of 2kB.
1310 *
1311 * Currently we split the constant buffer space evenly among whatever stages
1312 * are active.  This is probably not ideal, but simple.
1313 *
1314 * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
1315 * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
1316 * Haswell GT3 has 512kB of URB space.
1317 *
1318 * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
1319 * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
1320 */
1321#if GFX_VER >= 7
1322static void
1323crocus_alloc_push_constants(struct crocus_batch *batch)
1324{
1325   const unsigned push_constant_kb =
1326      batch->screen->devinfo.max_constant_urb_size_kb;
1327   unsigned size_per_stage = push_constant_kb / 5;
1328
1329   /* For now, we set a static partitioning of the push constant area,
1330    * assuming that all stages could be in use.
1331    *
1332    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
1333    *       see if that improves performance by offering more space to
1334    *       the VS/FS when those aren't in use.  Also, try dynamically
1335    *       enabling/disabling it like i965 does.  This would be more
1336    *       stalls and may not actually help; we don't know yet.
1337    */
1338   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
1339      crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
1340         alloc._3DCommandSubOpcode = 18 + i;
1341         alloc.ConstantBufferOffset = size_per_stage * i;
1342         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
1343      }
1344   }
1345
1346   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
1347    *
1348    *     A PIPE_CONTROL command with the CS Stall bit set must be programmed
1349    *     in the ring after this instruction.
1350    *
1351    * No such restriction exists for Haswell or Baytrail.
1352    */
1353   if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
1354      gen7_emit_cs_stall_flush(batch);
1355}
1356#endif
1357
1358/**
1359 * Upload the initial GPU state for a render context.
1360 *
1361 * This sets some invariant state that needs to be programmed a particular
1362 * way, but we never actually change.
1363 */
1364static void
1365crocus_init_render_context(struct crocus_batch *batch)
1366{
1367   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
1368
1369   emit_pipeline_select(batch, _3D);
1370
1371   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);
1372
1373#if GFX_VER >= 7
1374   emit_l3_state(batch, false);
1375#endif
1376#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
1377   crocus_emit_reg(batch, GENX(INSTPM), reg) {
1378      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1379      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1380   }
1381#endif
1382#if GFX_VER >= 5 || GFX_VERx10 == 45
1383   /* Use the legacy AA line coverage computation. */
1384   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
1385#endif
1386
1387   /* No polygon stippling offsets are necessary. */
1388   /* TODO: may need to set an offset for origin-UL framebuffers */
1389   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
1390
1391#if GFX_VER >= 7
1392   crocus_alloc_push_constants(batch);
1393#endif
1394
1395#if GFX_VER == 8
1396   /* Set the initial MSAA sample positions. */
1397   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
1398      INTEL_SAMPLE_POS_1X(pat._1xSample);
1399      INTEL_SAMPLE_POS_2X(pat._2xSample);
1400      INTEL_SAMPLE_POS_4X(pat._4xSample);
1401      INTEL_SAMPLE_POS_8X(pat._8xSample);
1402   }
1403
1404   /* Disable chromakeying (it's for media) */
1405   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
1406
1407   /* We want regular rendering, not special HiZ operations. */
1408   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
1409#endif
1410}
1411
1412#if GFX_VER >= 7
1413static void
1414crocus_init_compute_context(struct crocus_batch *batch)
1415{
1416   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
1417
1418   emit_pipeline_select(batch, GPGPU);
1419
1420#if GFX_VER >= 7
1421   emit_l3_state(batch, true);
1422#endif
1423}
1424#endif
1425
1426/**
1427 * Generation-specific context state (ice->state.genx->...).
1428 *
1429 * Most state can go in crocus_context directly, but these encode hardware
1430 * packets which vary by generation.
1431 */
1432struct crocus_genx_state {
1433   struct {
1434#if GFX_VER >= 7
1435      struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1436#endif
1437   } shaders[MESA_SHADER_STAGES];
1438
1439#if GFX_VER == 8
1440   bool pma_fix_enabled;
1441#endif
1442};
1443
1444/**
1445 * The pipe->set_blend_color() driver hook.
1446 *
1447 * This corresponds to our COLOR_CALC_STATE.
1448 */
1449static void
1450crocus_set_blend_color(struct pipe_context *ctx,
1451                       const struct pipe_blend_color *state)
1452{
1453   struct crocus_context *ice = (struct crocus_context *) ctx;
1454
1455   /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1456   memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1457#if GFX_VER <= 5
1458   ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
1459#else
1460   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1461#endif
1462}
1463
1464/**
1465 * Gallium CSO for blend state (see pipe_blend_state).
1466 */
1467struct crocus_blend_state {
1468#if GFX_VER == 8
1469   /** Partial 3DSTATE_PS_BLEND */
1470   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1471#endif
1472
1473   /** copy of BLEND_STATE */
1474   struct pipe_blend_state cso;
1475
1476   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1477   uint8_t blend_enables;
1478
1479   /** Bitfield of whether color writes are enabled for RT[i] */
1480   uint8_t color_write_enables;
1481
1482   /** Does RT[0] use dual color blending? */
1483   bool dual_color_blending;
1484};
1485
1486static enum pipe_blendfactor
1487fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1488{
1489   if (alpha_to_one) {
1490      if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1491         return PIPE_BLENDFACTOR_ONE;
1492
1493      if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1494         return PIPE_BLENDFACTOR_ZERO;
1495   }
1496
1497   return f;
1498}
1499
/* Gen6+ packs per-RT blend controls into BLEND_STATE_ENTRY; gen4/5 fold
 * them into COLOR_CALC_STATE.  Alias a common name so
 * set_blend_entry_bits() can be written once for both layouts.
 */
#if GFX_VER >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
1505
1506static bool
1507can_emit_logic_op(struct crocus_context *ice)
1508{
1509   /* all pre gen8 have logicop restricted to unorm */
1510   enum pipe_format pformat = PIPE_FORMAT_NONE;
1511   for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
1512      if (ice->state.framebuffer.cbufs[i]) {
1513         pformat = ice->state.framebuffer.cbufs[i]->format;
1514         break;
1515      }
1516   }
1517   return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
1518}
1519
/**
 * Fill in the blend fields of a hardware blend entry (gen6+
 * BLEND_STATE_ENTRY, gen4/5 COLOR_CALC_STATE) for render target @idx from
 * the bound blend CSO.  Returns whether RGB and alpha use different blend
 * functions or factors (independent alpha blend).
 */
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
                     struct crocus_blend_state *cso_blend,
                     int idx)
{
   struct crocus_context *ice = batch->ice;
   bool independent_alpha_blend = false;
   /* Without independent blend enable, every RT uses RT[0]'s settings. */
   const struct pipe_rt_blend_state *rt =
      &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
   const unsigned blend_enabled = rt->blend_enable;

   enum pipe_blendfactor src_rgb =
      fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor src_alpha =
      fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_rgb =
      fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_alpha =
      fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);

   if (rt->rgb_func != rt->alpha_func ||
       src_rgb != src_alpha || dst_rgb != dst_alpha)
      independent_alpha_blend = true;
   if (cso_blend->cso.logicop_enable) {
      /* Pre-gen8 hardware can only do logic ops on unorm targets (see
       * can_emit_logic_op); otherwise leave the logic op fields unset.
       */
      if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
         entry->LogicOpEnable = cso_blend->cso.logicop_enable;
         entry->LogicOpFunction = cso_blend->cso.logicop_func;
      }
   } else if (blend_enabled) {
      if (idx == 0) {
         /* RT[0]: only enable blending if the bound FS actually provides a
          * second source color when dual-source blending was requested.
          */
         struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
         struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
         entry->ColorBufferBlendEnable =
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      } else
         entry->ColorBufferBlendEnable = 1;

      entry->ColorBlendFunction          = rt->rgb_func;
      entry->AlphaBlendFunction          = rt->alpha_func;
      entry->SourceBlendFactor           = (int) src_rgb;
      entry->SourceAlphaBlendFactor      = (int) src_alpha;
      entry->DestinationBlendFactor      = (int) dst_rgb;
      entry->DestinationAlphaBlendFactor = (int) dst_alpha;
   }
#if GFX_VER <= 5
   /*
    * Gen4/GM45/ILK can't handle have ColorBufferBlendEnable == 0
    * when a dual src blend shader is in use. Setup dummy blending.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
   if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
      entry->ColorBufferBlendEnable = 1;
      entry->ColorBlendFunction = PIPE_BLEND_ADD;
      entry->AlphaBlendFunction = PIPE_BLEND_ADD;
      entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
      entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
   }
#endif
   return independent_alpha_blend;
}
1583
1584/**
1585 * The pipe->create_blend_state() driver hook.
1586 *
1587 * Translates a pipe_blend_state into crocus_blend_state.
1588 */
1589static void *
1590crocus_create_blend_state(struct pipe_context *ctx,
1591                          const struct pipe_blend_state *state)
1592{
1593   struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1594
1595   cso->blend_enables = 0;
1596   cso->color_write_enables = 0;
1597   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
1598
1599   cso->cso = *state;
1600   cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1601
1602#if GFX_VER == 8
1603   bool indep_alpha_blend = false;
1604#endif
1605   for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
1606      const struct pipe_rt_blend_state *rt =
1607         &state->rt[state->independent_blend_enable ? i : 0];
1608      if (rt->blend_enable)
1609         cso->blend_enables |= 1u << i;
1610      if (rt->colormask)
1611         cso->color_write_enables |= 1u << i;
1612#if GFX_VER == 8
1613      enum pipe_blendfactor src_rgb =
1614         fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1615      enum pipe_blendfactor src_alpha =
1616         fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1617      enum pipe_blendfactor dst_rgb =
1618         fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1619      enum pipe_blendfactor dst_alpha =
1620         fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1621
1622      if (rt->rgb_func != rt->alpha_func ||
1623          src_rgb != src_alpha || dst_rgb != dst_alpha)
1624         indep_alpha_blend = true;
1625#endif
1626   }
1627
1628#if GFX_VER == 8
1629   crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1630      /* pb.HasWriteableRT is filled in at draw time.
1631       * pb.AlphaTestEnable is filled in at draw time.
1632       *
1633       * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1634       * setting it when dual color blending without an appropriate shader.
1635       */
1636
1637      pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1638      pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1639
1640      /* The casts prevent warnings about implicit enum type conversions. */
1641      pb.SourceBlendFactor =
1642         (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1643      pb.SourceAlphaBlendFactor =
1644         (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1645      pb.DestinationBlendFactor =
1646         (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1647      pb.DestinationAlphaBlendFactor =
1648         (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1649   }
1650#endif
1651   return cso;
1652}
1653
1654/**
1655 * The pipe->bind_blend_state() driver hook.
1656 *
1657 * Bind a blending CSO and flag related dirty bits.
1658 */
1659static void
1660crocus_bind_blend_state(struct pipe_context *ctx, void *state)
1661{
1662   struct crocus_context *ice = (struct crocus_context *) ctx;
1663   struct crocus_blend_state *cso = state;
1664
1665   ice->state.cso_blend = cso;
1666   ice->state.blend_enables = cso ? cso->blend_enables : 0;
1667
1668   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
1669   ice->state.dirty |= CROCUS_DIRTY_WM;
1670#if GFX_VER >= 6
1671   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1672#endif
1673#if GFX_VER >= 7
1674   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
1675#endif
1676#if GFX_VER == 8
1677   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1678   ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1679#endif
1680   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1681   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1682   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
1683}
1684
1685/**
1686 * Return true if the FS writes to any color outputs which are not disabled
1687 * via color masking.
1688 */
1689static bool
1690has_writeable_rt(const struct crocus_blend_state *cso_blend,
1691                 const struct shader_info *fs_info)
1692{
1693   if (!fs_info)
1694      return false;
1695
1696   unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1697
1698   if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1699      rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
1700
1701   return cso_blend->color_write_enables & rt_outputs;
1702}
1703
1704/**
1705 * Gallium CSO for depth, stencil, and alpha testing state.
1706 */
struct crocus_depth_stencil_alpha_state {
   /* Full copy of the original Gallium depth/stencil/alpha state. */
   struct pipe_depth_stencil_alpha_state cso;

   /* Cached flags: whether this state can write depth / stencil, used by
    * bind-time dirty tracking and the Gfx8 PMA fix (see want_pma_fix).
    */
   bool depth_writes_enabled;
   bool stencil_writes_enabled;
};
1713
1714/**
1715 * The pipe->create_depth_stencil_alpha_state() driver hook.
1716 *
1717 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1718 * testing state since we need pieces of it in a variety of places.
1719 */
1720static void *
1721crocus_create_zsa_state(struct pipe_context *ctx,
1722                        const struct pipe_depth_stencil_alpha_state *state)
1723{
1724   struct crocus_depth_stencil_alpha_state *cso =
1725      malloc(sizeof(struct crocus_depth_stencil_alpha_state));
1726
1727   bool two_sided_stencil = state->stencil[1].enabled;
1728   cso->cso = *state;
1729
1730   cso->depth_writes_enabled = state->depth_writemask;
1731   cso->stencil_writes_enabled =
1732      state->stencil[0].writemask != 0 ||
1733      (two_sided_stencil && state->stencil[1].writemask != 0);
1734
1735   /* The state tracker needs to optimize away EQUAL writes for us. */
1736   assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1737
1738   return cso;
1739}
1740
1741/**
1742 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1743 *
1744 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1745 */
static void
crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   /* NOTE: cso_changed() is a macro that compares fields of old_cso and
    * new_cso by these exact variable names.
    */
   struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
   struct crocus_depth_stencil_alpha_state *new_cso = state;

   if (new_cso) {
      /* Alpha reference value lives in COLOR_CALC_STATE. */
      if (cso_changed(cso.alpha_ref_value))
         ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;

      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
      /* On Gfx6+, alpha test enable/func are packed in BLEND_STATE. */
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;

      if (cso_changed(cso.alpha_func))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER == 8
      /* Gfx8 mirrors alpha test enable in 3DSTATE_PS_BLEND. */
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif

      /* Depth writability affects pending resolve/flush decisions. */
      if (cso_changed(depth_writes_enabled))
         ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

      ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
      ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;

#if GFX_VER <= 5
      ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
   }

   ice->state.cso_zsa = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
#endif
#if GFX_VER == 8
   /* Depth/stencil state is an input to the PMA-fix equations. */
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
#endif
   /* Shaders may be specialized on ZSA state (non-orthogonal state). */
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
}
1792
#if GFX_VER == 8
/* Decide whether the Gfx8 "PMA fix" (pixel mask array stall avoidance)
 * should be enabled for the current combination of FB/ZSA/blend/FS state.
 * Only compiled for GFX_VER == 8; see genX(crocus_update_pma_fix) for the
 * register write.
 */
static bool
want_pma_fix(struct crocus_context *ice)
{
   UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct crocus_blend_state *cso_blend = ice->state.cso_blend;

   /* In very specific combinations of state, we can instruct Gfx8-9 hardware
    * to avoid stalling at the pixel mask array.  The state equations are
    * documented in these places:
    *
    * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    *    (Technically the stencil PMA treats ForceKillPix differently,
    *     but I think this is a documentation oversight, and we don't
    *     ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * Also, we never use the normal drawing path for HiZ ops; these are true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   /* No depth/stencil attachment at all: the fix cannot apply. */
   if (!cso_fb->zsbuf)
      return false;

   struct crocus_resource *zres, *sres;
   crocus_get_depth_stencil_resources(devinfo,
                                      cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   /* NOTE(review): the ChromaKeyKillEnable term from the docs has no
    * counterpart here — presumably chroma key kill is never enabled by
    * this driver; confirm before relying on it.
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
                     cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->cso.depth_enabled)
      return false;

   /* Stencil writes only count when a stencil buffer (sres) is present. */
   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
          (killpixels && (cso_zsa->depth_writes_enabled ||
                          (sres && cso_zsa->stencil_writes_enabled)));
}
#endif
1916void
1917genX(crocus_update_pma_fix)(struct crocus_context *ice,
1918                            struct crocus_batch *batch,
1919                            bool enable)
1920{
1921#if GFX_VER == 8
1922   struct crocus_genx_state *genx = ice->state.genx;
1923
1924   if (genx->pma_fix_enabled == enable)
1925      return;
1926
1927   genx->pma_fix_enabled = enable;
1928
1929   /* According to the Broadwell PIPE_CONTROL documentation, software should
1930    * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1931    * prior to the LRI.  If stencil buffer writes are enabled, then a Render        * Cache Flush is also necessary.
1932    *
1933    * The Gfx9 docs say to use a depth stall rather than a command streamer
1934    * stall.  However, the hardware seems to violently disagree.  A full
1935    * command streamer stall seems to be needed in both cases.
1936    */
1937   crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1938                                  PIPE_CONTROL_CS_STALL |
1939                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1940                                  PIPE_CONTROL_RENDER_TARGET_FLUSH);
1941
1942   crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1943      reg.NPPMAFixEnable = enable;
1944      reg.NPEarlyZFailsDisable = enable;
1945      reg.NPPMAFixEnableMask = true;
1946      reg.NPEarlyZFailsDisableMask = true;
1947   }
1948
1949   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1950    * Flush bits is often necessary.  We do it regardless because it's easier.
1951    * The render cache flush is also necessary if stencil writes are enabled.
1952    *
1953    * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1954    * flushes seem to work just as well.
1955    */
1956   crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1957                                  PIPE_CONTROL_DEPTH_STALL |
1958                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1959                                  PIPE_CONTROL_RENDER_TARGET_FLUSH);
1960#endif
1961}
1962
1963static float
1964get_line_width(const struct pipe_rasterizer_state *state)
1965{
1966   float line_width = state->line_width;
1967
1968   /* From the OpenGL 4.4 spec:
1969    *
1970    * "The actual width of non-antialiased lines is determined by rounding
1971    *  the supplied width to the nearest integer, then clamping it to the
1972    *  implementation-dependent maximum non-antialiased line width."
1973    */
1974   if (!state->multisample && !state->line_smooth)
1975      line_width = roundf(state->line_width);
1976
1977   if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1978      /* For 1 pixel line thickness or less, the general anti-aliasing
1979       * algorithm gives up, and a garbage line is generated.  Setting a
1980       * Line Width of 0.0 specifies the rasterization of the "thinnest"
1981       * (one-pixel-wide), non-antialiased lines.
1982       *
1983       * Lines rendered with zero Line Width are rasterized using the
1984       * "Grid Intersection Quantization" rules as specified by the
1985       * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1986       */
1987      /* hack around this for gfx4/5 fps counters in hud. */
1988      line_width = GFX_VER < 6 ? 1.5f : 0.0f;
1989   }
1990   return line_width;
1991}
1992
1993/**
1994 * The pipe->create_rasterizer_state() driver hook.
1995 */
1996static void *
1997crocus_create_rasterizer_state(struct pipe_context *ctx,
1998                               const struct pipe_rasterizer_state *state)
1999{
2000   struct crocus_rasterizer_state *cso =
2001      malloc(sizeof(struct crocus_rasterizer_state));
2002
2003   cso->fill_mode_point_or_line =
2004      state->fill_front == PIPE_POLYGON_MODE_LINE ||
2005      state->fill_front == PIPE_POLYGON_MODE_POINT ||
2006      state->fill_back == PIPE_POLYGON_MODE_LINE ||
2007      state->fill_back == PIPE_POLYGON_MODE_POINT;
2008
2009   if (state->clip_plane_enable != 0)
2010      cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2011   else
2012      cso->num_clip_plane_consts = 0;
2013
2014   cso->cso = *state;
2015
2016#if GFX_VER >= 6
2017   float line_width = get_line_width(state);
2018
2019   crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2020      sf.StatisticsEnable = true;
2021      sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2022      sf.LineEndCapAntialiasingRegionWidth =
2023         state->line_smooth ? _10pixels : _05pixels;
2024      sf.LastPixelEnable = state->line_last_pixel;
2025#if GFX_VER <= 7
2026      sf.AntialiasingEnable = state->line_smooth;
2027#endif
2028#if GFX_VER == 8
2029      struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2030      if (screen->devinfo.platform == INTEL_PLATFORM_CHV)
2031         sf.CHVLineWidth = line_width;
2032      else
2033         sf.LineWidth = line_width;
2034#else
2035      sf.LineWidth = line_width;
2036#endif
2037      sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2038      sf.PointWidth = state->point_size;
2039
2040      if (state->flatshade_first) {
2041         sf.TriangleFanProvokingVertexSelect = 1;
2042      } else {
2043         sf.TriangleStripListProvokingVertexSelect = 2;
2044         sf.TriangleFanProvokingVertexSelect = 2;
2045         sf.LineStripListProvokingVertexSelect = 1;
2046      }
2047
2048#if GFX_VER == 6
2049      sf.AttributeSwizzleEnable = true;
2050      if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2051         sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
2052      else
2053         sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
2054#endif
2055
2056#if GFX_VER <= 7
2057      sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
2058
2059#if GFX_VER >= 6
2060      sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
2061      sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
2062      sf.GlobalDepthOffsetEnablePoint = state->offset_point;
2063      sf.GlobalDepthOffsetConstant = state->offset_units * 2;
2064      sf.GlobalDepthOffsetScale = state->offset_scale;
2065      sf.GlobalDepthOffsetClamp = state->offset_clamp;
2066
2067      sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2068      sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
2069#endif
2070
2071      sf.CullMode = translate_cull_mode(state->cull_face);
2072      sf.ScissorRectangleEnable = true;
2073
2074#if GFX_VERx10 == 75
2075      sf.LineStippleEnable = state->line_stipple_enable;
2076#endif
2077#endif
2078   }
2079#endif
2080
2081#if GFX_VER == 8
2082   crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2083      rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2084      rr.CullMode = translate_cull_mode(state->cull_face);
2085      rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2086      rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2087      rr.DXMultisampleRasterizationEnable = state->multisample;
2088      rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2089      rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2090      rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2091      rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2092      rr.GlobalDepthOffsetScale = state->offset_scale;
2093      rr.GlobalDepthOffsetClamp = state->offset_clamp;
2094      rr.SmoothPointEnable = state->point_smooth;
2095      rr.AntialiasingEnable = state->line_smooth;
2096      rr.ScissorRectangleEnable = state->scissor;
2097      rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2098   }
2099#endif
2100
2101#if GFX_VER >= 6
2102   crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2103      /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2104       * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2105       */
2106#if GFX_VER >= 7
2107      cl.EarlyCullEnable = true;
2108#endif
2109
2110#if GFX_VER == 7
2111      cl.FrontWinding = state->front_ccw ? 1 : 0;
2112      cl.CullMode = translate_cull_mode(state->cull_face);
2113#endif
2114      cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2115#if GFX_VER < 8
2116      cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2117#endif
2118      cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2119      cl.GuardbandClipTestEnable = true;
2120      cl.ClipEnable = true;
2121      cl.MinimumPointWidth = 0.125;
2122      cl.MaximumPointWidth = 255.875;
2123
2124#if GFX_VER == 8
2125      cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2126#endif
2127
2128      if (state->flatshade_first) {
2129         cl.TriangleFanProvokingVertexSelect = 1;
2130      } else {
2131         cl.TriangleStripListProvokingVertexSelect = 2;
2132         cl.TriangleFanProvokingVertexSelect = 2;
2133         cl.LineStripListProvokingVertexSelect = 1;
2134      }
2135   }
2136#endif
2137
2138   /* Remap from 0..255 back to 1..256 */
2139   const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2140
2141   crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2142      if (state->line_stipple_enable) {
2143         line.LineStipplePattern = state->line_stipple_pattern;
2144         line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2145         line.LineStippleRepeatCount = line_stipple_factor;
2146      }
2147   }
2148
2149   return cso;
2150}
2151
2152/**
2153 * The pipe->bind_rasterizer_state() driver hook.
2154 *
2155 * Bind a rasterizer CSO and flag related dirty bits.
2156 */
2157static void
2158crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2159{
2160   struct crocus_context *ice = (struct crocus_context *) ctx;
2161   struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
2162   struct crocus_rasterizer_state *new_cso = state;
2163
2164   if (new_cso) {
2165      /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2166      if (cso_changed_memcmp(line_stipple))
2167         ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
2168#if GFX_VER >= 6
2169      if (cso_changed(cso.half_pixel_center))
2170         ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
2171      if (cso_changed(cso.scissor))
2172         ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
2173      if (cso_changed(cso.multisample))
2174	 ice->state.dirty |= CROCUS_DIRTY_WM;
2175#else
2176      if (cso_changed(cso.scissor))
2177         ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
2178#endif
2179
2180      if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
2181         ice->state.dirty |= CROCUS_DIRTY_WM;
2182
2183#if GFX_VER >= 6
2184      if (cso_changed(cso.rasterizer_discard))
2185         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
2186
2187      if (cso_changed(cso.flatshade_first))
2188         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
2189#endif
2190
2191      if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
2192          cso_changed(cso.clip_halfz))
2193         ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
2194
2195#if GFX_VER >= 7
2196      if (cso_changed(cso.sprite_coord_enable) ||
2197          cso_changed(cso.sprite_coord_mode) ||
2198          cso_changed(cso.light_twoside))
2199         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
2200#endif
2201#if GFX_VER <= 5
2202      if (cso_changed(cso.clip_plane_enable))
2203         ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
2204#endif
2205   }
2206
2207   ice->state.cso_rast = new_cso;
2208   ice->state.dirty |= CROCUS_DIRTY_RASTER;
2209   ice->state.dirty |= CROCUS_DIRTY_CLIP;
2210#if GFX_VER <= 5
2211   ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
2212   ice->state.dirty |= CROCUS_DIRTY_WM;
2213#endif
2214#if GFX_VER <= 6
2215   ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
2216#endif
2217   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
2218}
2219
2220/**
2221 * Return true if the given wrap mode requires the border color to exist.
2222 *
2223 * (We can skip uploading it if the sampler isn't going to use it.)
2224 */
2225static bool
2226wrap_mode_needs_border_color(unsigned wrap_mode)
2227{
2228#if GFX_VER == 8
2229   return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2230#else
2231   return wrap_mode == TCM_CLAMP_BORDER;
2232#endif
2233}
2234
2235/**
2236 * Gallium CSO for sampler state.
2237 */
struct crocus_sampler_state {
   /* Full copy of the original Gallium sampler state. */
   struct pipe_sampler_state pstate;
   /* Border color as supplied by the state tracker (may be swizzled at
    * upload time for faked A/LA formats).
    */
   union pipe_color_union border_color;
   /* True if any wrap mode samples the border color (see
    * wrap_mode_needs_border_color), so it must be uploaded.
    */
   bool needs_border_color;
   /* Wrap modes already translated to hardware TCM_* values. */
   unsigned wrap_s;
   unsigned wrap_t;
   unsigned wrap_r;
   /* Filter/LOD values, possibly adjusted from pstate at create time. */
   unsigned mag_img_filter;
   float min_lod;
};
2248
2249/**
2250 * The pipe->create_sampler_state() driver hook.
2251 *
2252 * We fill out SAMPLER_STATE (except for the border color pointer), and
2253 * store that on the CPU.  It doesn't make sense to upload it to a GPU
2254 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
 * all bound sampler states to be in contiguous memory.
2256 */
2257static void *
2258crocus_create_sampler_state(struct pipe_context *ctx,
2259                            const struct pipe_sampler_state *state)
2260{
2261   struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
2262
2263   if (!cso)
2264      return NULL;
2265
2266   STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2267   STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2268
2269   bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
2270      state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
2271   cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
2272   cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
2273   cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
2274
2275   cso->pstate = *state;
2276
2277   memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2278
2279   cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
2280                             wrap_mode_needs_border_color(cso->wrap_t) ||
2281                             wrap_mode_needs_border_color(cso->wrap_r);
2282
2283   cso->min_lod = state->min_lod;
2284   cso->mag_img_filter = state->mag_img_filter;
2285
2286   // XXX: explain this code ported from ilo...I don't get it at all...
2287   if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2288       state->min_lod > 0.0f) {
2289      cso->min_lod = 0.0f;
2290      cso->mag_img_filter = state->min_img_filter;
2291   }
2292
2293   return cso;
2294}
2295
2296/**
2297 * The pipe->bind_sampler_states() driver hook.
2298 */
2299static void
2300crocus_bind_sampler_states(struct pipe_context *ctx,
2301                           enum pipe_shader_type p_stage,
2302                           unsigned start, unsigned count,
2303                           void **states)
2304{
2305   struct crocus_context *ice = (struct crocus_context *) ctx;
2306   gl_shader_stage stage = stage_from_pipe(p_stage);
2307   struct crocus_shader_state *shs = &ice->state.shaders[stage];
2308
2309   assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
2310
2311   bool dirty = false;
2312
2313   for (int i = 0; i < count; i++) {
2314      if (shs->samplers[start + i] != states[i]) {
2315         shs->samplers[start + i] = states[i];
2316         dirty = true;
2317      }
2318   }
2319
2320   if (dirty) {
2321#if GFX_VER <= 5
2322      if (p_stage == PIPE_SHADER_FRAGMENT)
2323         ice->state.dirty |= CROCUS_DIRTY_WM;
2324      else if (p_stage == PIPE_SHADER_VERTEX)
2325         ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2326#endif
2327      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2328      ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
2329   }
2330}
2331
/* Wrap-mode overrides applied at sampler upload time for specific
 * texture-target workarounds (see crocus_upload_sampler_state).
 */
enum samp_workaround {
   SAMP_NORMAL,      /* Use the wrap modes from the CSO unchanged. */
   SAMP_CUBE_CLAMP,  /* Force all three axes to TCM_CLAMP. */
   SAMP_CUBE_CUBE,   /* Force all three axes to TCM_CUBE. */
   SAMP_T_WRAP,      /* Force only the T axis to TCM_WRAP. */
};
2338
/* Pack one SAMPLER_STATE into 'map', applying the requested wrap-mode
 * workaround and pointing at the previously uploaded border color.
 */
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
                            struct crocus_sampler_state *cso,
                            uint32_t border_color_offset,
                            enum samp_workaround samp_workaround,
                            uint32_t first_level,
                            void *map)
{
   struct pipe_sampler_state *state = &cso->pstate;
   uint32_t wrap_s, wrap_t, wrap_r;

   /* Start from the CSO's pre-translated wrap modes... */
   wrap_s = cso->wrap_s;
   wrap_t = cso->wrap_t;
   wrap_r = cso->wrap_r;

   /* ...then override them per the requested workaround. */
   switch (samp_workaround) {
   case SAMP_CUBE_CLAMP:
      wrap_s = TCM_CLAMP;
      wrap_t = TCM_CLAMP;
      wrap_r = TCM_CLAMP;
      break;
   case SAMP_CUBE_CUBE:
      wrap_s = TCM_CUBE;
      wrap_t = TCM_CUBE;
      wrap_r = TCM_CUBE;
      break;
   case SAMP_T_WRAP:
      wrap_t = TCM_WRAP;
      break;
   default:
      break;
   }

   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
      samp.TCXAddressControlMode = wrap_s;
      samp.TCYAddressControlMode = wrap_t;
      samp.TCZAddressControlMode = wrap_r;

#if GFX_VER >= 6
      samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
#endif
      samp.MinModeFilter = state->min_img_filter;
      samp.MagModeFilter = cso->mag_img_filter;
      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
      samp.MaximumAnisotropy = RATIO21;

      /* Anisotropic filtering kicks in at max_anisotropy >= 2; it only
       * replaces filters that were LINEAR.
       */
      if (state->max_anisotropy >= 2) {
         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
            samp.AnisotropicAlgorithm = EWAApproximation;
#endif
         }

         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;

         /* Map 2,4,6,... to the hardware RATIO encodings, clamped. */
         samp.MaximumAnisotropy =
            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
      }

      /* Set address rounding bits if not using nearest filtering. */
      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMinFilterRoundingEnable = true;
         samp.VAddressMinFilterRoundingEnable = true;
         samp.RAddressMinFilterRoundingEnable = true;
      }

      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMagFilterRoundingEnable = true;
         samp.VAddressMagFilterRoundingEnable = true;
         samp.RAddressMagFilterRoundingEnable = true;
      }

      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
         samp.ShadowFunction = translate_shadow_func(state->compare_func);

      /* Hardware LOD clamp range differs by generation. */
      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;

#if GFX_VER == 8
      samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
      samp.LODPreClampEnable = true;
#endif
      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);

#if GFX_VER == 6
      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif

      /* Pre-Gfx6 border color pointers carry a relocation; later gens
       * use a plain state-buffer offset.
       */
#if GFX_VER < 6
      samp.BorderColorPointer =
         ro_bo(batch->state.bo, border_color_offset);
#else
      samp.BorderColorPointer = border_color_offset;
#endif
   }
}
2440
/**
 * Stream a SAMPLER_BORDER_COLOR_STATE structure into the batch.
 *
 * Writes the state buffer offset of the packed structure to @bc_offset.
 * The border color from @cso may first be swizzled to compensate for
 * alpha / luminance-alpha format faking, and on Haswell integer formats
 * it is packed per channel bit-width as the PRM requires.
 *
 * @param batch      batch whose state buffer receives the structure
 * @param cso        sampler state providing the border color
 * @param tex        bound view (may be NULL); supplies the real format
 * @param bc_offset  out: state-buffer offset of the uploaded structure
 */
static void
crocus_upload_border_color(struct crocus_batch *batch,
                           struct crocus_sampler_state *cso,
                           struct crocus_sampler_view *tex,
                           uint32_t *bc_offset)
{
   /* We may need to swizzle the border color for format faking.
    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
    * This means we need to move the border color's A channel into
    * the R or G channels so that those read swizzles will move it
    * back into A.
    */
   enum pipe_format internal_format = PIPE_FORMAT_NONE;
   union pipe_color_union *color = &cso->border_color;
   union pipe_color_union tmp;
   if (tex) {
      internal_format = tex->res->internal_format;

      if (util_format_is_alpha(internal_format)) {
         /* Alpha-only format faked as R. */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      } else if (util_format_is_luminance_alpha(internal_format) &&
                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
         /* Luminance-alpha faked as RG (L8A8_SRGB is excluded here). */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      }
   }
   bool is_integer_format = util_format_is_pure_integer(internal_format);
   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
   /* Alignment differs per generation; HSW integer border colors need a
    * larger (512B) alignment — presumably a hardware requirement, see PRM.
    */
   const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);

   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };

/* Helper macros to assign one channel at various widths. */
#define ASSIGN(dst, src)                        \
   do {                                         \
      dst = src;                                \
   } while (0)

#define ASSIGNu16(dst, src)                     \
   do {                                         \
      dst = (uint16_t)src;                      \
   } while (0)

#define ASSIGNu8(dst, src)                      \
   do {                                         \
      dst = (uint8_t)src;                       \
   } while (0)

/* Apply `macro` to all four channels of the named color field group. */
#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
   macro(state.BorderColor ## _color_type ## Red, src[0]);      \
   macro(state.BorderColor ## _color_type ## Green, src[1]);    \
   macro(state.BorderColor ## _color_type ## Blue, src[2]);     \
   macro(state.BorderColor ## _color_type ## Alpha, src[3]);

#if GFX_VER >= 8
   /* On Broadwell, the border color is represented as four 32-bit floats,
    * integers, or unsigned values, interpreted according to the surface
    * format.  This matches the sampler->BorderColor union exactly; just
    * memcpy the values.
    */
   BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
   if (is_integer_format) {
      const struct util_format_description *format_desc =
         util_format_description(internal_format);

      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
       * "If any color channel is missing from the surface format,
       *  corresponding border color should be programmed as zero and if
       *  alpha channel is missing, corresponding Alpha border color should
       *  be programmed as 1."
       */
      unsigned c[4] = { 0, 0, 0, 1 };
      for (int i = 0; i < 4; i++) {
         if (format_desc->channel[i].size)
            c[i] = color->ui[i];
      }

      /* Pack at the bit-width of the surface's first channel. */
      switch (format_desc->channel[0].size) {
      case 8:
         /* Copy RGBA in order. */
         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
         break;
      case 10:
         /* R10G10B10A2_UINT is treated like a 16-bit format. */
      case 16:
         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
         break;
      case 32:
         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
            /* Careful inspection of the tables reveals that for RG32 formats,
             * the green channel needs to go where blue normally belongs.
             */
            state.BorderColor32bitRed = c[0];
            state.BorderColor32bitBlue = c[1];
            state.BorderColor32bitAlpha = 1;
         } else {
            /* Copy RGBA in order. */
            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
         }
         break;
      default:
         assert(!"Invalid number of bits per channel in integer format.");
         break;
      }
   } else {
      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
   }
#elif GFX_VER == 5 || GFX_VER == 6
   /* Ironlake/Sandybridge store the border color pre-converted in every
    * representation the sampler might need.
    */
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);

#define MESA_FLOAT_TO_HALF(dst, src)            \
   dst = _mesa_float_to_half(src);

   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);

#undef MESA_FLOAT_TO_HALF

   /* Derive the snorm8 values from the high byte of the snorm16 ones. */
   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;

   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);

#elif GFX_VER == 4
   BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif

#undef ASSIGN
#undef BORDER_COLOR_ATTR

   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}
2588
2589/**
2590 * Upload the sampler states into a contiguous area of GPU memory, for
2591 * for 3DSTATE_SAMPLER_STATE_POINTERS_*.
2592 *
2593 * Also fill out the border color state pointers.
2594 */
2595static void
2596crocus_upload_sampler_states(struct crocus_context *ice,
2597                             struct crocus_batch *batch, gl_shader_stage stage)
2598{
2599   struct crocus_shader_state *shs = &ice->state.shaders[stage];
2600   const struct shader_info *info = crocus_get_shader_info(ice, stage);
2601
2602   /* We assume the state tracker will call pipe->bind_sampler_states()
2603    * if the program's number of textures changes.
2604    */
2605   unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
2606
2607   if (!count)
2608      return;
2609
2610   /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2611    * in the dynamic state memory zone, so we can point to it via the
2612    * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2613    */
2614   unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2615   uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
2616
2617   if (unlikely(!map))
2618      return;
2619
2620   for (int i = 0; i < count; i++) {
2621      struct crocus_sampler_state *state = shs->samplers[i];
2622      struct crocus_sampler_view *tex = shs->textures[i];
2623
2624      if (!state || !tex) {
2625         memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2626      } else {
2627         unsigned border_color_offset = 0;
2628         if (state->needs_border_color) {
2629            crocus_upload_border_color(batch, state, tex, &border_color_offset);
2630         }
2631
2632         enum samp_workaround wa = SAMP_NORMAL;
2633         /* There's a bug in 1D texture sampling - it actually pays
2634          * attention to the wrap_t value, though it should not.
2635          * Override the wrap_t value here to GL_REPEAT to keep
2636          * any nonexistent border pixels from floating in.
2637          */
2638         if (tex->base.target == PIPE_TEXTURE_1D)
2639            wa = SAMP_T_WRAP;
2640         else if (tex->base.target == PIPE_TEXTURE_CUBE ||
2641                  tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
2642            /* Cube maps must use the same wrap mode for all three coordinate
2643             * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
2644             *
2645             * Ivybridge and Baytrail seem to have problems with CUBE mode and
2646             * integer formats.  Fall back to CLAMP for now.
2647             */
2648            if (state->pstate.seamless_cube_map &&
2649                !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
2650               wa = SAMP_CUBE_CUBE;
2651            else
2652               wa = SAMP_CUBE_CLAMP;
2653         }
2654
2655         uint32_t first_level = 0;
2656         if (tex->base.target != PIPE_BUFFER)
2657            first_level = tex->base.u.tex.first_level;
2658
2659         crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
2660      }
2661
2662      map += GENX(SAMPLER_STATE_length);
2663   }
2664}
2665
2666/**
2667 * The pipe->create_sampler_view() driver hook.
2668 */
static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context *ctx,
                           struct pipe_resource *tex,
                           const struct pipe_sampler_view *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));

   if (!isv)
      return NULL;

   /* initialize base object */
   isv->base = *tmpl;
   isv->base.context = ctx;
   isv->base.texture = NULL;
   pipe_reference_init(&isv->base.reference, 1);
   pipe_resource_reference(&isv->base.texture, tex);

   if (util_format_is_depth_or_stencil(tmpl->format)) {
      /* Pick the depth or stencil sub-resource the view's format selects. */
      struct crocus_resource *zres, *sres;
      const struct util_format_description *desc =
         util_format_description(tmpl->format);

      crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);

      tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;

      /* For stencil-only sampling on gen7, substitute the shadow copy
       * when one exists — presumably the real stencil buffer can't be
       * sampled directly there (NOTE(review): confirm rationale).
       */
      if (tex->format == PIPE_FORMAT_S8_UINT)
         if (GFX_VER == 7 && sres->shadow)
            tex = &sres->shadow->base.b;
   }

   isv->res = (struct crocus_resource *) tex;

   isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;

   if (isv->base.target == PIPE_TEXTURE_CUBE ||
       isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
      usage |= ISL_SURF_USAGE_CUBE_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   /* Fold the view template's swizzle into the format's own swizzle. */
   enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
   crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);

   /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
   if (GFX_VER < 6 &&
       (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
        tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
      isv->swizzle[0] = tmpl->swizzle_g;
      isv->swizzle[1] = tmpl->swizzle_g;
      isv->swizzle[2] = tmpl->swizzle_g;
      isv->swizzle[3] = tmpl->swizzle_g;
   }

   /* Snapshot the resource's current fast-clear color. */
   isv->clear_color = isv->res->aux.clear_color;

   isv->view = (struct isl_view) {
      .format = fmt.fmt,
#if GFX_VERx10 >= 75
      /* Haswell+ can apply the swizzle in the surface state. */
      .swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
      },
#else
      /* swizzling handled in shader code */
      .swizzle = ISL_SWIZZLE_IDENTITY,
#endif
      .usage = usage,
   };

   /* Fill out SURFACE_STATE for this view. */
   if (tmpl->target != PIPE_BUFFER) {
      isv->view.base_level = tmpl->u.tex.first_level;
      isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;

      /* Hardware older than skylake ignores this value */
      assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);

      // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
      isv->view.base_array_layer = tmpl->u.tex.first_layer;
      isv->view.array_len =
         tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
   }
#if GFX_VER >= 6
   /* just create a second view struct for texture gather just in case */
   isv->gather_view = isv->view;

#if GFX_VER == 7
   /* RG32 gather needs the _LD variant of the format — presumably a
    * gen7 gather4 restriction (NOTE(review): confirm against the PRM).
    */
   if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
       fmt.fmt == ISL_FORMAT_R32G32_SINT ||
       fmt.fmt == ISL_FORMAT_R32G32_UINT) {
      isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
#if GFX_VERx10 >= 75
      isv->gather_view.swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
      };
#endif
   }
#endif
#if GFX_VER == 6
   /* Sandybridge's gather4 message is broken for integer formats.
    * To work around this, we pretend the surface is UNORM for
    * 8 or 16-bit formats, and emit shader instructions to recover
    * the real INT/UINT value.  For 32-bit formats, we pretend
    * the surface is FLOAT, and simply reinterpret the resulting
    * bits.
    */
   switch (fmt.fmt) {
   case ISL_FORMAT_R8_SINT:
   case ISL_FORMAT_R8_UINT:
      isv->gather_view.format = ISL_FORMAT_R8_UNORM;
      break;

   case ISL_FORMAT_R16_SINT:
   case ISL_FORMAT_R16_UINT:
      isv->gather_view.format = ISL_FORMAT_R16_UNORM;
      break;

   case ISL_FORMAT_R32_SINT:
   case ISL_FORMAT_R32_UINT:
      isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
      break;

   default:
      break;
   }
#endif
#endif
   /* Finish any pending aux import before the view can be used. */
   if (tmpl->target != PIPE_BUFFER) {
      if (crocus_resource_unfinished_aux_import(isv->res))
         crocus_resource_finish_aux_import(&screen->base, isv->res);

   }

   return &isv->base;
}
2814
2815static void
2816crocus_sampler_view_destroy(struct pipe_context *ctx,
2817                            struct pipe_sampler_view *state)
2818{
2819   struct crocus_sampler_view *isv = (void *) state;
2820   pipe_resource_reference(&state->texture, NULL);
2821   free(isv);
2822}
2823
2824/**
2825 * The pipe->create_surface() driver hook.
2826 *
2827 * In Gallium nomenclature, "surfaces" are a view of a resource that
2828 * can be bound as a render target or depth/stencil buffer.
2829 */
2830static struct pipe_surface *
2831crocus_create_surface(struct pipe_context *ctx,
2832                      struct pipe_resource *tex,
2833                      const struct pipe_surface *tmpl)
2834{
2835   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2836   const struct intel_device_info *devinfo = &screen->devinfo;
2837
2838   isl_surf_usage_flags_t usage = 0;
2839   if (tmpl->writable)
2840      usage = ISL_SURF_USAGE_STORAGE_BIT;
2841   else if (util_format_is_depth_or_stencil(tmpl->format))
2842      usage = ISL_SURF_USAGE_DEPTH_BIT;
2843   else
2844      usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
2845
2846   const struct crocus_format_info fmt =
2847      crocus_format_for_usage(devinfo, tmpl->format, usage);
2848
2849   if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
2850       !isl_format_supports_rendering(devinfo, fmt.fmt)) {
2851      /* Framebuffer validation will reject this invalid case, but it
2852       * hasn't had the opportunity yet.  In the meantime, we need to
2853       * avoid hitting ISL asserts about unsupported formats below.
2854       */
2855      return NULL;
2856   }
2857
2858   struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
2859   struct pipe_surface *psurf = &surf->base;
2860   struct crocus_resource *res = (struct crocus_resource *) tex;
2861
2862   if (!surf)
2863      return NULL;
2864
2865   pipe_reference_init(&psurf->reference, 1);
2866   pipe_resource_reference(&psurf->texture, tex);
2867   psurf->context = ctx;
2868   psurf->format = tmpl->format;
2869   psurf->width = tex->width0;
2870   psurf->height = tex->height0;
2871   psurf->texture = tex;
2872   psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
2873   psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
2874   psurf->u.tex.level = tmpl->u.tex.level;
2875
2876   uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2877
2878   struct isl_view *view = &surf->view;
2879   *view = (struct isl_view) {
2880      .format = fmt.fmt,
2881      .base_level = tmpl->u.tex.level,
2882      .levels = 1,
2883      .base_array_layer = tmpl->u.tex.first_layer,
2884      .array_len = array_len,
2885      .swizzle = ISL_SWIZZLE_IDENTITY,
2886      .usage = usage,
2887   };
2888
2889#if GFX_VER >= 6
2890   struct isl_view *read_view = &surf->read_view;
2891   *read_view = (struct isl_view) {
2892      .format = fmt.fmt,
2893      .base_level = tmpl->u.tex.level,
2894      .levels = 1,
2895      .base_array_layer = tmpl->u.tex.first_layer,
2896      .array_len = array_len,
2897      .swizzle = ISL_SWIZZLE_IDENTITY,
2898      .usage = ISL_SURF_USAGE_TEXTURE_BIT,
2899   };
2900#endif
2901
2902   surf->clear_color = res->aux.clear_color;
2903
2904   /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
2905   if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
2906                          ISL_SURF_USAGE_STENCIL_BIT))
2907      return psurf;
2908
2909   if (!isl_format_is_compressed(res->surf.format)) {
2910      if (crocus_resource_unfinished_aux_import(res))
2911         crocus_resource_finish_aux_import(&screen->base, res);
2912
2913      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2914      uint64_t temp_offset;
2915      uint32_t temp_x, temp_y;
2916
2917      isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
2918                                          res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
2919                                          res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
2920                                          &temp_offset, &temp_x, &temp_y);
2921      if (!devinfo->has_surface_tile_offset &&
2922          (temp_x || temp_y)) {
2923         /* Original gfx4 hardware couldn't draw to a non-tile-aligned
2924          * destination.
2925          */
2926         /* move to temp */
2927         struct pipe_resource wa_templ = (struct pipe_resource) {
2928            .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
2929            .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
2930            .depth0 = 1,
2931            .array_size = 1,
2932            .format = res->base.b.format,
2933            .target = PIPE_TEXTURE_2D,
2934            .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
2935         };
2936         surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
2937         view->base_level = 0;
2938         view->base_array_layer = 0;
2939         view->array_len = 1;
2940         struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
2941         memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
2942      }
2943      return psurf;
2944   }
2945
2946   /* The resource has a compressed format, which is not renderable, but we
2947    * have a renderable view format.  We must be attempting to upload blocks
2948    * of compressed data via an uncompressed view.
2949    *
2950    * In this case, we can assume there are no auxiliary buffers, a single
2951    * miplevel, and that the resource is single-sampled.  Gallium may try
2952    * and create an uncompressed view with multiple layers, however.
2953    */
2954   assert(!isl_format_is_compressed(fmt.fmt));
2955   assert(res->surf.samples == 1);
2956   assert(view->levels == 1);
2957
2958   /* TODO: compressed pbo uploads aren't working here */
2959   return NULL;
2960
2961   uint64_t offset_B = 0;
2962   uint32_t tile_x_sa = 0, tile_y_sa = 0;
2963
2964   if (view->base_level > 0) {
2965      /* We can't rely on the hardware's miplevel selection with such
2966       * a substantial lie about the format, so we select a single image
2967       * using the Tile X/Y Offset fields.  In this case, we can't handle
2968       * multiple array slices.
2969       *
2970       * On Broadwell, HALIGN and VALIGN are specified in pixels and are
2971       * hard-coded to align to exactly the block size of the compressed
2972       * texture.  This means that, when reinterpreted as a non-compressed
2973       * texture, the tile offsets may be anything and we can't rely on
2974       * X/Y Offset.
2975       *
2976       * Return NULL to force the state tracker to take fallback paths.
2977       */
2978      // TODO: check if the gen7 check is right, originally gen8
2979      if (view->array_len > 1 || GFX_VER == 7)
2980         return NULL;
2981
2982      const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
2983      isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
2984                              view->base_level,
2985                              is_3d ? 0 : view->base_array_layer,
2986                              is_3d ? view->base_array_layer : 0,
2987                              &surf->surf,
2988                              &offset_B, &tile_x_sa, &tile_y_sa);
2989
2990      /* We use address and tile offsets to access a single level/layer
2991       * as a subimage, so reset level/layer so it doesn't offset again.
2992       */
2993      view->base_array_layer = 0;
2994      view->base_level = 0;
2995   } else {
2996      /* Level 0 doesn't require tile offsets, and the hardware can find
2997       * array slices using QPitch even with the format override, so we
2998       * can allow layers in this case.  Copy the original ISL surface.
2999       */
3000      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
3001   }
3002
3003   /* Scale down the image dimensions by the block size. */
3004   const struct isl_format_layout *fmtl =
3005      isl_format_get_layout(res->surf.format);
3006   surf->surf.format = fmt.fmt;
3007   surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
3008   surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
3009   tile_x_sa /= fmtl->bw;
3010   tile_y_sa /= fmtl->bh;
3011
3012   psurf->width = surf->surf.logical_level0_px.width;
3013   psurf->height = surf->surf.logical_level0_px.height;
3014
3015   return psurf;
3016}
3017
3018#if GFX_VER >= 7
3019static void
3020fill_default_image_param(struct brw_image_param *param)
3021{
3022   memset(param, 0, sizeof(*param));
3023   /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3024    * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3025    * detailed explanation of these parameters.
3026    */
3027   param->swizzling[0] = 0xff;
3028   param->swizzling[1] = 0xff;
3029}
3030
3031static void
3032fill_buffer_image_param(struct brw_image_param *param,
3033                        enum pipe_format pfmt,
3034                        unsigned size)
3035{
3036   const unsigned cpp = util_format_get_blocksize(pfmt);
3037
3038   fill_default_image_param(param);
3039   param->size[0] = size / cpp;
3040   param->stride[0] = cpp;
3041}
3042
3043#endif
3044
3045/**
3046 * The pipe->set_shader_images() driver hook.
3047 */
static void
crocus_set_shader_images(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start_slot, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         const struct pipe_image_view *p_images)
{
/* Shader images are only supported on gfx7+, where this file is built. */
#if GFX_VER >= 7
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_genx_state *genx = ice->state.genx;
   struct brw_image_param *image_params = genx->shaders[stage].image_param;

   /* Forget the previous bindings for the slots being rewritten. */
   shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);

   for (unsigned i = 0; i < count; i++) {
      struct crocus_image_view *iv = &shs->image[start_slot + i];

      if (p_images && p_images[i].resource) {
         const struct pipe_image_view *img = &p_images[i];
         struct crocus_resource *res = (void *) img->resource;

         util_copy_image_view(&iv->base, img);

         shs->bound_image_views |= 1 << (start_slot + i);

         /* Track how and where this resource has been bound. */
         res->bind_history |= PIPE_BIND_SHADER_IMAGE;
         res->bind_stages |= 1 << stage;

         isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
         struct crocus_format_info fmt =
            crocus_format_for_usage(devinfo, img->format, usage);

         struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
         if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
            /* For readable images, use a typed-read-capable format when
             * one exists; otherwise fall back to untyped (RAW) reads.
             */
            if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
               fmt.fmt = ISL_FORMAT_RAW;
            else
               fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
         }

         if (res->base.b.target != PIPE_BUFFER) {
            /* Texture image: view of the requested level / layer range. */
            struct isl_view view = {
               .format = fmt.fmt,
               .base_level = img->u.tex.level,
               .levels = 1,
               .base_array_layer = img->u.tex.first_layer,
               .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
               .swizzle = swiz,
               .usage = usage,
            };

            iv->view = view;

            /* Derive the shader-visible image params from the surface. */
            isl_surf_fill_image_param(&screen->isl_dev,
                                      &image_params[start_slot + i],
                                      &res->surf, &view);
         } else {
            /* Buffer image: no level/layer dimensions. */
            struct isl_view view = {
               .format = fmt.fmt,
               .swizzle = swiz,
               .usage = usage,
            };
            iv->view = view;

            /* Mark the bound byte range of the buffer as valid. */
            util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
                           img->u.buf.offset + img->u.buf.size);
            fill_buffer_image_param(&image_params[start_slot + i],
                                    img->format, img->u.buf.size);
         }
      } else {
         /* Unbind the slot and reset its image params. */
         pipe_resource_reference(&iv->base.resource, NULL);
         fill_default_image_param(&image_params[start_slot + i]);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   /* Broadwell also needs brw_image_params re-uploaded */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
   shs->sysvals_need_upload = true;
#endif
}
3141
3142
3143/**
3144 * The pipe->set_sampler_views() driver hook.
3145 */
3146static void
3147crocus_set_sampler_views(struct pipe_context *ctx,
3148                         enum pipe_shader_type p_stage,
3149                         unsigned start, unsigned count,
3150                         unsigned unbind_num_trailing_slots,
3151                         bool take_ownership,
3152                         struct pipe_sampler_view **views)
3153{
3154   struct crocus_context *ice = (struct crocus_context *) ctx;
3155   gl_shader_stage stage = stage_from_pipe(p_stage);
3156   struct crocus_shader_state *shs = &ice->state.shaders[stage];
3157
3158   shs->bound_sampler_views &= ~u_bit_consecutive(start, count);
3159
3160   for (unsigned i = 0; i < count; i++) {
3161      struct pipe_sampler_view *pview = views ? views[i] : NULL;
3162
3163      if (take_ownership) {
3164         pipe_sampler_view_reference((struct pipe_sampler_view **)
3165                                     &shs->textures[start + i], NULL);
3166         shs->textures[start + i] = (struct crocus_sampler_view *)pview;
3167      } else {
3168         pipe_sampler_view_reference((struct pipe_sampler_view **)
3169                                     &shs->textures[start + i], pview);
3170      }
3171
3172      struct crocus_sampler_view *view = (void *) pview;
3173      if (view) {
3174         view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3175         view->res->bind_stages |= 1 << stage;
3176
3177         shs->bound_sampler_views |= 1 << (start + i);
3178      }
3179   }
3180#if GFX_VER == 6
3181   /* first level parameters to crocus_upload_sampler_state is gfx6 only */
3182   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3183#endif
3184   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
3185   ice->state.dirty |=
3186      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3187                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3188   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
3189}
3190
3191/**
3192 * The pipe->set_tess_state() driver hook.
3193 */
3194static void
3195crocus_set_tess_state(struct pipe_context *ctx,
3196                      const float default_outer_level[4],
3197                      const float default_inner_level[2])
3198{
3199   struct crocus_context *ice = (struct crocus_context *) ctx;
3200   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3201
3202   memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3203   memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3204
3205   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3206   shs->sysvals_need_upload = true;
3207}
3208
3209static void
3210crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3211{
3212   struct crocus_context *ice = (struct crocus_context *) ctx;
3213
3214   ice->state.patch_vertices = patch_vertices;
3215}
3216
3217static void
3218crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3219{
3220   struct crocus_surface *surf = (void *) p_surf;
3221   pipe_resource_reference(&p_surf->texture, NULL);
3222
3223   pipe_resource_reference(&surf->align_res, NULL);
3224   free(surf);
3225}
3226
3227static void
3228crocus_set_clip_state(struct pipe_context *ctx,
3229                      const struct pipe_clip_state *state)
3230{
3231   struct crocus_context *ice = (struct crocus_context *) ctx;
3232   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3233   struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3234   struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3235
3236   memcpy(&ice->state.clip_planes, state, sizeof(*state));
3237
3238#if GFX_VER <= 5
3239   ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3240#endif
3241   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3242                             CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3243   shs->sysvals_need_upload = true;
3244   gshs->sysvals_need_upload = true;
3245   tshs->sysvals_need_upload = true;
3246}
3247
3248/**
3249 * The pipe->set_polygon_stipple() driver hook.
3250 */
3251static void
3252crocus_set_polygon_stipple(struct pipe_context *ctx,
3253                           const struct pipe_poly_stipple *state)
3254{
3255   struct crocus_context *ice = (struct crocus_context *) ctx;
3256   memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3257   ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3258}
3259
3260/**
3261 * The pipe->set_sample_mask() driver hook.
3262 */
3263static void
3264crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3265{
3266   struct crocus_context *ice = (struct crocus_context *) ctx;
3267
3268   /* We only support 16x MSAA, so we have 16 bits of sample maks.
3269    * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3270    */
3271   ice->state.sample_mask = sample_mask & 0xff;
3272   ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
3273}
3274
3275static void
3276crocus_fill_scissor_rect(struct crocus_context *ice,
3277                         int idx,
3278                         struct pipe_scissor_state *ss)
3279{
3280   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
3281   struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
3282   const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
3283   struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
3284      .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
3285      .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
3286      .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
3287      .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
3288   };
3289   if (cso_state->scissor) {
3290      struct pipe_scissor_state *s = &ice->state.scissors[idx];
3291      scissor.minx = MAX2(scissor.minx, s->minx);
3292      scissor.miny = MAX2(scissor.miny, s->miny);
3293      scissor.maxx = MIN2(scissor.maxx, s->maxx);
3294      scissor.maxy = MIN2(scissor.maxy, s->maxy);
3295   }
3296   *ss = scissor;
3297}
3298
3299/**
3300 * The pipe->set_scissor_states() driver hook.
3301 *
3302 * This corresponds to our SCISSOR_RECT state structures.  It's an
3303 * exact match, so we just store them, and memcpy them out later.
3304 */
3305static void
3306crocus_set_scissor_states(struct pipe_context *ctx,
3307                          unsigned start_slot,
3308                          unsigned num_scissors,
3309                          const struct pipe_scissor_state *rects)
3310{
3311   struct crocus_context *ice = (struct crocus_context *) ctx;
3312
3313   for (unsigned i = 0; i < num_scissors; i++) {
3314      if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3315         /* If the scissor was out of bounds and got clamped to 0 width/height
3316          * at the bounds, the subtraction of 1 from maximums could produce a
3317          * negative number and thus not clip anything.  Instead, just provide
3318          * a min > max scissor inside the bounds, which produces the expected
3319          * no rendering.
3320          */
3321         ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3322            .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3323         };
3324      } else {
3325         ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3326            .minx = rects[i].minx,     .miny = rects[i].miny,
3327            .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3328         };
3329      }
3330   }
3331
3332#if GFX_VER < 6
3333   ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3334#else
3335   ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3336#endif
3337   ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3338
3339}
3340
3341/**
3342 * The pipe->set_stencil_ref() driver hook.
3343 *
3344 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3345 */
3346static void
3347crocus_set_stencil_ref(struct pipe_context *ctx,
3348                       const struct pipe_stencil_ref ref)
3349{
3350   struct crocus_context *ice = (struct crocus_context *) ctx;
3351   ice->state.stencil_ref = ref;
3352   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
3353}
3354
#if GFX_VER == 8
/* One edge of the viewport along <axis>: translate plus or minus |scale|,
 * with the edge selected by the sign argument.
 */
static float
viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
{
   return state->translate[axis] + copysignf(state->scale[axis], sign);
}
#endif
3362
3363/**
3364 * The pipe->set_viewport_states() driver hook.
3365 *
3366 * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
3367 * the guardband yet, as we need the framebuffer dimensions, but we can
3368 * at least fill out the rest.
3369 */
3370static void
3371crocus_set_viewport_states(struct pipe_context *ctx,
3372                           unsigned start_slot,
3373                           unsigned count,
3374                           const struct pipe_viewport_state *states)
3375{
3376   struct crocus_context *ice = (struct crocus_context *) ctx;
3377
3378   memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3379
3380   ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3381   ice->state.dirty |= CROCUS_DIRTY_RASTER;
3382#if GFX_VER >= 6
3383   ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3384#endif
3385
3386   if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
3387                               !ice->state.cso_rast->cso.depth_clip_far))
3388      ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
3389}
3390
3391/**
3392 * The pipe->set_framebuffer_state() driver hook.
3393 *
3394 * Sets the current draw FBO, including color render targets, depth,
3395 * and stencil buffers.
3396 */
3397static void
3398crocus_set_framebuffer_state(struct pipe_context *ctx,
3399                             const struct pipe_framebuffer_state *state)
3400{
3401   struct crocus_context *ice = (struct crocus_context *) ctx;
3402   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3403   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3404   const struct intel_device_info *devinfo = &screen->devinfo;
3405#if 0
3406   struct isl_device *isl_dev = &screen->isl_dev;
3407   struct crocus_resource *zres;
3408   struct crocus_resource *stencil_res;
3409#endif
3410
3411   unsigned samples = util_framebuffer_get_num_samples(state);
3412   unsigned layers = util_framebuffer_get_num_layers(state);
3413
3414#if GFX_VER >= 6
3415   if (cso->samples != samples) {
3416      ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
3417      ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
3418      ice->state.dirty |= CROCUS_DIRTY_RASTER;
3419#if GFX_VERx10 == 75
3420      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
3421#endif
3422   }
3423#endif
3424
3425#if GFX_VER >= 6 && GFX_VER < 8
3426   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
3427#endif
3428
3429   if ((cso->layers == 0) != (layers == 0)) {
3430      ice->state.dirty |= CROCUS_DIRTY_CLIP;
3431   }
3432
3433   if (cso->width != state->width || cso->height != state->height) {
3434      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3435      ice->state.dirty |= CROCUS_DIRTY_RASTER;
3436      ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
3437#if GFX_VER >= 6
3438      ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3439#endif
3440   }
3441
3442   if (cso->zsbuf || state->zsbuf) {
3443      ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
3444
3445      /* update SF's depth buffer format */
3446      if (GFX_VER == 7 && cso->zsbuf)
3447         ice->state.dirty |= CROCUS_DIRTY_RASTER;
3448   }
3449
3450   /* wm thread dispatch enable */
3451   ice->state.dirty |= CROCUS_DIRTY_WM;
3452   util_copy_framebuffer_state(cso, state);
3453   cso->samples = samples;
3454   cso->layers = layers;
3455
3456   if (cso->zsbuf) {
3457      struct crocus_resource *zres;
3458      struct crocus_resource *stencil_res;
3459      enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
3460      crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
3461                                         &stencil_res);
3462      if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
3463         aux_usage = zres->aux.usage;
3464      }
3465      ice->state.hiz_usage = aux_usage;
3466   }
3467
3468   /* Render target change */
3469   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
3470
3471   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3472
3473   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
3474}
3475
3476/**
3477 * The pipe->set_constant_buffer() driver hook.
3478 *
3479 * This uploads any constant data in user buffers, and references
3480 * any UBO resources containing constant data.
3481 */
static void
crocus_set_constant_buffer(struct pipe_context *ctx,
                           enum pipe_shader_type p_stage, unsigned index,
                           bool take_ownership,
                           const struct pipe_constant_buffer *input)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   /* Note: cbuf aliases shs->constbufs[index]; the copy below fills it in,
    * and the user-buffer path then rewrites its buffer/offset fields.
    */
   struct pipe_constant_buffer *cbuf = &shs->constbufs[index];

   util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);

   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
      shs->bound_cbufs |= 1u << index;

      if (input->user_buffer) {
         /* User-memory constants: copy them into a GPU-visible upload
          * buffer, replacing the reference util_copy_constant_buffer took.
          */
         void *map = NULL;
         pipe_resource_reference(&cbuf->buffer, NULL);
         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

         if (!cbuf->buffer) {
            /* Allocation was unsuccessful - just unbind */
            crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
            return;
         }

         assert(map);
         memcpy(map, input->user_buffer, input->buffer_size);
      }
      /* Clamp the bound range so it never extends past the end of the BO. */
      cbuf->buffer_size =
         MIN2(input->buffer_size,
              crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);

      struct crocus_resource *res = (void *) cbuf->buffer;
      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
      res->bind_stages |= 1 << stage;
   } else {
      shs->bound_cbufs &= ~(1u << index);
   }

   /* The per-stage CONSTANTS bits are laid out consecutively from VS. */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
}
3526
/* Upload the current values of a shader's system values (clip planes,
 * default tess levels, patch vertex counts, ...) into the constant buffer
 * slot the compiler reserved for them.
 */
static void
upload_sysvals(struct crocus_context *ice,
               gl_shader_stage stage)
{
   UNUSED struct crocus_genx_state *genx = ice->state.genx;
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   if (!shader || shader->num_system_values == 0)
      return;

   assert(shader->num_cbufs > 0);

   /* System values live in the last constant buffer slot. */
   unsigned sysval_cbuf_index = shader->num_cbufs - 1;
   struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
   unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
   uint32_t *map = NULL;

   assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
   /* NOTE(review): map is not checked after u_upload_alloc; an allocation
    * failure would crash in the loop below — presumably treated as fatal.
    */
   u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
                  &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

   /* Decode each BRW_PARAM_* token and write its current 32-bit value. */
   for (int i = 0; i < shader->num_system_values; i++) {
      uint32_t sysval = shader->system_values[i];
      uint32_t value = 0;

      if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
#if GFX_VER >= 7
         /* Image params are stored per-stage in the genx state; the token
          * encodes the image index and the dword offset within the struct.
          */
         unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
         unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
         struct brw_image_param *param =
            &genx->shaders[stage].image_param[img];

         assert(offset < sizeof(struct brw_image_param));
         value = ((uint32_t *) param)[offset];
#endif
      } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
         value = 0;
      } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
         /* One component of one user clip plane, as a float bit pattern. */
         int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
         int comp  = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
         value = fui(ice->state.clip_planes.ucp[plane][comp]);
      } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
         if (stage == MESA_SHADER_TESS_CTRL) {
            value = ice->state.vertices_per_patch;
         } else {
            /* For TES, prefer the TCS's declared output vertex count; fall
             * back to the API patch size when there is no TCS.
             */
            assert(stage == MESA_SHADER_TESS_EVAL);
            const struct shader_info *tcs_info =
               crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
            if (tcs_info)
               value = tcs_info->tess.tcs_vertices_out;
            else
               value = ice->state.vertices_per_patch;
         }
      } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
                 sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
         unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
         value = fui(ice->state.default_outer_level[i]);
      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
         value = fui(ice->state.default_inner_level[0]);
      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
         value = fui(ice->state.default_inner_level[1]);
      } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
                 sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
         unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
         value = ice->state.last_block[i];
      } else {
         assert(!"unhandled system value");
      }

      *map++ = value;
   }

   cbuf->buffer_size = upload_size;
   shs->sysvals_need_upload = false;
}
3603
3604/**
3605 * The pipe->set_shader_buffers() driver hook.
3606 *
3607 * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
3608 * SURFACE_STATE here, as the buffer offset may change each time.
3609 */
static void
crocus_set_shader_buffers(struct pipe_context *ctx,
                          enum pipe_shader_type p_stage,
                          unsigned start_slot, unsigned count,
                          const struct pipe_shader_buffer *buffers,
                          unsigned writable_bitmask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   /* Clear the affected slots up front; bound bits are re-set per buffer
    * below, and the writable mask is rebuilt from the caller's bitmask.
    */
   unsigned modified_bits = u_bit_consecutive(start_slot, count);

   shs->bound_ssbos &= ~modified_bits;
   shs->writable_ssbos &= ~modified_bits;
   shs->writable_ssbos |= writable_bitmask << start_slot;

   for (unsigned i = 0; i < count; i++) {
      if (buffers && buffers[i].buffer) {
         struct crocus_resource *res = (void *) buffers[i].buffer;
         struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
         pipe_resource_reference(&ssbo->buffer, &res->base.b);
         ssbo->buffer_offset = buffers[i].buffer_offset;
         /* Clamp the range so it never extends past the end of the BO. */
         ssbo->buffer_size =
            MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);

         shs->bound_ssbos |= 1 << (start_slot + i);

         res->bind_history |= PIPE_BIND_SHADER_BUFFER;
         res->bind_stages |= 1 << stage;

         /* Mark the bound range as containing valid (written) data. */
         util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
                        ssbo->buffer_offset + ssbo->buffer_size);
      } else {
         /* Unbind: drop the reference held by this slot. */
         pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
      }
   }

   /* The per-stage BINDINGS bits are laid out consecutively from VS. */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
}
3650
/* Generic CSO destructor for state objects allocated with plain malloc. */
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
   free(state);
}
3656
3657/**
3658 * The pipe->set_vertex_buffers() driver hook.
3659 *
3660 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3661 */
static void
crocus_set_vertex_buffers(struct pipe_context *ctx,
                          unsigned start_slot, unsigned count,
                          unsigned unbind_num_trailing_slots,
                          bool take_ownership,
                          const struct pipe_vertex_buffer *buffers)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
   /* 2 bytes of end-address padding on pre-Haswell (except Baytrail) —
    * NOTE(review): presumably a vertex-fetch over-read workaround; confirm
    * against the hardware docs.
    */
   const unsigned padding =
      (GFX_VERx10 < 75 && screen->devinfo.platform != INTEL_PLATFORM_BYT) * 2;
   ice->state.bound_vertex_buffers &=
      ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);

   /* The util helper handles reference counting, ownership transfer, and
    * clearing the trailing unbound slots.
    */
   util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
                                buffers, start_slot, count, unbind_num_trailing_slots,
                                take_ownership);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_vertex_buffer *state =
         &ice->state.vertex_buffers[start_slot + i];

      if (!state->is_user_buffer && state->buffer.resource) {
         struct crocus_resource *res = (void *)state->buffer.resource;
         res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
      }

      /* Cache the (padded) end address for 3DSTATE_VERTEX_BUFFERS emission. */
      uint32_t end = 0;
      if (state->buffer.resource)
         end = state->buffer.resource->width0 + padding;
      ice->state.vb_end[start_slot + i] = end;
   }
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
}
3696
3697#if GFX_VERx10 < 75
3698static uint8_t get_wa_flags(enum isl_format format)
3699{
3700   uint8_t wa_flags = 0;
3701
3702   switch (format) {
3703   case ISL_FORMAT_R10G10B10A2_USCALED:
3704      wa_flags = BRW_ATTRIB_WA_SCALE;
3705      break;
3706   case ISL_FORMAT_R10G10B10A2_SSCALED:
3707      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
3708      break;
3709   case ISL_FORMAT_R10G10B10A2_UNORM:
3710      wa_flags = BRW_ATTRIB_WA_NORMALIZE;
3711      break;
3712   case ISL_FORMAT_R10G10B10A2_SNORM:
3713      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
3714      break;
3715   case ISL_FORMAT_R10G10B10A2_SINT:
3716      wa_flags = BRW_ATTRIB_WA_SIGN;
3717      break;
3718   case ISL_FORMAT_B10G10R10A2_USCALED:
3719      wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3720      break;
3721   case ISL_FORMAT_B10G10R10A2_SSCALED:
3722      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3723      break;
3724   case ISL_FORMAT_B10G10R10A2_UNORM:
3725      wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3726      break;
3727   case ISL_FORMAT_B10G10R10A2_SNORM:
3728      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3729      break;
3730   case ISL_FORMAT_B10G10R10A2_SINT:
3731      wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
3732      break;
3733   case ISL_FORMAT_B10G10R10A2_UINT:
3734      wa_flags = BRW_ATTRIB_WA_BGRA;
3735      break;
3736   default:
3737      break;
3738   }
3739   return wa_flags;
3740}
3741#endif
3742
3743/**
3744 * Gallium CSO for vertex elements.
3745 */
struct crocus_vertex_element_state {
   /* Pre-packed 3DSTATE_VERTEX_ELEMENTS: 1 header DWord + up to 33 VEs. */
   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   /* Pre-packed 3DSTATE_VF_INSTANCING, one per vertex element. */
   uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Alternate last VE used when the VS consumes EdgeFlag (see
    * crocus_create_vertex_elements).
    */
   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Instance divisor per vertex buffer slot. */
   uint32_t step_rate[16];
   /* Pre-Haswell attribute workaround flags, per element (see get_wa_flags). */
   uint8_t wa_flags[33];
   /* Number of vertex elements in use. */
   unsigned count;
};
3759
3760/**
3761 * The pipe->create_vertex_elements() driver hook.
3762 *
3763 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3764 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3765 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3766 * needed. In these cases we will need information available at draw time.
3767 * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3768 * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3769 * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3770 */
static void *
crocus_create_vertex_elements(struct pipe_context *ctx,
                              unsigned count,
                              const struct pipe_vertex_element *state)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   /* NOTE(review): malloc result is not NULL-checked before use. */
   struct crocus_vertex_element_state *cso =
      malloc(sizeof(struct crocus_vertex_element_state));

   cso->count = count;

   /* Pack the 3DSTATE_VERTEX_ELEMENTS header.  The length covers
    * MAX2(count, 1) elements because at least one VE must be emitted.
    */
   crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
      ve.DWordLength =
         1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
   }

   uint32_t *ve_pack_dest = &cso->vertex_elements[1];
#if GFX_VER == 8
   uint32_t *vfi_pack_dest = cso->vf_instancing;
#endif

   /* With no elements, emit a single dummy VE storing (0, 0, 0, 1). */
   if (count == 0) {
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
         ve.Valid = true;
         ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
         ve.Component0Control = VFCOMP_STORE_0;
         ve.Component1Control = VFCOMP_STORE_0;
         ve.Component2Control = VFCOMP_STORE_0;
         ve.Component3Control = VFCOMP_STORE_1_FP;
      }
#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
      }
#endif
   }

   for (int i = 0; i < count; i++) {
      const struct crocus_format_info fmt =
         crocus_format_for_usage(devinfo, state[i].src_format, 0);
      unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
                           VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
      enum isl_format actual_fmt = fmt.fmt;

#if GFX_VERx10 < 75
      /* Pre-Haswell can't fetch these formats natively; substitute a format
       * the hardware can read and record shader-side fixups in wa_flags.
       */
      cso->wa_flags[i] = get_wa_flags(fmt.fmt);

      if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
          fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
          fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
         actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
      if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
         actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
      if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
         actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
      if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
         actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
      if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
         actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
#endif

      /* Record the instance divisor per vertex buffer slot. */
      cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;

      /* Missing trailing components read 0, and the last one reads 1
       * (integer or float form depending on the format).  Intentional
       * fallthrough fills all components past the format's channel count.
       */
      switch (isl_format_get_num_channels(fmt.fmt)) {
      case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
      case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
      case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
      case 3:
         comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
            : VFCOMP_STORE_1_FP;
         break;
      }
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
#if GFX_VER >= 6
         ve.EdgeFlagEnable = false;
#endif
         ve.VertexBufferIndex = state[i].vertex_buffer_index;
         ve.Valid = true;
         ve.SourceElementOffset = state[i].src_offset;
         ve.SourceElementFormat = actual_fmt;
         ve.Component0Control = comp[0];
         ve.Component1Control = comp[1];
         ve.Component2Control = comp[2];
         ve.Component3Control = comp[3];
#if GFX_VER < 5
         ve.DestinationElementOffset = i * 4;
#endif
      }

#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
         vi.VertexElementIndex = i;
         vi.InstancingEnable = state[i].instance_divisor > 0;
         vi.InstanceDataStepRate = state[i].instance_divisor;
      }
#endif
      ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
#if GFX_VER == 8
      vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
#endif
   }

   /* An alternative version of the last VE and VFI is stored so it
    * can be used at draw time in case Vertex Shader uses EdgeFlag
    */
   if (count) {
      const unsigned edgeflag_index = count - 1;
      const struct crocus_format_info fmt =
         crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
#if GFX_VER >= 6
         ve.EdgeFlagEnable = true;
#endif
         ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
         ve.Valid = true;
         ve.SourceElementOffset = state[edgeflag_index].src_offset;
         ve.SourceElementFormat = fmt.fmt;
         /* EdgeFlag only consumes the first component. */
         ve.Component0Control = VFCOMP_STORE_SRC;
         ve.Component1Control = VFCOMP_STORE_0;
         ve.Component2Control = VFCOMP_STORE_0;
         ve.Component3Control = VFCOMP_STORE_0;
      }
#if GFX_VER == 8
      crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
         /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
          * at draw time, as it should change if SGVs are emitted.
          */
         vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
         vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
      }
#endif
   }

   return cso;
}
3914
3915/**
3916 * The pipe->bind_vertex_elements_state() driver hook.
3917 */
static void
crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
#if GFX_VER == 8
   /* The cso_changed() macro expands in terms of the old_cso/new_cso locals. */
   struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
   struct crocus_vertex_element_state *new_cso = state;

   /* A change in element count can affect the SGV element placement. */
   if (new_cso && cso_changed(count))
      ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
#endif
   ice->state.cso_vertex_elements = state;
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
}
3933
3934#if GFX_VER >= 6
struct crocus_streamout_counter {
   /* Byte range of unaccumulated begin/end snapshot pairs in prim_map. */
   uint32_t offset_start;
   uint32_t offset_end;

   /* Running total of primitives written, folded in by
    * aggregate_stream_counter().
    */
   uint64_t accum;
};
3941
3942/**
3943 * Gallium CSO for stream output (transform feedback) targets.
3944 */
struct crocus_stream_output_target {
   struct pipe_stream_output_target base;

   /** Stride (bytes-per-vertex) during this transform feedback operation */
   uint16_t stride;

   /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
   bool zeroed;

   /* Buffer holding either the SO write offset (Gen7+) or primitive
    * counter snapshots (Gen6), at offset_offset bytes in.
    */
   struct crocus_resource *offset_res;
   uint32_t offset_offset;

#if GFX_VER == 6
   /* CPU mapping of the counter snapshot buffer. */
   void *prim_map;
   struct crocus_streamout_counter prev_count;
   struct crocus_streamout_counter count;
#endif
#if GFX_VER == 8
   /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
   bool zero_offset;
#endif
};
3967
#if GFX_VER >= 7
/* Read back the streamout write offset (in bytes) that the hardware saved
 * for this target, and convert it to a vertex count using the stride.
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct pipe_transfer *xfer;
   struct pipe_box box;

   u_box_1d(tgt->offset_offset, 4, &box);
   uint32_t *mapped = so->context->buffer_map(so->context,
                                              &tgt->offset_res->base.b,
                                              0, PIPE_MAP_DIRECTLY,
                                              &box, &xfer);
   assert(mapped);
   uint32_t bytes_written = *mapped;
   so->context->buffer_unmap(so->context, xfer);

   return bytes_written / tgt->stride;
}
#endif
3987
#if GFX_VER == 6
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *count,
                                uint64_t *svbi);

/* Gen6 has no SO_WRITE_OFFSET registers, so derive the vertex position from
 * the accumulated primitives-written counter snapshots instead.
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct crocus_context *ice = (void *)so->context;

   uint64_t vert_written;
   compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
   return vert_written;
}
#endif
4006
4007/**
4008 * The pipe->create_stream_output_target() driver hook.
4009 *
4010 * "Target" here refers to a destination buffer.  We translate this into
4011 * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
4012 * know which buffer this represents, or whether we ought to zero the
4013 * write-offsets, or append.  Those are handled in the set() hook.
4014 */
static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context *ctx,
                                   struct pipe_resource *p_res,
                                   unsigned buffer_offset,
                                   unsigned buffer_size)
{
   struct crocus_resource *res = (void *) p_res;
   struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
   if (!cso)
      return NULL;

   res->bind_history |= PIPE_BIND_STREAM_OUTPUT;

   pipe_reference_init(&cso->base.reference, 1);
   pipe_resource_reference(&cso->base.buffer, p_res);
   cso->base.buffer_offset = buffer_offset;
   cso->base.buffer_size = buffer_size;
   cso->base.context = ctx;

   /* Streamout will write this whole range; mark it valid up front. */
   util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
#if GFX_VER >= 7
   /* Allocate a 4-byte slot to hold the saved SO_WRITE_OFFSET value; the
    * mapped pointer isn't needed here, only the buffer and offset.
    */
   struct crocus_context *ice = (struct crocus_context *) ctx;
   void *temp;
   u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
                  &cso->offset_offset,
                  (struct pipe_resource **)&cso->offset_res,
                  &temp);
#endif

   return &cso->base;
}
4047
4048static void
4049crocus_stream_output_target_destroy(struct pipe_context *ctx,
4050                                    struct pipe_stream_output_target *state)
4051{
4052   struct crocus_stream_output_target *cso = (void *) state;
4053
4054   pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4055   pipe_resource_reference(&cso->base.buffer, NULL);
4056
4057   free(cso);
4058}
4059
/* MMIO register addresses used for streamout bookkeeping. */
#define GEN6_SO_NUM_PRIMS_WRITTEN       0x2288
#define GEN7_SO_WRITE_OFFSET(n)         (0x5280 + (n) * 4)
4062
4063#if GFX_VER == 6
/* Fold the pending begin/end counter snapshot pairs for <tgt> into
 * counter->accum, waiting for the GPU if it may still be writing them.
 */
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
                         struct crocus_streamout_counter *counter)
{
   uint64_t *prim_counts = tgt->prim_map;

   /* If the batch still references the snapshot BO, flush and wait so the
    * CPU reads below see the final values.
    */
   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
      struct pipe_fence_handle *out_fence = NULL;
      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
   }

   /* Snapshots come in (begin, end) uint64 pairs; accumulate the deltas. */
   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
      counter->accum += prim_counts[i + 1] - prim_counts[i];
   }
   tgt->count.offset_start = tgt->count.offset_end = 0;
}
4082
/**
 * Snapshot the GEN6_SO_NUM_PRIMS_WRITTEN register into the target's
 * counter buffer, extending the current (begin, end) snapshot window.
 *
 * Gen6 tracks primitives written in software via pairs of these register
 * snapshots (see aggregate_stream_counter).
 */
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
                                  struct crocus_stream_output_target *tgt)
{
   /* Lazily allocate a 4096-byte CPU-mapped buffer to hold the snapshots. */
   if (!tgt->offset_res) {
      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
                     &tgt->offset_offset,
                     (struct pipe_resource **)&tgt->offset_res,
                     &tgt->prim_map);
      tgt->count.offset_start = tgt->count.offset_end = 0;
   }

   /* If there's no room left for another (begin, end) pair of uint64_t
    * values, fold the existing snapshots into the accumulators so the
    * buffer can be reused from offset 0.
    */
   if (tgt->count.offset_end + 16 >= 4096) {
      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
      aggregate_stream_counter(batch, tgt, &tgt->count);
   }

   /* Flush so the register reflects all prior work, then record it. */
   crocus_emit_mi_flush(batch);
   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
                               tgt->offset_res->bo,
                               tgt->count.offset_end + tgt->offset_offset, false);
   tgt->count.offset_end += 8;
}
4106
/**
 * Compute the streamed-vertex index (SVBI) to resume transform feedback at:
 * the aggregated primitive count for this counter multiplied by the vertex
 * count per primitive of the last transform feedback draw.
 */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *counter,
                                uint64_t *svbi)
{
   //TODO vertices per prim
   aggregate_stream_counter(&ice->batches[0], tgt, counter);

   *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
4118#endif
4119/**
4120 * The pipe->set_stream_output_targets() driver hook.
4121 *
4122 * At this point, we know which targets are bound to a particular index,
4123 * and also whether we want to append or start over.  We can finish the
4124 * 3DSTATE_SO_BUFFER packets we started earlier.
4125 */
static void
crocus_set_stream_output_targets(struct pipe_context *ctx,
                                 unsigned num_targets,
                                 struct pipe_stream_output_target **targets,
                                 const unsigned *offsets)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
   struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
   const bool active = num_targets > 0;
   /* Turning streamout on or off flags the relevant generation-specific
    * state for re-emission.
    */
   if (ice->state.streamout_active != active) {
      ice->state.streamout_active = active;
#if GFX_VER >= 7
      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#else
      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif

      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
       * it's a non-pipelined command.  If we're switching streamout on, we
       * may have missed emitting it earlier, so do so now.  (We're already
       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
       */
      if (active) {
#if GFX_VER >= 7
         ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
#endif
      } else {
         /* Streamout is being switched off: flush so later reads of the
          * written buffers (and any dirty bindings) see the results.
          */
         uint32_t flush = 0;
         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];
            if (tgt) {
               struct crocus_resource *res = (void *) tgt->base.buffer;

               flush |= crocus_flush_bits_for_history(res);
               crocus_dirty_for_history(ice, res);
            }
         }
         crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
                                        "make streamout results visible", flush);
      }
   }

   /* Swap in the new target references, keeping the old ones alive in
    * old_tgt[] so their write offsets / counters can be saved below.
    */
   ice->state.so_targets = num_targets;
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
      pipe_so_target_reference(&ice->state.so_target[i],
                               i < num_targets ? targets[i] : NULL);
   }

   /* Gen6 has no SO write-offset registers: primitives written are tracked
    * in software via register snapshots (crocus_stream_store_prims_written),
    * and the resume position is programmed through SVBI.
    */
#if GFX_VER == 6
   bool stored_num_prims = false;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (!tgt)
            continue;
         if (offsets[i] == 0) {
            // This means that we're supposed to ignore anything written to
            // the buffer before. We can do this by just clearing out the
            // count of writes to the prim count buffer.
            tgt->count.offset_start = tgt->count.offset_end;
            tgt->count.accum = 0;
            ice->state.svbi = 0;
         } else {
            /* Appending: recover the vertex count written so far so the
             * SVBI can resume from it.
             */
            if (tgt->offset_res) {
               compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
               tgt->count.offset_start = tgt->count.offset_end;
            }
         }

         if (!stored_num_prims) {
            crocus_stream_store_prims_written(batch, tgt);
            stored_num_prims = true;
         }
      } else {
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt) {
            if (!stored_num_prims) {
               crocus_stream_store_prims_written(batch, tgt);
               stored_num_prims = true;
            }

            if (tgt->offset_res) {
               tgt->prev_count = tgt->count;
            }
         }
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
   /* Gen7+: program the hardware SO_WRITE_OFFSET registers directly —
    * reset to 0, reload a saved offset when appending, or save the final
    * offset of an outgoing target for a later resume.
    */
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (offsets[i] == 0) {
#if GFX_VER == 8
            if (tgt)
               tgt->zero_offset = true;
#endif
            crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
         }
         else if (tgt)
            crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                       tgt->offset_res->bo,
                                       tgt->offset_offset);
      } else {
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt)
            crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                        tgt->offset_res->bo,
                                        tgt->offset_offset, false);
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
#endif
   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
   if (!active)
      return;
#if GFX_VER >= 7
   ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#elif GFX_VER == 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
#endif
}
4258
4259#endif
4260
4261#if GFX_VER >= 7
4262/**
4263 * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4264 * 3DSTATE_STREAMOUT packets.
4265 *
4266 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4267 * hardware to record.  We can create it entirely based on the shader, with
4268 * no dynamic state dependencies.
4269 *
4270 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4271 * state-based settings.  We capture the shader-related ones here, and merge
4272 * the rest in at draw time.
4273 */
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
                           const struct brw_vue_map *vue_map)
{
   struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
   int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      const int buffer = output->output_buffer;
      const int varying = output->register_index;
      const unsigned stream_id = output->stream;
      assert(stream_id < PIPE_MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->dst_offset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->output_buffer,
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->dst_offset + output->num_components;

      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->output_buffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->num_components) - 1) << output->start_component,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* 3DSTATE_SO_DECL_LIST is 3 header DWords plus one DWord pair per entry.
    * The map is ralloc'd with a NULL context; the caller owns it.
    */
   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);

   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      sol.Stream0VertexReadOffset = urb_entry_read_offset;
      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
      sol.Stream1VertexReadOffset = urb_entry_read_offset;
      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
      sol.Stream2VertexReadOffset = urb_entry_read_offset;
      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
      sol.Stream3VertexReadOffset = urb_entry_read_offset;
      sol.Stream3VertexReadLength = urb_entry_read_length - 1;

      // TODO: Double-check that stride == 0 means no buffer. Probably this
      // needs to go elsewhere, where the buffer enable stuff is actually
      // known.
#if GFX_VER < 8
      sol.SOBufferEnable0 = !!info->stride[0];
      sol.SOBufferEnable1 = !!info->stride[1];
      sol.SOBufferEnable2 = !!info->stride[2];
      sol.SOBufferEnable3 = !!info->stride[3];
#else
      /* Set buffer pitches; 0 means unbound. */
      sol.Buffer0SurfacePitch = 4 * info->stride[0];
      sol.Buffer1SurfacePitch = 4 * info->stride[1];
      sol.Buffer2SurfacePitch = 4 * info->stride[2];
      sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
   }

   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
      list.DWordLength = 3 + 2 * max_decls - 2;
      list.StreamtoBufferSelects0 = buffer_mask[0];
      list.StreamtoBufferSelects1 = buffer_mask[1];
      list.StreamtoBufferSelects2 = buffer_mask[2];
      list.StreamtoBufferSelects3 = buffer_mask[3];
      list.NumEntries0 = decls[0];
      list.NumEntries1 = decls[1];
      list.NumEntries2 = decls[2];
      list.NumEntries3 = decls[3];
   }

   /* Each entry packs one SO_DECL per stream into a DWord pair. */
   for (int i = 0; i < max_decls; i++) {
      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
         entry.Stream0Decl = so_decl[0][i];
         entry.Stream1Decl = so_decl[1][i];
         entry.Stream2Decl = so_decl[2][i];
         entry.Stream3Decl = so_decl[3][i];
      }
   }

   return map;
}
4397#endif
4398
4399#if GFX_VER == 6
/**
 * Emit the Gen6 3DSTATE_GS_SVB_INDEX commands for streamout.
 *
 * SVBI 0 carries the real streamed-vertex index (restored from the
 * software-tracked primitive counts in ice->state.svbi); its MaximumIndex
 * is clamped to the smallest bound buffer so writes stop when any buffer
 * fills.  The remaining indices are unused but must be initialized.
 */
static void
crocus_emit_so_svbi(struct crocus_context *ice)
{
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];

   /* Clamp to the capacity (in vertices) of the smallest bound target. */
   unsigned max_vertex = 0xffffffff;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      struct crocus_stream_output_target *tgt =
         (void *) ice->state.so_target[i];
      if (tgt)
         max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
   }

   crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
      svbi.IndexNumber = 0;
      svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
      svbi.MaximumIndex = max_vertex;
   }

   /* initialize the rest of the SVBI's to reasonable values so that we don't
    * run out of room writing the regular data.
    */
   for (int i = 1; i < 4; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
         svbi.IndexNumber = i;
         svbi.StreamedVertexBufferIndex = 0;
         svbi.MaximumIndex = 0xffffffff;
      }
   }
}
4430
4431#endif
4432
4433
4434#if GFX_VER >= 6
4435static bool
4436crocus_is_drawing_points(const struct crocus_context *ice)
4437{
4438   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4439
4440   if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
4441       cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
4442      return true;
4443
4444   if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4445      const struct brw_gs_prog_data *gs_prog_data =
4446         (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
4447      return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
4448   } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4449      const struct brw_tes_prog_data *tes_data =
4450         (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
4451      return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
4452   } else {
4453      return ice->state.prim_mode == PIPE_PRIM_POINTS;
4454   }
4455}
4456#endif
4457
4458#if GFX_VER >= 6
/**
 * Fill out one SF_OUTPUT_ATTRIBUTE_DETAIL override for an FS input.
 *
 * \param attr                   override entry to fill (assumed zeroed)
 * \param vue_map                VUE layout of the last vertex stage
 * \param urb_entry_read_offset  first 256-bit URB chunk the SF reads
 * \param fs_attr                VARYING_SLOT_* the fragment shader reads
 * \param two_side_color         whether two-sided color is enabled
 * \param max_source_attr        in/out: highest source attribute used so far
 */
static void
get_attr_override(
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
   const struct brw_vue_map *vue_map,
   int urb_entry_read_offset, int fs_attr,
   bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header.  We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it.  This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined.  Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot.  In every other case, the
       * attribute override we supply doesn't matter.  So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}
4552
/**
 * Compute the SF/SBE attribute overrides and point-sprite enables for the
 * fragment shader's inputs, along with the URB read offset and length the
 * SF needs to fetch them.
 */
static void
calculate_attr_overrides(
   const struct crocus_context *ice,
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
   uint32_t *point_sprite_enables,
   uint32_t *urb_entry_read_length,
   uint32_t *urb_entry_read_offset)
{
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   uint32_t max_source_attr = 0;
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   /* Start reading the URB at the first slot the FS actually consumes. */
   int first_slot =
      brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);

   /* Each URB offset packs two varying slots */
   assert(first_slot % 2 == 0);
   *urb_entry_read_offset = first_slot / 2;
   *point_sprite_enables = 0;

   for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
      /* Skip varyings the fragment shader doesn't read. */
      const int input_index = wm_prog_data->urb_setup[fs_attr];

      if (input_index < 0)
         continue;

      /* Point sprites replace TEX0..7 (when enabled per-coordinate) and
       * gl_PointCoord with generated coordinates — no override needed.
       */
      bool point_sprite = false;
      if (crocus_is_drawing_points(ice)) {
         if (fs_attr >= VARYING_SLOT_TEX0 &&
             fs_attr <= VARYING_SLOT_TEX7 &&
             cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
            point_sprite = true;

         if (fs_attr == VARYING_SLOT_PNTC)
            point_sprite = true;

         if (point_sprite)
            *point_sprite_enables |= 1U << input_index;
      }

      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
      if (!point_sprite) {
         get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
                           cso_rast->cso.light_twoside, &max_source_attr);
      }

      /* The hardware can only do the overrides on 16 overrides at a
       * time, and the other up to 16 have to be lined up so that the
       * input index = the output index.  We'll need to do some
       * tweaking to make sure that's the case.
       */
      if (input_index < 16)
         attr_overrides[input_index] = attribute;
      else
         assert(attribute.SourceAttribute == input_index);
   }

   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
    *
    * "This field should be set to the minimum length required to read the
    *  maximum source attribute.  The maximum source attribute is indicated
    *  by the maximum value of the enabled Attribute # Source Attribute if
    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
    *  enable is not set.
    *  read_length = ceiling((max_source_attr + 1) / 2)
    *
    *  [errata] Corruption/Hang possible if length programmed larger than
    *  recommended"
    *
    * Similar text exists for Ivy Bridge.
    */
   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
}
4631#endif
4632
4633#if GFX_VER >= 7
/**
 * Emit 3DSTATE_SBE (and on Gen8, 3DSTATE_SBE_SWIZ) describing how the SF
 * unit feeds the fragment shader's inputs.
 *
 * On Gen7 the attribute overrides live directly inside 3DSTATE_SBE, so
 * attr_overrides is aliased to sbe.Attribute via the macro below; on Gen8
 * they are gathered into a local array and emitted in 3DSTATE_SBE_SWIZ.
 */
static void
crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
#if GFX_VER >= 8
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
#else
#define attr_overrides sbe.Attribute
#endif

   uint32_t urb_entry_read_length;
   uint32_t urb_entry_read_offset;
   uint32_t point_sprite_enables;

   crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.AttributeSwizzleEnable = true;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;

      /* Computed inside the pack block so the Gen7 attr_overrides alias
       * writes straight into the packet.
       */
      calculate_attr_overrides(ice,
                               attr_overrides,
                               &point_sprite_enables,
                               &urb_entry_read_length,
                               &urb_entry_read_offset);
      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.VertexURBEntryReadLength = urb_entry_read_length;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
#if GFX_VER >= 8
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;
#endif
   }
#if GFX_VER >= 8
   crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
      for (int i = 0; i < 16; i++)
         sbes.Attribute[i] = attr_overrides[i];
   }
#endif
}
4676#endif
4677
4678/* ------------------------------------------------------------------- */
4679
4680/**
4681 * Populate VS program key fields based on the current state.
4682 */
4683static void
4684crocus_populate_vs_key(const struct crocus_context *ice,
4685                       const struct shader_info *info,
4686                       gl_shader_stage last_stage,
4687                       struct brw_vs_prog_key *key)
4688{
4689   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4690
4691   if (info->clip_distance_array_size == 0 &&
4692       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4693       last_stage == MESA_SHADER_VERTEX)
4694      key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4695
4696   if (last_stage == MESA_SHADER_VERTEX &&
4697       info->outputs_written & (VARYING_BIT_PSIZ))
4698      key->clamp_pointsize = 1;
4699
4700#if GFX_VER <= 5
4701   key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
4702                         cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
4703   key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
4704#endif
4705
4706   key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
4707
4708#if GFX_VERx10 < 75
4709   uint64_t inputs_read = info->inputs_read;
4710   int ve_idx = 0;
4711   while (inputs_read) {
4712      int i = u_bit_scan64(&inputs_read);
4713      key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
4714      ve_idx++;
4715   }
4716#endif
4717}
4718
4719/**
4720 * Populate TCS program key fields based on the current state.
4721 */
4722static void
4723crocus_populate_tcs_key(const struct crocus_context *ice,
4724                        struct brw_tcs_prog_key *key)
4725{
4726}
4727
4728/**
4729 * Populate TES program key fields based on the current state.
4730 */
4731static void
4732crocus_populate_tes_key(const struct crocus_context *ice,
4733                        const struct shader_info *info,
4734                        gl_shader_stage last_stage,
4735                        struct brw_tes_prog_key *key)
4736{
4737   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4738
4739   if (info->clip_distance_array_size == 0 &&
4740       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4741       last_stage == MESA_SHADER_TESS_EVAL)
4742      key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4743
4744   if (last_stage == MESA_SHADER_TESS_EVAL &&
4745       info->outputs_written & (VARYING_BIT_PSIZ))
4746      key->clamp_pointsize = 1;
4747}
4748
4749/**
4750 * Populate GS program key fields based on the current state.
4751 */
4752static void
4753crocus_populate_gs_key(const struct crocus_context *ice,
4754                       const struct shader_info *info,
4755                       gl_shader_stage last_stage,
4756                       struct brw_gs_prog_key *key)
4757{
4758   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4759
4760   if (info->clip_distance_array_size == 0 &&
4761       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4762       last_stage == MESA_SHADER_GEOMETRY)
4763      key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4764
4765   if (last_stage == MESA_SHADER_GEOMETRY &&
4766       info->outputs_written & (VARYING_BIT_PSIZ))
4767      key->clamp_pointsize = 1;
4768}
4769
4770/**
4771 * Populate FS program key fields based on the current state.
4772 */
4773static void
4774crocus_populate_fs_key(const struct crocus_context *ice,
4775                       const struct shader_info *info,
4776                       struct brw_wm_prog_key *key)
4777{
4778   struct crocus_screen *screen = (void *) ice->ctx.screen;
4779   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
4780   const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
4781   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
4782   const struct crocus_blend_state *blend = ice->state.cso_blend;
4783
4784#if GFX_VER < 6
4785   uint32_t lookup = 0;
4786
4787   if (info->fs.uses_discard || zsa->cso.alpha_enabled)
4788      lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;
4789
4790   if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4791      lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;
4792
4793   if (fb->zsbuf && zsa->cso.depth_enabled) {
4794      lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;
4795
4796      if (zsa->cso.depth_writemask)
4797         lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
4798
4799   }
4800   if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
4801      lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
4802      if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
4803         lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
4804   }
4805   key->iz_lookup = lookup;
4806   key->stats_wm = ice->state.stats_wm;
4807#endif
4808
4809   uint32_t line_aa = BRW_WM_AA_NEVER;
4810   if (rast->cso.line_smooth) {
4811      int reduced_prim = ice->state.reduced_prim_mode;
4812      if (reduced_prim == PIPE_PRIM_LINES)
4813         line_aa = BRW_WM_AA_ALWAYS;
4814      else if (reduced_prim == PIPE_PRIM_TRIANGLES) {
4815         if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
4816            line_aa = BRW_WM_AA_SOMETIMES;
4817
4818            if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
4819                rast->cso.cull_face == PIPE_FACE_BACK)
4820               line_aa = BRW_WM_AA_ALWAYS;
4821         } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
4822            line_aa = BRW_WM_AA_SOMETIMES;
4823
4824            if (rast->cso.cull_face == PIPE_FACE_FRONT)
4825               line_aa = BRW_WM_AA_ALWAYS;
4826         }
4827      }
4828   }
4829   key->line_aa = line_aa;
4830
4831   key->nr_color_regions = fb->nr_cbufs;
4832
4833   key->clamp_fragment_color = rast->cso.clamp_fragment_color;
4834
4835   key->alpha_to_coverage = blend->cso.alpha_to_coverage;
4836
4837   key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;
4838
4839   key->flat_shade = rast->cso.flatshade &&
4840      (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
4841
4842   key->persample_interp = rast->cso.force_persample_interp;
4843   key->multisample_fbo = rast->cso.multisample && fb->samples > 1;
4844
4845   key->ignore_sample_mask_out = !key->multisample_fbo;
4846   key->coherent_fb_fetch = false; // TODO: needed?
4847
4848   key->force_dual_color_blend =
4849      screen->driconf.dual_color_blend_by_location &&
4850      (blend->blend_enables & 1) && blend->dual_color_blending;
4851
4852#if GFX_VER <= 5
4853   if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
4854      key->emit_alpha_test = true;
4855      key->alpha_test_func = zsa->cso.alpha_func;
4856      key->alpha_test_ref = zsa->cso.alpha_ref_value;
4857   }
4858#endif
4859}
4860
/**
 * Fill out a compute shader program key.
 *
 * Nothing beyond the base key is currently needed, so this is a no-op;
 * it exists to keep the populate_*_key interface uniform across stages.
 */
static void
crocus_populate_cs_key(const struct crocus_context *ice,
                       struct brw_cs_prog_key *key)
{
}
4866
/* KSP: Kernel Start Pointer.
 *
 * On Gen4 shader kernels are addressed via a relocation into the shader
 * cache BO; on Gen5+ the kernel start pointer is simply the byte offset of
 * the kernel (relative to Instruction Base Address).
 *
 * NOTE(review): the Gen4 macro expansion ends in a semicolon, so it only
 * works in statement position (`x = KSP(...);` yields a harmless `;;`);
 * confirm there are no expression-context uses before relying on it.
 */
#if GFX_VER == 4
#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset);
#elif GFX_VER >= 5
static uint64_t
KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
{
   return shader->offset;
}
#endif
4876
/* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
 * prefetching of binding tables in A0 and B0 steppings.  XXX: Revisit
 * this WA on C0 stepping.
 *
 * TODO: Fill out SamplerCount for prefetching?
 */

/* Fill the thread-dispatch fields common to the fixed-function shader stage
 * packets.  `pkt` is the packet struct being emitted, `prefix` is the
 * stage-specific prefix for the URB entry read fields, and `stage` selects
 * the scratch-space slot.  The use site must have `ice`, `shader`,
 * `prog_data`, and `vue_prog_data` in scope.  PerThreadScratchSpace encodes
 * a power-of-two size: ffs(total_scratch) - 11 maps 1KB -> 0, 2KB -> 1, etc.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                 \
   pkt.KernelStartPointer = KSP(ice, shader);                           \
   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;              \
   pkt.FloatingPointMode = prog_data->use_alt_mode;                     \
                                                                        \
   pkt.DispatchGRFStartRegisterForURBData =                             \
      prog_data->dispatch_grf_start_reg;                                \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;     \
   pkt.prefix##URBEntryReadOffset = 0;                                  \
                                                                        \
   pkt.StatisticsEnable = true;                                         \
   pkt.Enable           = true;                                         \
                                                                        \
   if (prog_data->total_scratch) {                                      \
      struct crocus_bo *bo =                                            \
         crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
      pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;   \
      pkt.ScratchSpaceBasePointer = rw_bo(bo, 0);                       \
   }
4903
4904/* ------------------------------------------------------------------- */
#if GFX_VER >= 6
/* 3DSTATE_CONSTANT_* "3D Command Sub Opcode" values, indexed by shader
 * stage.  Compute push constants use a different mechanism, hence 0.
 */
static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX]    = 21,
   [MESA_SHADER_TESS_CTRL] = 25, /* HS */
   [MESA_SHADER_TESS_EVAL] = 26, /* DS */
   [MESA_SHADER_GEOMETRY]  = 22,
   [MESA_SHADER_FRAGMENT]  = 23,
   [MESA_SHADER_COMPUTE]   = 0,
};
#endif
4915
4916static void
4917emit_sized_null_surface(struct crocus_batch *batch,
4918                        unsigned width, unsigned height,
4919                        unsigned layers, unsigned levels,
4920                        unsigned minimum_array_element,
4921                        uint32_t *out_offset)
4922{
4923   struct isl_device *isl_dev = &batch->screen->isl_dev;
4924   uint32_t *surf = stream_state(batch, isl_dev->ss.size,
4925                                 isl_dev->ss.align,
4926                                 out_offset);
4927   //TODO gen 6 multisample crash
4928   isl_null_fill_state(isl_dev, surf,
4929                       .size = isl_extent3d(width, height, layers),
4930                       .levels = levels,
4931                       .minimum_array_element = minimum_array_element);
4932}
/**
 * Upload a minimal 1x1 null SURFACE_STATE, for binding table slots that
 * have no real surface bound.
 */
static void
emit_null_surface(struct crocus_batch *batch,
                  uint32_t *out_offset)
{
   emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
}
4939
4940static void
4941emit_null_fb_surface(struct crocus_batch *batch,
4942                     struct crocus_context *ice,
4943                     uint32_t *out_offset)
4944{
4945   uint32_t width, height, layers, level, layer;
4946   /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
4947   if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
4948      emit_null_surface(batch, out_offset);
4949      return;
4950   }
4951
4952   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
4953   width = MAX2(cso->width, 1);
4954   height = MAX2(cso->height, 1);
4955   layers = cso->layers ? cso->layers : 1;
4956   level = 0;
4957   layer = 0;
4958
4959   if (cso->nr_cbufs == 0 && cso->zsbuf) {
4960      width = cso->zsbuf->width;
4961      height = cso->zsbuf->height;
4962      level = cso->zsbuf->u.tex.level;
4963      layer = cso->zsbuf->u.tex.first_layer;
4964   }
4965   emit_sized_null_surface(batch, width, height,
4966                           layers, level, layer,
4967                           out_offset);
4968}
4969
/**
 * Fill out the SURFACE_STATE at @surf_state (which lives at batch-state
 * offset @addr_offset) for resource @res, emitting relocations for the
 * surface address and, if present, the aux (MCS/HiZ) buffer address.
 *
 * @adjust_surf rewrites the surface to a single-image view for cases the
 * hardware can't address directly (3D targets with a single layer, Gen4
 * cube maps, 1D arrays).
 * @writeable requests a writeable relocation (render targets, images).
 * @blend_enable and @write_disables are only consumed on Gen4/5, where
 * blending and the color write mask live in SURFACE_STATE.
 */
static void
emit_surface_state(struct crocus_batch *batch,
                   struct crocus_resource *res,
                   const struct isl_surf *in_surf,
                   bool adjust_surf,
                   struct isl_view *in_view,
                   bool writeable,
                   enum isl_aux_usage aux_usage,
                   bool blend_enable,
                   uint32_t write_disables,
                   uint32_t *surf_state,
                   uint32_t addr_offset)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t reloc = RELOC_32BIT;
   uint64_t offset_B = res->offset;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (writeable)
      reloc |= RELOC_WRITE;

   /* Work on copies so the caller's surf/view are not modified. */
   struct isl_surf surf = *in_surf;
   struct isl_view view = *in_view;
   if (adjust_surf) {
      if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
         /* Rebase to the exact 2D image; offset_B and the intratile x/y
          * offsets are updated to point at that slice.
          */
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, 0,
                                 view.base_array_layer,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) {
         /* Gen4 can't render to cube faces directly; select the face. */
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, view.base_array_layer,
                                 0,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
         surf.dim = ISL_SURF_DIM_2D;
   }

   union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
   struct crocus_bo *aux_bo = NULL;
   uint32_t aux_offset = 0;
   struct isl_surf *aux_surf = NULL;
   if (aux_usage != ISL_AUX_USAGE_NONE) {
      aux_surf = &res->aux.surf;
      aux_offset = res->aux.offset;
      aux_bo = res->aux.bo;

      clear_color = crocus_resource_get_clear_color(res);
   }

   isl_surf_fill_state(isl_dev, surf_state,
                       .surf = &surf,
                       .view = &view,
                       .address = crocus_state_reloc(batch,
                                                     addr_offset + isl_dev->ss.addr_offset,
                                                     res->bo, offset_B, reloc),
                       .aux_surf = aux_surf,
                       .aux_usage = aux_usage,
                       .aux_address = aux_offset,
                       .mocs = crocus_mocs(res->bo, isl_dev),
                       .clear_color = clear_color,
                       .use_clear_address = false,
                       .clear_address = 0,
                       .x_offset_sa = tile_x_sa,
                       .y_offset_sa = tile_y_sa,
#if GFX_VER <= 5
                       .blend_enable = blend_enable,
                       .write_disables = write_disables,
#endif
      );

   if (aux_surf) {
      /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
       * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
       * contain other control information.  Since buffer addresses are always
       * on 4k boundaries (and thus have their lower 12 bits zero), we can use
       * an ordinary reloc to do the necessary address translation.
       *
       * FIXME: move to the point of assignment.
       */
      if (GFX_VER == 8) {
         /* Gen8 uses a full 64-bit aux address field. */
         uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      } else {
         uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      }
   }

}
5072
5073static uint32_t
5074emit_surface(struct crocus_batch *batch,
5075             struct crocus_surface *surf,
5076             enum isl_aux_usage aux_usage,
5077             bool blend_enable,
5078             uint32_t write_disables)
5079{
5080   struct isl_device *isl_dev = &batch->screen->isl_dev;
5081   struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5082   struct isl_view *view = &surf->view;
5083   uint32_t offset = 0;
5084   enum pipe_texture_target target = res->base.b.target;
5085   bool adjust_surf = false;
5086
5087   if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE)
5088      adjust_surf = true;
5089
5090   if (surf->align_res)
5091      res = (struct crocus_resource *)surf->align_res;
5092
5093   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5094
5095   emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
5096                      aux_usage, blend_enable,
5097                      write_disables,
5098                      surf_state, offset);
5099   return offset;
5100}
5101
5102static uint32_t
5103emit_rt_surface(struct crocus_batch *batch,
5104                struct crocus_surface *surf,
5105                enum isl_aux_usage aux_usage)
5106{
5107   struct isl_device *isl_dev = &batch->screen->isl_dev;
5108   struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5109   struct isl_view *view = &surf->read_view;
5110   uint32_t offset = 0;
5111   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5112
5113   emit_surface_state(batch, res, &surf->surf, true, view, false,
5114                      aux_usage, 0, false,
5115                      surf_state, offset);
5116   return offset;
5117}
5118
/**
 * Upload SURFACE_STATE for the compute grid-size buffer (backing
 * gl_NumWorkGroups reads), returning the state offset.
 */
static uint32_t
emit_grid(struct crocus_context *ice,
          struct crocus_batch *batch)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;
   struct crocus_state_ref *grid_ref = &ice->state.grid_size;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   /* size_B = 12: three uint32 grid dimensions (x, y, z). */
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(grid_ref->res),
                                                       grid_ref->offset,
                                                       RELOC_32BIT),
                         .size_B = 12,
                         .format = ISL_FORMAT_RAW,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
   return offset;
}
5139
/**
 * Upload SURFACE_STATE for a uniform (constant) buffer binding, returning
 * the state offset.
 */
static uint32_t
emit_ubo_buffer(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct pipe_constant_buffer *buffer)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   /* NOTE(review): .format = 0 is the isl format with enum value zero
    * (presumably R32G32B32A32_FLOAT, the usual vec4 constant-buffer format,
    * unlike the RAW format used for SSBOs) — confirm against isl_format.
    */
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(buffer->buffer),
                                                       buffer->buffer_offset,
                                                       RELOC_32BIT),
                         .size_B = buffer->buffer_size,
                         .format = 0,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));

   return offset;
}
5163
5164static uint32_t
5165emit_ssbo_buffer(struct crocus_context *ice,
5166                 struct crocus_batch *batch,
5167                 struct pipe_shader_buffer *buffer, bool writeable)
5168{
5169   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5170   uint32_t offset = 0;
5171   uint32_t reloc = RELOC_32BIT;
5172
5173   if (writeable)
5174      reloc |= RELOC_WRITE;
5175   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5176                                       isl_dev->ss.align, &offset);
5177   isl_buffer_fill_state(isl_dev, surf_state,
5178                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5179                                                       crocus_resource_bo(buffer->buffer),
5180                                                       buffer->buffer_offset,
5181                                                       reloc),
5182                         .size_B = buffer->buffer_size,
5183                         .format = ISL_FORMAT_RAW,
5184                         .swizzle = ISL_SWIZZLE_IDENTITY,
5185                         .stride_B = 1,
5186                         .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5187
5188   return offset;
5189}
5190
/**
 * Upload SURFACE_STATE for a sampler view, returning the state offset.
 *
 * Buffer textures get a buffer surface clamped to the BO size and the
 * hardware's maximum texel-buffer size; image textures go through
 * emit_surface_state().  @for_gather selects the gather-specific view
 * (which may use a different channel swizzle/format).
 */
static uint32_t
emit_sampler_view(struct crocus_context *ice,
                  struct crocus_batch *batch,
                  bool for_gather,
                  struct crocus_sampler_view *isv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);

   if (isv->base.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
      const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      /* Clamp to the view size, the remaining BO size, and the maximum
       * number of texels addressable via a buffer surface.
       */
      unsigned final_size =
         MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          isv->res->bo,
                                                          isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
                            .size_B = final_size,
                            .format = isv->view.format,
                            .swizzle = isv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(isv->res->bo, isl_dev)
         );
   } else {
      enum isl_aux_usage aux_usage =
         crocus_resource_texture_aux_usage(isv->res);

      emit_surface_state(batch, isv->res, &isv->res->surf, false,
                         for_gather ? &isv->gather_view : &isv->view,
                         false, aux_usage, false,
                         0, surf_state, offset);
   }
   return offset;
}
5230
/**
 * Upload SURFACE_STATE for a shader image binding, returning the state
 * offset.
 *
 * Three cases: buffer images become buffer surfaces (clamped like sampler
 * buffer views); typeless (RAW) images of real textures are bound as a RAW
 * buffer covering the whole resource; everything else goes through
 * emit_surface_state() with no aux usage.
 */
static uint32_t
emit_image_view(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct crocus_image_view *iv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
   uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
   if (res->base.b.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
      const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      unsigned final_size =
         MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          res->bo,
                                                          res->offset + iv->base.u.buf.offset, reloc),
                            .size_B = final_size,
                            .format = iv->view.format,
                            .swizzle = iv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(res->bo, isl_dev)
         );
   } else {
      if (iv->view.format == ISL_FORMAT_RAW) {
         /* Typeless access: expose the entire resource as a RAW buffer. */
         isl_buffer_fill_state(isl_dev, surf_state,
                               .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                             res->bo,
                                                             res->offset, reloc),
                               .size_B = res->bo->size - res->offset,
                               .format = iv->view.format,
                               .swizzle = iv->view.swizzle,
                               .stride_B = 1,
                               .mocs = crocus_mocs(res->bo, isl_dev),
            );


      } else {
         /* aux_usage = 0 (ISL_AUX_USAGE_NONE): images are accessed without
          * auxiliary compression.
          */
         emit_surface_state(batch, res,
                            &res->surf, false, &iv->view,
                            write, 0, false,
                            0, surf_state, offset);
      }
   }

   return offset;
}
5284
#if GFX_VER == 6
/**
 * Upload a buffer SURFACE_STATE for one transform feedback (stream output)
 * binding on Gen6, where the GS writes SOL data through binding table
 * entries.  Returns the state offset, or 0 if the output slot is unused or
 * streamout is inactive.
 */
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
                 struct pipe_stream_output_info *so_info,
                 uint32_t idx)
{
   struct crocus_context *ice = batch->ice;

   if (idx >= so_info->num_outputs || !ice->state.streamout_active)
      return 0;
   const struct pipe_stream_output *output = &so_info->output[idx];
   const int buffer = output->output_buffer;
   assert(output->stream == 0);

   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
   unsigned stride_dwords = so_info->stride[buffer];
   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;

   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
   unsigned num_vector_components = output->num_components;
   unsigned num_elements;
   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
    * too big to map using a single binding table entry?
    */
   //   assert((size_dwords - offset_dwords) / stride_dwords
   //          <= BRW_MAX_NUM_BUFFER_ENTRIES);

   if (size_dwords > offset_dwords + num_vector_components) {
      /* There is room for at least 1 transform feedback output in the buffer.
       * Compute the number of additional transform feedback outputs the
       * buffer has room for.
       *
       * NOTE(review): unlike the comment suggests, this is a dword count,
       * not an output count (there is no division by stride_dwords) — the
       * surface size below is num_elements * 4 bytes; confirm against the
       * classic i965 gen6 SOL code.
       */
      num_elements =
         (size_dwords - offset_dwords - num_vector_components);
   } else {
      /* There isn't even room for a single transform feedback output in the
       * buffer.  We can't configure the binding table entry to prevent output
       * entirely; we'll have to rely on the geometry shader to detect
       * overflow.  But to minimize the damage in case of a bug, set up the
       * binding table entry to just allow a single output.
       */
      num_elements = 0;
   }
   num_elements += stride_dwords;

   /* Pick the float surface format matching the output's component count. */
   uint32_t surface_format;
   switch (num_vector_components) {
   case 1:
      surface_format = ISL_FORMAT_R32_FLOAT;
      break;
   case 2:
      surface_format = ISL_FORMAT_R32G32_FLOAT;
      break;
   case 3:
      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
      break;
   case 4:
      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
      break;
   default:
      unreachable("Invalid vector size for transform feedback output");
   }

   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(&buf->base.b),
                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
                         .size_B = num_elements * 4,
                         .stride_B = stride_dwords * 4,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .format = surface_format);
   return offset;
}
#endif
5364
/* Iterate over the indices of a binding table group, skipping slots the
 * compiled shader does not actually use.  Expects a local
 * `struct crocus_binding_table *bt` in scope at the use site.
 */
#define foreach_surface_used(index, group)                      \
   for (int index = 0; index < bt->sizes[group]; index++)       \
      if (crocus_group_index_to_bti(bt, group, index) !=        \
          CROCUS_SURFACE_NOT_USED)
5369
/**
 * Populate a shader stage's binding table: upload SURFACE_STATE for every
 * bound surface (render targets, grid size, SOL, textures, gather views,
 * images, UBOs, SSBOs) and record each one's state offset in the shader's
 * surf_offset[] array, in the fixed group order the compiler assigned.
 *
 * @ff_gs selects the Gen6 fixed-function GS program instead of the
 * stage's user shader (it has no crocus_shader_state, so shs is NULL and
 * must not be dereferenced on that path).
 */
static void
crocus_populate_binding_table(struct crocus_context *ice,
                              struct crocus_batch *batch,
                              gl_shader_stage stage, bool ff_gs)
{
   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
   if (!shader)
      return;

   struct crocus_binding_table *bt = &shader->bt;
   int s = 0;  /* running binding table slot index */
   uint32_t *surf_offsets = shader->surf_offset;

#if GFX_VER < 8
   const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif

   if (stage == MESA_SHADER_FRAGMENT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
      if (cso_fb->nr_cbufs) {
         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
            uint32_t write_disables = 0;
            bool blend_enable = false;
#if GFX_VER <= 5
            const struct pipe_rt_blend_state *rt =
               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
            /* NOTE(review): this `shader` shadows the outer `shader` local;
             * for the FS stage they refer to the same program, but the
             * shadowing is fragile — consider renaming.
             */
            struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
            struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
            /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
            blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
            if (cso_fb->cbufs[i]) {
               surf_offsets[s] = emit_surface(batch,
                                              (struct crocus_surface *)cso_fb->cbufs[i],
                                              ice->state.draw_aux_usage[i],
                                              blend_enable,
                                              write_disables);
            } else {
               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
            }
            s++;
         }
      } else {
         /* No color buffers: still emit one null RT so RT writes land
          * somewhere harmless.
          */
         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
         s++;
      }

      /* Render-target read surfaces for framebuffer fetch. */
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
         if (cso_fb->cbufs[i]) {
            surf_offsets[s++] = emit_rt_surface(batch,
                                                (struct crocus_surface *)cso_fb->cbufs[i],
                                                ice->state.draw_aux_usage[i]);
         }
      }
   }

   if (stage == MESA_SHADER_COMPUTE) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
         surf_offsets[s] = emit_grid(ice, batch);
         s++;
      }
   }

#if GFX_VER == 6
   /* Gen6 stream output is written by the GS through SOL surfaces; the
    * stream output info comes from the GS if present, else the VS.
    */
   if (stage == MESA_SHADER_GEOMETRY) {
      struct pipe_stream_output_info *so_info;
      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
      else
         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
      struct crocus_sampler_view *view = shs->textures[i];
      if (view)
         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

#if GFX_VER < 8
   /* Separate gather views (different swizzle/format) for textureGather. */
   if (info && info->uses_texture_gather) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
         struct crocus_sampler_view *view = shs->textures[i];
         if (view)
            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
         else
            emit_null_surface(batch, &surf_offsets[s]);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
      struct crocus_image_view *view = &shs->image[i];
      if (view->base.resource)
         surf_offsets[s] = emit_image_view(ice, batch, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
      if (shs->constbufs[i].buffer)
         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
      if (shs->ssbo[i].buffer)
         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
                                            !!(shs->writable_ssbos & (1 << i)));
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

}
5502/* ------------------------------------------------------------------- */
/**
 * Upload a binding table (an array of 32-bit SURFACE_STATE offsets) to the
 * batch's state buffer, returning its offset.  An empty table uploads
 * nothing and returns 0.
 */
static uint32_t
crocus_upload_binding_table(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            uint32_t *table,
                            uint32_t size)

{
   return size ? emit_state(batch, table, size, 32) : 0;
}
5514
5515/**
5516 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5517 */
5518
5519static void
5520crocus_update_surface_base_address(struct crocus_batch *batch)
5521{
5522   if (batch->state_base_address_emitted)
5523      return;
5524
5525   UNUSED uint32_t mocs = batch->screen->isl_dev.mocs.internal;
5526
5527   flush_before_state_base_change(batch);
5528
5529   crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
5530      /* Set base addresses */
5531      sba.GeneralStateBaseAddressModifyEnable = true;
5532
5533#if GFX_VER >= 6
5534      sba.DynamicStateBaseAddressModifyEnable = true;
5535      sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
5536#endif
5537
5538      sba.SurfaceStateBaseAddressModifyEnable = true;
5539      sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);
5540
5541      sba.IndirectObjectBaseAddressModifyEnable = true;
5542
5543#if GFX_VER >= 5
5544      sba.InstructionBaseAddressModifyEnable = true;
5545      sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
5546#endif
5547
5548      /* Set buffer sizes on Gen8+ or upper bounds on Gen4-7 */
5549#if GFX_VER == 8
5550      sba.GeneralStateBufferSize   = 0xfffff;
5551      sba.IndirectObjectBufferSize = 0xfffff;
5552      sba.InstructionBufferSize    = 0xfffff;
5553      sba.DynamicStateBufferSize   = MAX_STATE_SIZE;
5554
5555      sba.GeneralStateBufferSizeModifyEnable    = true;
5556      sba.DynamicStateBufferSizeModifyEnable    = true;
5557      sba.IndirectObjectBufferSizeModifyEnable  = true;
5558      sba.InstructionBuffersizeModifyEnable     = true;
5559#else
5560      sba.GeneralStateAccessUpperBoundModifyEnable = true;
5561      sba.IndirectObjectAccessUpperBoundModifyEnable = true;
5562
5563#if GFX_VER >= 5
5564      sba.InstructionAccessUpperBoundModifyEnable = true;
5565#endif
5566
5567#if GFX_VER >= 6
5568      /* Dynamic state upper bound.  Although the documentation says that
5569       * programming it to zero will cause it to be ignored, that is a lie.
5570       * If this isn't programmed to a real bound, the sampler border color
5571       * pointer is rejected, causing border color to mysteriously fail.
5572       */
5573      sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
5574      sba.DynamicStateAccessUpperBoundModifyEnable = true;
5575#else
5576      /* Same idea but using General State Base Address on Gen4-5 */
5577      sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
5578#endif
5579#endif
5580
5581#if GFX_VER >= 6
5582      /* The hardware appears to pay attention to the MOCS fields even
5583       * if you don't set the "Address Modify Enable" bit for the base.
5584       */
5585      sba.GeneralStateMOCS            = mocs;
5586      sba.StatelessDataPortAccessMOCS = mocs;
5587      sba.DynamicStateMOCS            = mocs;
5588      sba.IndirectObjectMOCS          = mocs;
5589      sba.InstructionMOCS             = mocs;
5590      sba.SurfaceStateMOCS            = mocs;
5591#endif
5592   }
5593
5594   flush_after_state_base_change(batch);
5595
5596   /* According to section 3.6.1 of VOL1 of the 965 PRM,
5597    * STATE_BASE_ADDRESS updates require a reissue of:
5598    *
5599    * 3DSTATE_PIPELINE_POINTERS
5600    * 3DSTATE_BINDING_TABLE_POINTERS
5601    * MEDIA_STATE_POINTERS
5602    *
5603    * and this continues through Ironlake.  The Sandy Bridge PRM, vol
    * 1 part 1 says that the following packets must be reissued:
5605    *
5606    * 3DSTATE_CC_POINTERS
5607    * 3DSTATE_BINDING_TABLE_POINTERS
5608    * 3DSTATE_SAMPLER_STATE_POINTERS
5609    * 3DSTATE_VIEWPORT_STATE_POINTERS
5610    * MEDIA_STATE_POINTERS
5611    *
5612    * Those are always reissued following SBA updates anyway (new
5613    * batch time), except in the case of the program cache BO
5614    * changing.  Having a separate state flag makes the sequence more
5615    * obvious.
5616    */
5617#if GFX_VER <= 5
5618   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
5619#elif GFX_VER == 6
5620   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
5621#endif
5622   batch->state_base_address_emitted = true;
5623}
5624
/* Compute the depth range [*zmin, *zmax] for a viewport.
 *
 * When the position is already in window space the viewport transform is
 * bypassed, so the full [0, 1] depth range is used; otherwise defer to the
 * shared gallium helper (which honors the half-z clip convention).
 */
static inline void
crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
                          bool window_space_position, float *zmin, float *zmax)
{
   if (!window_space_position) {
      util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
      return;
   }

   *zmin = 0.f;
   *zmax = 1.f;
}
5636
/* Push-constant ranges gathered for one shader stage, used to fill the
 * buffer slots of a single 3DSTATE_CONSTANT_* packet (see
 * setup_constant_buffers / emit_push_constant_packets).
 */
struct push_bos {
   struct {
      struct crocus_address addr; /* address of the constant data to push */
      uint32_t length;            /* read length in hardware units; the four
                                   * lengths together must not exceed 64 */
   } buffers[4];
   int buffer_count;    /* number of valid entries in buffers[] */
   uint32_t max_length; /* largest single range length encountered */
};
5645
5646#if GFX_VER >= 6
5647static void
5648setup_constant_buffers(struct crocus_context *ice,
5649                       struct crocus_batch *batch,
5650                       int stage,
5651                       struct push_bos *push_bos)
5652{
5653   struct crocus_shader_state *shs = &ice->state.shaders[stage];
5654   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
5655   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
5656
5657   uint32_t push_range_sum = 0;
5658
5659   int n = 0;
5660   for (int i = 0; i < 4; i++) {
5661      const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
5662
5663      if (range->length == 0)
5664         continue;
5665
5666      push_range_sum += range->length;
5667
5668      if (range->length > push_bos->max_length)
5669         push_bos->max_length = range->length;
5670
5671      /* Range block is a binding table index, map back to UBO index. */
5672      unsigned block_index = crocus_bti_to_group_index(
5673         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
5674      assert(block_index != CROCUS_SURFACE_NOT_USED);
5675
5676      struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
5677      struct crocus_resource *res = (void *) cbuf->buffer;
5678
5679      assert(cbuf->buffer_offset % 32 == 0);
5680
5681      push_bos->buffers[n].length = range->length;
5682      push_bos->buffers[n].addr =
5683         res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
5684         : ro_bo(batch->ice->workaround_bo,
5685                 batch->ice->workaround_offset);
5686      n++;
5687   }
5688
5689   /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
5690    *
5691    *    "The sum of all four read length fields must be less than or
5692    *    equal to the size of 64."
5693    */
5694   assert(push_range_sum <= 64);
5695
5696   push_bos->buffer_count = n;
5697}
5698
5699#if GFX_VER == 7
/* Emit the Ivybridge VS workaround flush: a depth-stalling PIPE_CONTROL
 * that writes an immediate to the context's scratch workaround BO.
 * Callers gate this on INTEL_PLATFORM_IVB before emitting VS push
 * constants and URB state.
 */
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
   crocus_emit_pipe_control_write(batch,
                                  "vs workaround",
                                  PIPE_CONTROL_WRITE_IMMEDIATE
                                  | PIPE_CONTROL_DEPTH_STALL,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
5710#endif
5711
/* Emit a 3DSTATE_CONSTANT_* packet pointing @stage's push constants at the
 * ranges previously gathered into @push_bos.
 *
 * The VS packet layout is reused for every stage; the actual stage is
 * selected by overriding _3DCommandSubOpcode.  A NULL prog_data (stage
 * disabled) emits an empty packet, clearing the stage's constants.
 */
static void
emit_push_constant_packets(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           int stage,
                           const struct push_bos *push_bos)
{
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
   UNUSED uint32_t mocs = crocus_mocs(NULL, &batch->screen->isl_dev);

#if GFX_VER == 7
   /* Ivybridge needs a depth-stall flush before VS push-constant changes. */
   if (stage == MESA_SHADER_VERTEX) {
      if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
         gen7_emit_vs_workaround_flush(batch);
   }
#endif
   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
#if GFX_VER != 8
      /* MOCS is MBZ on Gen8 so we skip it there */
      pkt.ConstantBody.MOCS = mocs;
#endif

      if (prog_data) {
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         int n = push_bos->buffer_count;
         assert(n <= 4);
#if GFX_VERx10 >= 75
         const unsigned shift = 4 - n;
#else
         const unsigned shift = 0;
#endif
         for (int i = 0; i < n; i++) {
            pkt.ConstantBody.ReadLength[i + shift] =
               push_bos->buffers[i].length;
            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
         }
      }
#else
      /* Gen6 exposes only a single constant buffer slot in this packet. */
      if (prog_data) {
         int n = push_bos->buffer_count;
         assert (n <= 1);
         if (n == 1) {
            pkt.Buffer0Valid = true;
            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
         }
      }
#endif
   }
}
5774
5775#endif
5776
/* Per-generation container for the depth/stencil fields that
 * set_depth_stencil_bits() fills in: Gen8 programs them directly in the
 * 3DSTATE_WM_DEPTH_STENCIL command, Gen6-7 use the indirect
 * DEPTH_STENCIL_STATE structure, and Gen4-5 fold them into
 * COLOR_CALC_STATE.
 */
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
#endif
5784
5785static inline void
5786set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
5787{
5788   struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
5789   ds->DepthTestEnable = cso->cso.depth_enabled;
5790   ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
5791   ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);
5792
5793   ds->StencilFailOp = cso->cso.stencil[0].fail_op;
5794   ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
5795   ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
5796   ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);
5797
5798   ds->StencilTestMask = cso->cso.stencil[0].valuemask;
5799   ds->StencilWriteMask = cso->cso.stencil[0].writemask;
5800
5801   ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
5802   ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
5803   ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
5804   ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);
5805
5806   ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
5807   ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
5808   ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
5809   ds->StencilTestEnable = cso->cso.stencil[0].enabled;
5810   ds->StencilBufferWriteEnable =
5811      cso->cso.stencil[0].writemask != 0 ||
5812      (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
5813}
5814
/* Pack one VERTEX_BUFFER_STATE entry into *map and advance *map past it.
 *
 * @param buffer_id    vertex buffer index programmed into the entry
 * @param bo           backing buffer object (referenced read-only)
 * @param start_offset first byte of vertex data within the BO
 * @param end_offset   one past the last byte of vertex data
 * @param stride       pitch between consecutive vertices, in bytes
 * @param step_rate    instance data step rate; non-zero selects
 *                     per-instance (INSTANCEDATA) access on Gen4-7
 * @param map          in/out pointer into batch memory; advanced by
 *                     VERTEX_BUFFER_STATE_length dwords
 *
 * Gen8 sizes the buffer with BufferSize; Gen5-7 use an inclusive
 * EndAddress instead (hence the "- 1").
 */
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
                         unsigned buffer_id,
                         struct crocus_bo *bo,
                         unsigned start_offset,
                         unsigned end_offset,
                         unsigned stride,
                         unsigned step_rate,
                         uint32_t **map)
{
   const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
      vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
      vb.BufferSize = end_offset - start_offset;
#endif
      vb.VertexBufferIndex = buffer_id;
      vb.BufferPitch = stride;
#if GFX_VER >= 7
      vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
      vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
      vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
      vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
   }
   *map += vb_dwords;
}
5849
5850#if GFX_VER >= 6
5851static uint32_t
5852determine_sample_mask(struct crocus_context *ice)
5853{
5854   uint32_t num_samples = ice->state.framebuffer.samples;
5855
5856   if (num_samples <= 1)
5857      return 1;
5858
5859   uint32_t fb_mask = (1 << num_samples) - 1;
5860   return ice->state.sample_mask & fb_mask;
5861}
5862#endif
5863
5864static void
5865crocus_upload_dirty_render_state(struct crocus_context *ice,
5866                               struct crocus_batch *batch,
5867                               const struct pipe_draw_info *draw)
5868{
5869   uint64_t dirty = ice->state.dirty;
5870   uint64_t stage_dirty = ice->state.stage_dirty;
5871
5872   if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5873       !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5874      return;
5875
5876   if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5877      crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5878         vf.StatisticsEnable = true;
5879      }
5880   }
5881
5882#if GFX_VER <= 5
5883   if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5884                      CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5885      bool ret = calculate_curbe_offsets(batch);
5886      if (ret) {
5887         dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5888         stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5889      }
5890   }
5891
5892   if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5893       stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5894     bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5895                                           brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5896                                           ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5897     if (ret) {
5898	dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5899	stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5900     }
5901   }
5902#endif
5903   if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5904      const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5905      uint32_t cc_vp_address;
5906
5907      /* XXX: could avoid streaming for depth_clip [0,1] case. */
5908      uint32_t *cc_vp_map =
5909         stream_state(batch,
5910                      4 * ice->state.num_viewports *
5911                      GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5912      for (int i = 0; i < ice->state.num_viewports; i++) {
5913         float zmin, zmax;
5914         crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5915                                 ice->state.window_space_position,
5916                                 &zmin, &zmax);
5917         if (cso_rast->cso.depth_clip_near)
5918            zmin = 0.0;
5919         if (cso_rast->cso.depth_clip_far)
5920            zmax = 1.0;
5921
5922         crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5923            ccv.MinimumDepth = zmin;
5924            ccv.MaximumDepth = zmax;
5925         }
5926
5927         cc_vp_map += GENX(CC_VIEWPORT_length);
5928      }
5929
5930#if GFX_VER >= 7
5931      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5932         ptr.CCViewportPointer = cc_vp_address;
5933      }
5934#elif GFX_VER == 6
5935      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5936         vp.CCViewportStateChange = 1;
5937         vp.PointertoCC_VIEWPORT = cc_vp_address;
5938      }
5939#else
5940      ice->state.cc_vp_address = cc_vp_address;
5941      dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5942#endif
5943   }
5944
5945   if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
5946      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5947#if GFX_VER >= 7
5948      uint32_t sf_cl_vp_address;
5949      uint32_t *vp_map =
5950         stream_state(batch,
5951                      4 * ice->state.num_viewports *
5952                      GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5953#else
5954      uint32_t *vp_map =
5955         stream_state(batch,
5956                      4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
5957                      32, &ice->state.sf_vp_address);
5958      uint32_t *clip_map =
5959         stream_state(batch,
5960                      4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
5961                      32, &ice->state.clip_vp_address);
5962#endif
5963
5964      for (unsigned i = 0; i < ice->state.num_viewports; i++) {
5965         const struct pipe_viewport_state *state = &ice->state.viewports[i];
5966         float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
5967
5968#if GFX_VER == 8
5969         float vp_xmin = viewport_extent(state, 0, -1.0f);
5970         float vp_xmax = viewport_extent(state, 0,  1.0f);
5971         float vp_ymin = viewport_extent(state, 1, -1.0f);
5972         float vp_ymax = viewport_extent(state, 1,  1.0f);
5973#endif
5974         intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
5975                                        state->scale[0], state->scale[1],
5976                                        state->translate[0], state->translate[1],
5977                                        &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
5978#if GFX_VER >= 7
5979         crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
5980#else
5981         crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
5982#endif
5983         {
5984            vp.ViewportMatrixElementm00 = state->scale[0];
5985            vp.ViewportMatrixElementm11 = state->scale[1];
5986            vp.ViewportMatrixElementm22 = state->scale[2];
5987            vp.ViewportMatrixElementm30 = state->translate[0];
5988            vp.ViewportMatrixElementm31 = state->translate[1];
5989            vp.ViewportMatrixElementm32 = state->translate[2];
5990#if GFX_VER < 6
5991            struct pipe_scissor_state scissor;
5992            crocus_fill_scissor_rect(ice, 0, &scissor);
5993            vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
5994            vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
5995            vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
5996            vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
5997#endif
5998
5999#if GFX_VER >= 7
6000            vp.XMinClipGuardband = gb_xmin;
6001            vp.XMaxClipGuardband = gb_xmax;
6002            vp.YMinClipGuardband = gb_ymin;
6003            vp.YMaxClipGuardband = gb_ymax;
6004#endif
6005#if GFX_VER == 8
6006            vp.XMinViewPort = MAX2(vp_xmin, 0);
6007            vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6008            vp.YMinViewPort = MAX2(vp_ymin, 0);
6009            vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6010#endif
6011         }
6012#if GFX_VER < 7
6013         crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
6014            clip.XMinClipGuardband = gb_xmin;
6015            clip.XMaxClipGuardband = gb_xmax;
6016            clip.YMinClipGuardband = gb_ymin;
6017            clip.YMaxClipGuardband = gb_ymax;
6018         }
6019#endif
6020#if GFX_VER >= 7
6021         vp_map += GENX(SF_CLIP_VIEWPORT_length);
6022#else
6023         vp_map += GENX(SF_VIEWPORT_length);
6024         clip_map += GENX(CLIP_VIEWPORT_length);
6025#endif
6026      }
6027#if GFX_VER >= 7
6028      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6029         ptr.SFClipViewportPointer = sf_cl_vp_address;
6030      }
6031#elif GFX_VER == 6
6032      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
6033         vp.SFViewportStateChange = 1;
6034         vp.CLIPViewportStateChange = 1;
6035         vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
6036         vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
6037      }
6038#endif
6039   }
6040
6041#if GFX_VER >= 6
6042   if (dirty & CROCUS_DIRTY_GEN6_URB) {
6043#if GFX_VER == 6
6044      bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
6045         || ice->shaders.ff_gs_prog;
6046
6047      struct brw_vue_prog_data *vue_prog_data =
6048         (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
6049      const unsigned vs_size = vue_prog_data->urb_entry_size;
6050      unsigned gs_size = vs_size;
6051      if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
6052         struct brw_vue_prog_data *gs_vue_prog_data =
6053            (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
6054         gs_size = gs_vue_prog_data->urb_entry_size;
6055      }
6056
6057      genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
6058#endif
6059#if GFX_VER >= 7
6060      const struct intel_device_info *devinfo = &batch->screen->devinfo;
6061      bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
6062      bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
6063      unsigned entry_size[4];
6064
6065      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6066         if (!ice->shaders.prog[i]) {
6067            entry_size[i] = 1;
6068         } else {
6069            struct brw_vue_prog_data *vue_prog_data =
6070               (void *) ice->shaders.prog[i]->prog_data;
6071            entry_size[i] = vue_prog_data->urb_entry_size;
6072         }
6073         assert(entry_size[i] != 0);
6074      }
6075
6076      /* If we're just switching between programs with the same URB requirements,
6077       * skip the rest of the logic.
6078       */
6079      bool no_change = false;
6080      if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
6081          ice->urb.gs_present == gs_present &&
6082          ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
6083          ice->urb.tess_present == tess_present &&
6084          ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
6085          ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
6086         no_change = true;
6087      }
6088
6089      if (!no_change) {
6090         ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
6091         ice->urb.gs_present = gs_present;
6092         ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
6093         ice->urb.tess_present = tess_present;
6094         ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
6095         ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];
6096
6097         unsigned entries[4];
6098         unsigned start[4];
6099         bool constrained;
6100         intel_get_urb_config(devinfo,
6101                              batch->screen->l3_config_3d,
6102                              tess_present,
6103                              gs_present,
6104                              entry_size,
6105                              entries, start, NULL, &constrained);
6106
6107#if GFX_VER == 7
6108         if (devinfo->platform == INTEL_PLATFORM_IVB)
6109            gen7_emit_vs_workaround_flush(batch);
6110#endif
6111         for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6112            crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
6113               urb._3DCommandSubOpcode += i;
6114               urb.VSURBStartingAddress     = start[i];
6115               urb.VSURBEntryAllocationSize = entry_size[i] - 1;
6116               urb.VSNumberofURBEntries     = entries[i];
6117            }
6118         }
6119      }
6120#endif
6121   }
6122
6123   if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
6124      struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6125      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6126      struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6127
6128      STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
6129      int rt_dwords =
6130         MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6131#if GFX_VER >= 8
6132      rt_dwords += GENX(BLEND_STATE_length);
6133#endif
6134      uint32_t blend_offset;
6135      uint32_t *blend_map =
6136         stream_state(batch,
6137                      4 * rt_dwords, 64, &blend_offset);
6138
6139#if GFX_VER >= 8
6140   struct GENX(BLEND_STATE) be = { 0 };
6141   {
6142#else
6143   for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
6144      struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6145#define be entry
6146#endif
6147
6148      be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
6149      be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
6150      be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
6151      be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
6152      be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage;
6153      be.ColorDitherEnable = cso_blend->cso.dither;
6154
6155#if GFX_VER >= 8
6156      for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
6157         struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6158#else
6159      {
6160#endif
6161         const struct pipe_rt_blend_state *rt =
6162            &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
6163
6164         be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
6165            be.IndependentAlphaBlendEnable;
6166
6167         if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
6168            entry.LogicOpEnable = cso_blend->cso.logicop_enable;
6169            entry.LogicOpFunction = cso_blend->cso.logicop_func;
6170         }
6171
6172         entry.ColorClampRange = COLORCLAMP_RTFORMAT;
6173         entry.PreBlendColorClampEnable = true;
6174         entry.PostBlendColorClampEnable = true;
6175
6176         entry.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
6177         entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
6178         entry.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
6179         entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
6180
6181#if GFX_VER >= 8
6182         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
6183#else
6184         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
6185#endif
6186      }
6187   }
6188#if GFX_VER >= 8
6189   GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
6190#endif
6191#if GFX_VER < 7
6192      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6193         ptr.PointertoBLEND_STATE = blend_offset;
6194         ptr.BLEND_STATEChange = true;
6195      }
6196#else
6197      crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
6198         ptr.BlendStatePointer = blend_offset;
6199#if GFX_VER >= 8
6200         ptr.BlendStatePointerValid = true;
6201#endif
6202      }
6203#endif
6204   }
6205#endif
6206
6207   if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
6208      struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6209      UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6210      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6211      uint32_t cc_offset;
6212      void *cc_map =
6213         stream_state(batch,
6214                      sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
6215                      64, &cc_offset);
6216#if GFX_VER <= 5
6217      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6218#endif
6219      _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
6220         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6221         cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6222
6223#if GFX_VER <= 5
6224
6225         set_depth_stencil_bits(ice, &cc);
6226
6227         if (cso_blend->cso.logicop_enable) {
6228            if (can_emit_logic_op(ice)) {
6229               cc.LogicOpEnable = cso_blend->cso.logicop_enable;
6230               cc.LogicOpFunction = cso_blend->cso.logicop_func;
6231            }
6232         }
6233         cc.ColorDitherEnable = cso_blend->cso.dither;
6234
6235         cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
6236
6237         if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
6238            cc.AlphaTestEnable = cso->cso.alpha_enabled;
6239            cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
6240         }
6241         cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
6242         cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
6243#else
6244         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6245         cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6246
6247         cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
6248         cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6249         cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
6250         cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6251#endif
6252         cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
6253         cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6254      }
6255      ice->shaders.cc_offset = cc_offset;
6256#if GFX_VER >= 6
6257      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6258         ptr.ColorCalcStatePointer = cc_offset;
6259#if GFX_VER != 7
6260         ptr.ColorCalcStatePointerValid = true;
6261#endif
6262      }
6263#endif
6264   }
6265#if GFX_VER <= 5
6266   if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
6267      crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
6268         blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6269         blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6270         blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6271         blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6272      }
6273   }
6274#endif
6275   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6276      if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6277         continue;
6278
6279      struct crocus_shader_state *shs = &ice->state.shaders[stage];
6280      struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
6281
6282      if (!shader)
6283         continue;
6284
6285      if (shs->sysvals_need_upload)
6286         upload_sysvals(ice, stage);
6287
6288#if GFX_VER <= 5
6289      dirty |= CROCUS_DIRTY_GEN4_CURBE;
6290#endif
6291#if GFX_VER >= 7
6292      struct push_bos push_bos = {};
6293      setup_constant_buffers(ice, batch, stage, &push_bos);
6294
6295      emit_push_constant_packets(ice, batch, stage, &push_bos);
6296#endif
6297   }
6298
6299   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6300      if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6301         if (ice->shaders.prog[stage]) {
6302#if GFX_VER <= 6
6303            dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6304#endif
6305            crocus_populate_binding_table(ice, batch, stage, false);
6306            ice->shaders.prog[stage]->bind_bo_offset =
6307               crocus_upload_binding_table(ice, batch,
6308                                           ice->shaders.prog[stage]->surf_offset,
6309                                           ice->shaders.prog[stage]->bt.size_bytes);
6310
6311#if GFX_VER >= 7
6312            crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
6313               ptr._3DCommandSubOpcode = 38 + stage;
6314               ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
6315            }
6316#endif
6317#if GFX_VER == 6
6318         } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
6319            dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6320            crocus_populate_binding_table(ice, batch, stage, true);
6321            ice->shaders.ff_gs_prog->bind_bo_offset =
6322               crocus_upload_binding_table(ice, batch,
6323                                           ice->shaders.ff_gs_prog->surf_offset,
6324                                           ice->shaders.ff_gs_prog->bt.size_bytes);
6325#endif
6326         }
6327      }
6328   }
6329#if GFX_VER <= 6
6330   if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
6331      struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6332      if (gs == NULL)
6333         gs = ice->shaders.ff_gs_prog;
6334      crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
6335         ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
6336         ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
6337#if GFX_VER == 6
6338         ptr.VSBindingTableChange = true;
6339         ptr.PSBindingTableChange = true;
6340         ptr.GSBindingTableChange = gs ? true : false;
6341         ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
6342#endif
6343      }
6344   }
6345#endif
6346
6347   bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
6348   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6349      if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
6350          !ice->shaders.prog[stage])
6351         continue;
6352
6353      crocus_upload_sampler_states(ice, batch, stage);
6354
6355      sampler_updates = true;
6356
6357#if GFX_VER >= 7
6358      struct crocus_shader_state *shs = &ice->state.shaders[stage];
6359
6360      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
6361         ptr._3DCommandSubOpcode = 43 + stage;
6362         ptr.PointertoVSSamplerState = shs->sampler_offset;
6363      }
6364#endif
6365   }
6366
   /* Gen6 uses a single combined 3DSTATE_SAMPLER_STATE_POINTERS packet with
    * per-stage "change" bits; set each stage's bit only when that stage has
    * a program and its samplers (or the global pointer state) were dirtied.
    */
   if (sampler_updates) {
#if GFX_VER == 6
      struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
      struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
      struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
         if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
            ptr.VSSamplerStateChange = true;
            ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
         }
         if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
            ptr.GSSamplerStateChange = true;
            ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
         }
         if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
            ptr.PSSamplerStateChange = true;
            ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
         }
      }
#endif
   }
6394
#if GFX_VER >= 6
   /* 3DSTATE_MULTISAMPLE: pixel location, sample count, and (per-gen) the
    * standard sample position table for the bound framebuffer's sample count.
    */
   if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
      crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
         ms.PixelLocation =
            ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
         /* NumberofMultisamples is log2(samples); ffs(x) - 1 == log2 for
          * power-of-two sample counts.
          */
         if (ice->state.framebuffer.samples > 0)
            ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
#if GFX_VER == 6
         /* Gen6 always programs the 4x sample position pattern. */
         INTEL_SAMPLE_POS_4X(ms.Sample);
#elif GFX_VER == 7
         switch (ice->state.framebuffer.samples) {
         case 1:
            INTEL_SAMPLE_POS_1X(ms.Sample);
            break;
         case 2:
            INTEL_SAMPLE_POS_2X(ms.Sample);
            break;
         case 4:
            INTEL_SAMPLE_POS_4X(ms.Sample);
            break;
         case 8:
            INTEL_SAMPLE_POS_8X(ms.Sample);
            break;
         default:
            break;
         }
#endif
      }
   }

   /* 3DSTATE_SAMPLE_MASK: combined API sample mask for rasterization. */
   if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
         ms.SampleMask = determine_sample_mask(ice);
      }
   }
#endif
6431
#if GFX_VER >= 7
   /* 3DSTATE_PS (and on Gen8 3DSTATE_PS_EXTRA): program the pixel shader's
    * kernel pointers, dispatch widths, thread limits, and scratch space
    * whenever the FS changes.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
      struct brw_stage_prog_data *prog_data = shader->prog_data;
      struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;

      crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {

         /* Initialize the execution mask with VMask.  Otherwise, derivatives are
          * incorrect for subspans where some of the pixels are unlit.  We believe
          * the bit just didn't take effect in previous generations.
          */
         ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask;

         ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
         ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
         ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;

         /* GRF start registers and kernel entry points for each of the up to
          * three enabled dispatch widths (slots 0/1/2).
          */
         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);

         ps.KernelStartPointer0 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
         ps.KernelStartPointer1 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
         ps.KernelStartPointer2 = KSP(ice, shader) +
            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);

#if GFX_VERx10 == 75
         ps.SampleMask = determine_sample_mask(ice);
#endif
         // XXX: WABTPPrefetchDisable, see above, drop at C0
         ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
         ps.FloatingPointMode = prog_data->use_alt_mode;
#if GFX_VER >= 8
         ps.MaximumNumberofThreadsPerPSD =
            batch->screen->devinfo.max_threads_per_psd - 2;
#else
         ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
#endif

         ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;

#if GFX_VER < 8
         ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
         ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
         ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
#endif
         /* From the documentation for this packet:
          * "If the PS kernel does not need the Position XY Offsets to
          *  compute a Position Value, then this field should be programmed
          *  to POSOFFSET_NONE."
          *
          * "SW Recommendation: If the PS kernel needs the Position Offsets
          *  to compute a Position XY value, this field should match Position
          *  ZW Interpolation Mode to ensure a consistent position.xyzw
          *  computation."
          *
          * We only require XY sample offsets. So, this recommendation doesn't
          * look useful at the moment.  We might need this in future.
          */
         ps.PositionXYOffsetSelect =
            wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;

         if (wm_prog_data->base.total_scratch) {
            struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
            /* Encoded as a power of two: ffs(x) - 11 maps 1KB -> 0, 2KB -> 1,
             * etc.  Presumably total_scratch is a power of two >= 1KB here —
             * TODO confirm against crocus_get_scratch_space.
             */
            ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
            ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }
      }
#if GFX_VER == 8
      const struct shader_info *fs_info =
         crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
      crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
         psx.PixelShaderValid = true;
         psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
         psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
         psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
         psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
         psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
         psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;

         /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
         if (wm_prog_data->uses_sample_mask)
            psx.PixelShaderUsesInputCoverageMask = true;

         psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;

         /* The stricter cross-primitive coherency guarantees that the hardware
          * gives us with the "Accesses UAV" bit set for at least one shader stage
          * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
          * are redundant within the current image, atomic counter and SSBO GL
          * APIs, which all have very loose ordering and coherency requirements
          * and generally rely on the application to insert explicit barriers when
          * a shader invocation is expected to see the memory writes performed by
          * the invocations of some previous primitive.  Regardless of the value
          * of "UAV coherency required", the "Accesses UAV" bits will implicitly
          * cause an in most cases useless DC flush when the lowermost stage with
          * the bit set finishes execution.
          *
          * It would be nice to disable it, but in some cases we can't because on
          * Gfx8+ it also has an influence on rasterization via the PS UAV-only
          * signal (which could be set independently from the coherency mechanism
          * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
          * determine whether the hardware skips execution of the fragment shader
          * or not via the ThreadDispatchEnable signal.  However if we know that
          * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
          * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
          * difference so we may just disable it here.
          *
          * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
          * take into account KillPixels when no depth or stencil writes are
          * enabled.  In order for occlusion queries to work correctly with no
          * attachments, we need to force-enable here.
          *
          */
         if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
             !(has_writeable_rt(ice->state.cso_blend, fs_info)))
            psx.PixelShaderHasUAV = true;
      }
#endif
   }
#endif
6559
#if GFX_VER >= 7
   /* Gen7+ transform feedback: program the four SO buffer slots, re-emit the
    * pre-packed SO_DECL_LIST when it changed, and merge dynamic rasterizer
    * bits into the pre-packed 3DSTATE_STREAMOUT template.  When streamout is
    * inactive, a zeroed 3DSTATE_STREAMOUT disables the stage.
    */
   if (ice->state.streamout_active) {
      if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
         for (int i = 0; i < 4; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];

            /* Unbound slot: emit a mostly-zeroed SO_BUFFER to disable it. */
            if (!tgt) {
               crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
                  sob.SOBufferIndex = i;
                  sob.MOCS = crocus_mocs(NULL, &batch->screen->isl_dev);
               }
               continue;
            }
            struct crocus_resource *res = (void *) tgt->base.buffer;
            uint32_t start = tgt->base.buffer_offset;
#if GFX_VER < 8
            uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
#endif
            crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
               sob.SOBufferIndex = i;

               sob.SurfaceBaseAddress = rw_bo(res->bo, start);
               sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);
#if GFX_VER < 8
               sob.SurfacePitch = tgt->stride;
               sob.SurfaceEndAddress = rw_bo(res->bo, end);
#else
               sob.SOBufferEnable = true;
               sob.StreamOffsetWriteEnable = true;
               sob.StreamOutputBufferOffsetAddressEnable = true;

               sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
               sob.StreamOutputBufferOffsetAddress =
                  rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
               /* StreamOffset = 0xFFFFFFFF tells the hardware to restore the
                * write offset from the offset BO instead of resetting it.
                */
               if (tgt->zero_offset) {
                  sob.StreamOffset = 0;
                  tgt->zero_offset = false;
               } else
                  sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
#endif
            }
         }
      }

      if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
         /* The SO_DECL_LIST was packed right after the STREAMOUT template;
          * its size in DWords is the header's length field (low byte) plus
          * the 2-DWord length bias.
          */
         uint32_t *decl_list =
            ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
         crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
      }

      if (dirty & CROCUS_DIRTY_STREAMOUT) {
         const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

         /* Pack only the dynamic bits, then OR-merge with the pre-packed
          * template stored in ice->state.streamout.
          */
         uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
         crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
            sol.SOFunctionEnable = true;
            sol.SOStatisticsEnable = true;

            /* Keep rendering if a PIPELINE_STATISTICS-style "primitives
             * generated" query is active even with rasterizer discard.
             */
            sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
                                   !ice->state.prims_generated_query_active;
            sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
         }

         assert(ice->state.streamout);

         crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
                         GENX(3DSTATE_STREAMOUT_length));
      }
   } else {
      if (dirty & CROCUS_DIRTY_STREAMOUT) {
         crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
      }
   }
#endif
#if GFX_VER == 6
   /* Gen6 streamout: re-emit the streamed vertex buffer index (SVBI) state
    * when dirty; the ad-hoc transform-feedback GS consumes it.
    */
   if (ice->state.streamout_active) {
      if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
         crocus_emit_so_svbi(ice);
      }
   }
#endif
6642
   /* Clip stage.  Gen4/5: stream out a full pipelined CLIP_STATE structure
    * (and mark PIPELINED_POINTERS dirty so it gets re-pointed).  Gen6+: pack
    * only the dynamic 3DSTATE_CLIP bits and merge with the rasterizer CSO's
    * pre-packed clip DWords.
    */
   if (dirty & CROCUS_DIRTY_CLIP) {
#if GFX_VER < 6
      const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data;
      struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;

      uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
      _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
         clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
         clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
         clip.SingleProgramFlow = true;
         clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;

         clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
         clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;

         clip.DispatchGRFStartRegisterForURBData = 1;
         clip.VertexURBEntryReadOffset = 0;
         clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;

         clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
         clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;

         if (batch->ice->urb.nr_clip_entries >= 10) {
            /* Half of the URB entries go to each thread, and it has to be an
             * even number.
             */
            assert(batch->ice->urb.nr_clip_entries % 2 == 0);

            /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
             * only 2 threads can output VUEs at a time.
             */
            clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
         } else {
            assert(batch->ice->urb.nr_clip_entries >= 5);
            clip.MaximumNumberofThreads = 1 - 1;
         }
         clip.VertexPositionSpace = VPOS_NDCSPACE;
         clip.UserClipFlagsMustClipEnable = true;
         clip.GuardbandClipTestEnable = true;

         clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
         clip.ScreenSpaceViewportXMin = -1.0;
         clip.ScreenSpaceViewportXMax = 1.0;
         clip.ScreenSpaceViewportYMin = -1.0;
         clip.ScreenSpaceViewportYMax = 1.0;
         clip.ViewportXYClipTestEnable = true;
         clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);

#if GFX_VER == 5 || GFX_VERx10 == 45
         clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
#else
         /* Up to 6 actual clip flags, plus the 7th for the negative RHW
          * workaround.
          */
         clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
#endif

         clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
         /* NOTE(review): GuardbandClipTestEnable is already set above — this
          * second assignment is redundant but harmless.
          */
         clip.GuardbandClipTestEnable = true;

         clip.ClipMode = clip_prog_data->clip_mode;
#if GFX_VERx10 == 45
         clip.NegativeWClipTestEnable = true;
#endif
      }

#else //if GFX_VER >= 6
      struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
      const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* With a GS or TES bound, the rasterized topology comes from that
       * stage's output rather than the input primitive type.
       */
      bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
                       ice->shaders.prog[MESA_SHADER_TESS_EVAL];
      bool points_or_lines = cso_rast->fill_mode_point_or_line ||
         (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
                    : ice->state.prim_is_points_or_lines);
      uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
      crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
         cl.StatisticsEnable = ice->state.statistics_counters_enabled;
         if (cso_rast->cso.rasterizer_discard)
            cl.ClipMode = CLIPMODE_REJECT_ALL;
         else if (ice->state.window_space_position)
            cl.ClipMode = CLIPMODE_ACCEPT_ALL;
         else
            cl.ClipMode = CLIPMODE_NORMAL;

         cl.PerspectiveDivideDisable = ice->state.window_space_position;
         cl.ViewportXYClipTestEnable = !points_or_lines;

         cl.UserClipDistanceCullTestEnableBitmask =
            brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;

         cl.NonPerspectiveBarycentricEnable = wm_prog_data->uses_nonperspective_interp_modes;

         cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
         cl.MaximumVPIndex = ice->state.num_viewports - 1;
      }
      crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
                      ARRAY_SIZE(cso_rast->clip));
#endif
   }
6744
   /* Vertex shader stage.  Gen6+: emit 3DSTATE_VS directly (Gen6 also
    * uploads VS push constants first).  Gen4/5: stream out a pipelined
    * VS_STATE structure and mark PIPELINED_POINTERS dirty.
    */
   if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
      const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
      /* prog_data is presumably consumed by INIT_THREAD_DISPATCH_FIELDS
       * below — TODO confirm against the macro definition.
       */
      const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
#if GFX_VER == 7
      /* IVB requires a pipeline flush before certain VS state changes. */
      if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
         gen7_emit_vs_workaround_flush(batch);
#endif


#if GFX_VER == 6
      struct push_bos push_bos = {};
      setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);

      emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
#endif
#if GFX_VER >= 6
      crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
#else
      uint32_t *vs_ptr = stream_state(batch,
                                      GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
      _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
#endif
      {
         INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);

         vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;

#if GFX_VER < 6
         vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
         vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
         vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;

         vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
         vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;

         /* NOTE(review): this overwrites the MaximumNumberofThreads value
          * assigned above on Gen<6, clamping by available URB entries.
          */
         vs.MaximumNumberofThreads =
            CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
         vs.StatisticsEnable = false;
         vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
#endif
#if GFX_VER == 5
         /* Force single program flow on Ironlake.  We cannot reliably get
          * all applications working without it.  See:
          * https://bugs.freedesktop.org/show_bug.cgi?id=29172
          *
          * The most notable and reliably failing application is the Humus
          * demo "CelShading"
          */
         vs.SingleProgramFlow = true;
         vs.SamplerCount = 0; /* hardware requirement */

#endif
#if GFX_VER >= 8
         vs.SIMD8DispatchEnable =
            vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;

         vs.UserClipDistanceCullTestEnableBitmask =
            vue_prog_data->cull_distance_mask;
#endif
      }

#if GFX_VER == 6
      /* Flush after the Gen6 VS push-constant upload above. */
      crocus_emit_pipe_control_flush(batch,
                                     "post VS const",
                                     PIPE_CONTROL_DEPTH_STALL |
                                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                     PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
   }
6816
   /* Geometry shader stage.  Three cases inside one packet:
    *   - an API GS is bound (Gen6+ only, "active"),
    *   - no API GS but a fixed-function GS program exists (Gen4-6; on Gen6
    *     this implements VS transform feedback),
    *   - no GS at all (emit a disabled/pass-through GS).
    * Gen6+: 3DSTATE_GS command; Gen4/5: pipelined GS_STATE structure.
    */
   if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
      bool active = GFX_VER >= 6 && shader;
#if GFX_VER == 6
      struct push_bos push_bos = {};
      if (shader)
         setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);

      emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
#endif
#if GFX_VERx10 == 70
   /**
    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
    *     whole fixed function pipeline when the GS enable changes value in
    *     the 3DSTATE_GS."
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * Stall" bit set.
    */
   if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
      gen7_emit_cs_stall_flush(batch);
#endif
#if GFX_VER >= 6
      crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
#else
      uint32_t *gs_ptr = stream_state(batch,
                                      GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
      _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
#endif
     {
#if GFX_VER >= 6
         if (active) {
            const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data);
            const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
            const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base;

            INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
#if GFX_VER >= 7
            gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
            gs.OutputTopology = gs_prog_data->output_topology;
            gs.ControlDataHeaderSize =
               gs_prog_data->control_data_header_size_hwords;

            gs.InstanceControl = gs_prog_data->invocations - 1;
            gs.DispatchMode = vue_prog_data->dispatch_mode;

            gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;

            gs.ControlDataFormat = gs_prog_data->control_data_format;
#endif

            /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
             * Ivy Bridge and Haswell.
             *
             * On Ivy Bridge, setting this bit causes the vertices of a triangle
             * strip to be delivered to the geometry shader in an order that does
             * not strictly follow the OpenGL spec, but preserves triangle
             * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
             * the geometry shader sees triangles:
             *
             * (1, 2, 3), (2, 4, 3), (3, 4, 5)
             *
             * (Clearing the bit is even worse, because it fails to preserve
             * orientation).
             *
             * Triangle strips with adjacency are always ordered in a way that
             * preserves triangle orientation but does not strictly follow the
             * OpenGL spec, regardless of the setting of this bit.
             *
             * On Haswell, both triangle strips and triangle strips with adjacency
             * are always ordered in a way that preserves triangle orientation.
             * Setting this bit causes the ordering to strictly follow the OpenGL
             * spec.
             *
             * So in either case we want to set the bit.  Unfortunately on Ivy
             * Bridge this will get the order close to correct but not perfect.
             */
            gs.ReorderMode = TRAILING;
            gs.MaximumNumberofThreads =
               GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
               (batch->screen->devinfo.max_gs_threads - 1);
#if GFX_VER < 7
            gs.SOStatisticsEnable = true;
            if (gs_prog_data->num_transform_feedback_bindings)
               gs.SVBIPayloadEnable = ice->state.streamout_active;

            /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
             * was previously done for gen6.
             *
             * TODO: test with both disabled to see if the HW is behaving
             * as expected, like in gen7.
             */
            gs.SingleProgramFlow = true;
            gs.VectorMaskEnable = true;
#endif
#if GFX_VER >= 8
            gs.ExpectedVertexCount = gs_prog_data->vertices_in;

            if (gs_prog_data->static_vertex_count != -1) {
               gs.StaticOutput = true;
               gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
            }
            gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;

            gs.UserClipDistanceCullTestEnableBitmask =
               vue_prog_data->cull_distance_mask;

            const int urb_entry_write_offset = 1;
            const uint32_t urb_entry_output_length =
               DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
               urb_entry_write_offset;

            gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
            gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
#endif
         }
#endif
#if GFX_VER <= 6
         if (!active && ice->shaders.ff_gs_prog) {
            const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
            /* In gen6, transform feedback for the VS stage is done with an
             * ad-hoc GS program. This function provides the needed 3DSTATE_GS
             * for this.
             */
            gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
            gs.SingleProgramFlow = true;
            gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
            gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;

#if GFX_VER <= 5
            gs.GRFRegisterCount =
               DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
            /* BRW_NEW_URB_FENCE */
            gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
            gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
            gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
            gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
#else
            gs.Enable = true;
            gs.VectorMaskEnable = true;
            gs.SVBIPayloadEnable = true;
            gs.SVBIPostIncrementEnable = true;
            gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
            gs.SOStatisticsEnable = true;
            gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
#endif
         }
#endif
         /* No GS of any kind: program a pass-through configuration. */
         if (!active && !ice->shaders.ff_gs_prog) {
#if GFX_VER < 8
            gs.DispatchGRFStartRegisterForURBData = 1;
#if GFX_VER >= 7
            gs.IncludeVertexHandles = true;
#endif
#endif
         }
#if GFX_VER >= 6
         gs.StatisticsEnable = true;
#endif
#if GFX_VER == 5 || GFX_VER == 6
         gs.RenderingEnabled = true;
#endif
#if GFX_VER <= 5
         gs.MaximumVPIndex = ice->state.num_viewports - 1;
#endif
      }
      /* Remember the enable state so the IVB:GT2 workaround above can detect
       * transitions on the next upload.
       */
      ice->state.gs_enabled = active;
   }
6990
#if GFX_VER >= 7
   /* Tessellation stages (Gen7+ only).  When no TCS/TES is bound, emit
    * zeroed 3DSTATE_HS / 3DSTATE_TE / 3DSTATE_DS packets to disable them.
    */
   if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];

      if (shader) {
         const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data);
         /* vue_prog_data/prog_data are presumably consumed by
          * INIT_THREAD_DISPATCH_FIELDS — TODO confirm against the macro.
          */
         const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
         const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base;

         crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
            INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
            hs.InstanceCount = tcs_prog_data->instances - 1;
            hs.IncludeVertexHandles = true;
            hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
         }
      } else {
         crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
      }

   }

   if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
      if (shader) {
         const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data);
         const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
         const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base;

         crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
            te.Partitioning = tes_prog_data->partitioning;
            te.OutputTopology = tes_prog_data->output_topology;
            te.TEDomain = tes_prog_data->domain;
            te.TEEnable = true;
            te.MaximumTessellationFactorOdd = 63.0;
            te.MaximumTessellationFactorNotOdd = 64.0;
         };
         crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
            INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);

            ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
            /* The W coordinate only exists for the triangular domain. */
            ds.ComputeWCoordinateEnable =
               tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;

#if GFX_VER >= 8
            if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
               ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
            ds.UserClipDistanceCullTestEnableBitmask =
               vue_prog_data->cull_distance_mask;
#endif
         };
      } else {
         crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
         crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
      }
   }
#endif
7047   if (dirty & CROCUS_DIRTY_RASTER) {
7048
7049#if GFX_VER < 6
7050      const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data;
7051      struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
7052      uint32_t *sf_ptr = stream_state(batch,
7053                                      GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
7054      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7055      _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
7056         sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
7057         sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7058         sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
7059         sf.DispatchGRFStartRegisterForURBData = 3;
7060         sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
7061         sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
7062         sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
7063         sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
7064         sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7065
7066         sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
7067
7068         sf.MaximumNumberofThreads =
7069            MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
7070
7071         sf.SpritePointEnable = cso_state->point_quad_rasterization;
7072         sf.DestinationOriginHorizontalBias = 0.5;
7073         sf.DestinationOriginVerticalBias = 0.5;
7074
7075	 sf.LineEndCapAntialiasingRegionWidth =
7076            cso_state->line_smooth ? _10pixels : _05pixels;
7077         sf.LastPixelEnable = cso_state->line_last_pixel;
7078         sf.AntialiasingEnable = cso_state->line_smooth;
7079
7080         sf.LineWidth = get_line_width(cso_state);
7081         sf.PointWidth = cso_state->point_size;
7082         sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
7083#if GFX_VERx10 >= 45
7084         sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
7085#endif
7086         sf.ViewportTransformEnable = true;
7087         sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
7088         sf.ScissorRectangleEnable = true;
7089         sf.CullMode = translate_cull_mode(cso_state->cull_face);
7090
7091         if (cso_state->flatshade_first) {
7092            sf.TriangleFanProvokingVertexSelect = 1;
7093         } else {
7094            sf.TriangleStripListProvokingVertexSelect = 2;
7095            sf.TriangleFanProvokingVertexSelect = 2;
7096            sf.LineStripListProvokingVertexSelect = 1;
7097         }
7098      }
7099#else
7100      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7101      uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7102      crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7103         sf.ViewportTransformEnable = !ice->state.window_space_position;
7104
7105#if GFX_VER == 6
7106         const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7107         uint32_t urb_entry_read_length;
7108         uint32_t urb_entry_read_offset;
7109         uint32_t point_sprite_enables;
7110         calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
7111                                  &urb_entry_read_length,
7112                                  &urb_entry_read_offset);
7113         sf.VertexURBEntryReadLength = urb_entry_read_length;
7114         sf.VertexURBEntryReadOffset = urb_entry_read_offset;
7115         sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
7116         sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
7117         sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7118#endif
7119
7120#if GFX_VER >= 6 && GFX_VER < 8
7121         if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
7122            sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7123#endif
7124#if GFX_VER == 7
7125         if (ice->state.framebuffer.zsbuf) {
7126            struct crocus_resource *zres, *sres;
7127               crocus_get_depth_stencil_resources(&batch->screen->devinfo,
7128                                                  ice->state.framebuffer.zsbuf->texture,
7129                                                  &zres, &sres);
7130            /* ANV thinks that the stencil-ness doesn't matter, this is just
7131             * about handling polygon offset scaling.
7132             */
7133            sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
7134         }
7135#endif
7136      }
7137      crocus_emit_merge(batch, cso->sf, dynamic_sf,
7138                      ARRAY_SIZE(dynamic_sf));
7139#if GFX_VER == 8
7140      crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
7141#endif
7142#endif
7143   }
7144
7145   if (dirty & CROCUS_DIRTY_WM) {
7146      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7147      const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7148      UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
7149      UNUSED const struct shader_info *fs_info =
7150         crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7151
7152#if GFX_VER == 6
7153      struct push_bos push_bos = {};
7154      setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7155
7156      emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7157#endif
7158#if GFX_VER >= 6
7159      crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
7160#else
7161      uint32_t *wm_ptr = stream_state(batch,
7162                                      GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
7163
7164      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7165
7166      _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
7167#endif
7168     {
7169#if GFX_VER <= 6
7170         wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
7171         wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
7172         wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
7173#endif
7174#if GFX_VER == 4
7175      /* On gen4, we only have one shader kernel */
7176         if (brw_wm_state_has_ksp(wm, 0)) {
7177            wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
7178            wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7179            wm.DispatchGRFStartRegisterForConstantSetupData0 =
7180               wm_prog_data->base.dispatch_grf_start_reg;
7181         }
7182#elif GFX_VER == 5
7183         wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7184            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7185         wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7186            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7187         wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7188            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7189
7190         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7191         wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
7192         wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
7193
7194         wm.DispatchGRFStartRegisterForConstantSetupData0 =
7195            wm_prog_data->base.dispatch_grf_start_reg;
7196#elif GFX_VER == 6
7197         wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7198            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7199         wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7200            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7201         wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7202            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7203
7204         wm.DispatchGRFStartRegisterForConstantSetupData0 =
7205           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
7206         wm.DispatchGRFStartRegisterForConstantSetupData1 =
7207           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
7208         wm.DispatchGRFStartRegisterForConstantSetupData2 =
7209           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
7210#endif
7211#if GFX_VER <= 5
7212         wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
7213         wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
7214         wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
7215         wm.SetupURBEntryReadOffset = 0;
7216         wm.EarlyDepthTestEnable = true;
7217         wm.LineAntialiasingRegionWidth = _05pixels;
7218         wm.LineEndCapAntialiasingRegionWidth = _10pixels;
7219         wm.DepthCoefficientURBReadOffset = 1;
7220
7221         if (cso->cso.offset_tri) {
7222            wm.GlobalDepthOffsetEnable = true;
7223
         /* Something weird is going on with legacy_global_depth_bias,
          * offset_constant, scaling and MRD.  This value passes glean
          * but gives some odd results elsewhere (e.g. the
          * quad-offset-units test).
          */
7229            wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
7230            wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
7231         }
7232         wm.SamplerStatePointer = ro_bo(batch->state.bo,
7233                                        ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
7234#endif
7235
7236         wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
7237            ice->state.statistics_counters_enabled : 0;
7238
7239#if GFX_VER >= 6
7240         wm.LineAntialiasingRegionWidth = _10pixels;
7241         wm.LineEndCapAntialiasingRegionWidth = _05pixels;
7242
7243         wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7244         wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
7245#endif
7246#if GFX_VER == 6
7247      wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
7248         ice->state.cso_blend->dual_color_blending;
7249      wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
7250      wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7251
7252      /* From the SNB PRM, volume 2 part 1, page 281:
7253       * "If the PS kernel does not need the Position XY Offsets
7254       * to compute a Position XY value, then this field should be
7255       * programmed to POSOFFSET_NONE."
7256       *
7257       * "SW Recommendation: If the PS kernel needs the Position Offsets
7258       * to compute a Position XY value, this field should match Position
7259       * ZW Interpolation Mode to ensure a consistent position.xyzw
7260       * computation."
7261       * We only require XY sample offsets. So, this recommendation doesn't
7262       * look useful at the moment. We might need this in future.
7263       */
7264      if (wm_prog_data->uses_pos_offset)
7265         wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
7266      else
7267         wm.PositionXYOffsetSelect = POSOFFSET_NONE;
7268#endif
7269         wm.LineStippleEnable = cso->cso.line_stipple_enable;
7270         wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
7271
7272#if GFX_VER < 7
7273         if (wm_prog_data->base.use_alt_mode)
7274            wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7275         wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
7276         wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
7277#endif
7278
7279#if GFX_VER < 8
7280#if GFX_VER >= 6
7281         wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
7282
7283         struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7284         if (fb->samples > 1) {
7285            if (cso->cso.multisample)
7286               wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7287            else
7288               wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7289
7290            if (wm_prog_data->persample_dispatch)
7291               wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7292            else
7293               wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
7294         } else {
7295            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7296            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7297         }
7298#endif
7299
7300         wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
7301
7302         if (wm_prog_data->uses_kill ||
7303             ice->state.cso_zsa->cso.alpha_enabled ||
7304             ice->state.cso_blend->cso.alpha_to_coverage ||
7305             (GFX_VER >= 6 && wm_prog_data->uses_omask))
7306            wm.PixelShaderKillsPixel = true;
7307
7308         if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
7309             writes_depth || wm.PixelShaderKillsPixel ||
7310             (GFX_VER >= 6 && wm_prog_data->has_side_effects))
7311            wm.ThreadDispatchEnable = true;
7312
7313#if GFX_VER >= 7
7314         wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
7315         wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
7316#else
7317         if (wm_prog_data->base.total_scratch) {
7318            struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
7319                                                            MESA_SHADER_FRAGMENT);
7320            wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
7321            wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
7322         }
7323
7324         wm.PixelShaderComputedDepth = writes_depth;
7325
7326#endif
7327         /* The "UAV access enable" bits are unnecessary on HSW because they only
7328          * seem to have an effect on the HW-assisted coherency mechanism which we
7329          * don't need, and the rasterization-related UAV_ONLY flag and the
7330          * DISPATCH_ENABLE bit can be set independently from it.
7331          * C.f. gen8_upload_ps_extra().
7332          *
7333          * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
7334          * _NEW_COLOR
7335          */
7336#if GFX_VERx10 == 75
7337         if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
7338             wm_prog_data->has_side_effects)
7339            wm.PSUAVonly = ON;
7340#endif
7341#endif
7342#if GFX_VER >= 7
7343      /* BRW_NEW_FS_PROG_DATA */
7344         if (wm_prog_data->early_fragment_tests)
7345           wm.EarlyDepthStencilControl = EDSC_PREPS;
7346         else if (wm_prog_data->has_side_effects)
7347           wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7348#endif
7349#if GFX_VER == 8
7350         /* We could skip this bit if color writes are enabled. */
7351         if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
7352            wm.ForceThreadDispatchEnable = ForceON;
7353#endif
7354      };
7355
7356#if GFX_VER <= 5
7357      if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
7358         crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
7359            clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
7360         }
7361         ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
7362      }
7363#endif
7364   }
7365
7366#if GFX_VER >= 7
7367   if (dirty & CROCUS_DIRTY_GEN7_SBE) {
7368      crocus_emit_sbe(batch, ice);
7369   }
7370#endif
7371
7372#if GFX_VER >= 8
7373   if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
7374      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
7375      struct crocus_blend_state *cso_blend = ice->state.cso_blend;
7376      struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7377      struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
7378      const struct shader_info *fs_info =
7379         crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7380      uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7381      crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7382         pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7383         pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
7384         pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7385            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
7386      }
7387      crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7388                        ARRAY_SIZE(cso_blend->ps_blend));
7389   }
7390#endif
7391
7392#if GFX_VER >= 6
7393   if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
7394
7395#if GFX_VER >= 8
7396      crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
7397         set_depth_stencil_bits(ice, &wmds);
7398      }
7399#else
7400      uint32_t ds_offset;
7401      void *ds_map = stream_state(batch,
7402                                  sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
7403                                  64, &ds_offset);
7404      _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
7405         set_depth_stencil_bits(ice, &ds);
7406      }
7407
7408#if GFX_VER == 6
7409      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7410         ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7411         ptr.DEPTH_STENCIL_STATEChange = true;
7412      }
7413#else
7414      crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
7415         ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7416      }
7417#endif
7418#endif
7419   }
7420
7421   if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
7422      /* Align to 64-byte boundary as per anv. */
7423      uint32_t scissor_offset;
7424      struct pipe_scissor_state *scissor_map = (void *)
7425         stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
7426                      64, &scissor_offset);
7427      for (int i = 0; i < ice->state.num_viewports; i++) {
7428         struct pipe_scissor_state scissor;
7429         crocus_fill_scissor_rect(ice, i, &scissor);
7430         scissor_map[i] = scissor;
7431      }
7432
7433      crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7434         ptr.ScissorRectPointer = scissor_offset;
7435      }
7436   }
7437#endif
7438
7439   if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
7440      struct isl_device *isl_dev = &batch->screen->isl_dev;
7441#if GFX_VER >= 6
7442      crocus_emit_depth_stall_flushes(batch);
7443#endif
7444      void *batch_ptr;
7445      struct crocus_resource *zres, *sres;
7446      struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
7447      batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
7448
7449      struct isl_view view = {
7450                              .base_level = 0,
7451                              .levels = 1,
7452                              .base_array_layer = 0,
7453                              .array_len = 1,
7454                              .swizzle = ISL_SWIZZLE_IDENTITY,
7455      };
7456      struct isl_depth_stencil_hiz_emit_info info = {
7457         .view = &view,
7458         .mocs = crocus_mocs(NULL, isl_dev),
7459      };
7460
7461      if (cso->zsbuf) {
7462         crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
7463         struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
7464         if (zsbuf->align_res) {
7465            zres = (struct crocus_resource *)zsbuf->align_res;
7466         }
7467         view.base_level = cso->zsbuf->u.tex.level;
7468         view.base_array_layer = cso->zsbuf->u.tex.first_layer;
7469         view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
7470
7471         if (zres) {
7472            view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
7473
7474            info.depth_surf = &zres->surf;
7475            info.depth_address = crocus_command_reloc(batch,
7476                                                      (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
7477                                                      zres->bo, 0, RELOC_32BIT);
7478
7479            info.mocs = crocus_mocs(zres->bo, isl_dev);
7480            view.format = zres->surf.format;
7481
7482            if (crocus_resource_level_has_hiz(zres, view.base_level)) {
7483               info.hiz_usage = zres->aux.usage;
7484               info.hiz_surf = &zres->aux.surf;
7485               uint64_t hiz_offset = 0;
7486
7487#if GFX_VER == 6
7488               /* HiZ surfaces on Sandy Bridge technically don't support
7489                * mip-mapping.  However, we can fake it by offsetting to the
7490                * first slice of LOD0 in the HiZ surface.
7491                */
7492               isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
7493                                                   view.base_level, 0, 0,
7494                                                   &hiz_offset, NULL, NULL);
7495#endif
7496               info.hiz_address = crocus_command_reloc(batch,
7497                                                       (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
7498                                                       zres->aux.bo, zres->aux.offset + hiz_offset,
7499                                                       RELOC_32BIT);
7500               info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
7501            }
7502         }
7503
7504#if GFX_VER >= 6
7505         if (sres) {
7506            view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
7507            info.stencil_aux_usage = sres->aux.usage;
7508            info.stencil_surf = &sres->surf;
7509
7510            uint64_t stencil_offset = 0;
7511#if GFX_VER == 6
7512            /* Stencil surfaces on Sandy Bridge technically don't support
7513             * mip-mapping.  However, we can fake it by offsetting to the
7514             * first slice of LOD0 in the stencil surface.
7515             */
7516            isl_surf_get_image_offset_B_tile_sa(&sres->surf,
7517                                                view.base_level, 0, 0,
7518                                                &stencil_offset, NULL, NULL);
7519#endif
7520
7521            info.stencil_address = crocus_command_reloc(batch,
7522                                                        (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
7523                                                        sres->bo, stencil_offset, RELOC_32BIT);
7524            if (!zres) {
7525               view.format = sres->surf.format;
7526               info.mocs = crocus_mocs(sres->bo, isl_dev);
7527            }
7528         }
7529#endif
7530      }
7531      isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
7532   }
7533
7534   /* TODO: Disable emitting this until something uses a stipple. */
7535   if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
7536      crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7537         for (int i = 0; i < 32; i++) {
7538            poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7539         }
7540      }
7541   }
7542
7543   if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
7544      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7545      crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7546   }
7547
7548#if GFX_VER >= 8
7549   if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
7550      crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7551         topo.PrimitiveTopologyType =
7552            translate_prim_type(draw->mode, ice->state.patch_vertices);
7553      }
7554   }
7555#endif
7556
7557#if GFX_VER <= 5
7558   if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
7559      upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
7560                                      ice->shaders.vs_offset, ice->shaders.sf_offset,
7561                                      ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
7562      crocus_upload_urb_fence(batch);
7563
7564      crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
7565        cs.NumberofURBEntries = ice->urb.nr_cs_entries;
7566        cs.URBEntryAllocationSize = ice->urb.csize - 1;
7567      }
7568      dirty |= CROCUS_DIRTY_GEN4_CURBE;
7569   }
7570#endif
7571   if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
7572      struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7573      if (fb->width && fb->height) {
7574         crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
7575            rect.ClippedDrawingRectangleXMax = fb->width - 1;
7576            rect.ClippedDrawingRectangleYMax = fb->height - 1;
7577         }
7578      }
7579   }
7580
7581   if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
7582      const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
7583      const uint32_t count = user_count +
7584         ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
7585      uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
7586
7587      if (count) {
7588         const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7589
7590         uint32_t *map =
7591            crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
7592         _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7593            vb.DWordLength = (vb_dwords * count + 1) - 2;
7594         }
7595         map += 1;
7596
7597         uint32_t bound = dynamic_bound;
7598         int i;
7599         while (bound) {
7600            i = u_bit_scan(&bound);
7601            struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
7602            struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
7603            uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
7604
7605            emit_vertex_buffer_state(batch, i, bo,
7606                                     buf->buffer_offset,
7607                                     ice->state.vb_end[i],
7608                                     buf->stride,
7609                                     step_rate,
7610                                     &map);
7611         }
7612         i = user_count;
7613         if (ice->state.vs_uses_draw_params) {
7614            struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
7615            emit_vertex_buffer_state(batch, i++,
7616                                     res->bo,
7617                                     ice->draw.draw_params.offset,
7618                                     ice->draw.draw_params.res->width0,
7619                                     0, 0, &map);
7620         }
7621         if (ice->state.vs_uses_derived_draw_params) {
7622            struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
7623            emit_vertex_buffer_state(batch, i++,
7624                                     res->bo,
7625                                     ice->draw.derived_draw_params.offset,
7626                                     ice->draw.derived_draw_params.res->width0,
7627                                     0, 0, &map);
7628         }
7629      }
7630   }
7631
7632   if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7633      struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7634      const unsigned entries = MAX2(cso->count, 1);
7635      if (!(ice->state.vs_needs_sgvs_element ||
7636            ice->state.vs_uses_derived_draw_params ||
7637            ice->state.vs_needs_edge_flag)) {
7638         crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7639                         (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7640      } else {
7641         uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7642         const unsigned dyn_count = cso->count +
7643            ice->state.vs_needs_sgvs_element +
7644            ice->state.vs_uses_derived_draw_params;
7645
7646         crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7647                           &dynamic_ves, ve) {
7648            ve.DWordLength =
7649               1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7650         }
7651         memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7652                (cso->count - ice->state.vs_needs_edge_flag) *
7653                GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7654         uint32_t *ve_pack_dest =
7655            &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7656                         GENX(VERTEX_ELEMENT_STATE_length)];
7657
7658         if (ice->state.vs_needs_sgvs_element) {
7659            uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7660                                 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7661            crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7662               ve.Valid = true;
7663               ve.VertexBufferIndex =
7664                  util_bitcount64(ice->state.bound_vertex_buffers);
7665               ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7666               ve.Component0Control = base_ctrl;
7667               ve.Component1Control = base_ctrl;
7668#if GFX_VER < 8
7669               ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7670               ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7671#else
7672               ve.Component2Control = VFCOMP_STORE_0;
7673               ve.Component3Control = VFCOMP_STORE_0;
7674#endif
7675#if GFX_VER < 5
7676               ve.DestinationElementOffset = cso->count * 4;
7677#endif
7678            }
7679            ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7680         }
7681         if (ice->state.vs_uses_derived_draw_params) {
7682            crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7683               ve.Valid = true;
7684               ve.VertexBufferIndex =
7685                  util_bitcount64(ice->state.bound_vertex_buffers) +
7686                  ice->state.vs_uses_draw_params;
7687               ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7688               ve.Component0Control = VFCOMP_STORE_SRC;
7689               ve.Component1Control = VFCOMP_STORE_SRC;
7690               ve.Component2Control = VFCOMP_STORE_0;
7691               ve.Component3Control = VFCOMP_STORE_0;
7692#if GFX_VER < 5
7693               ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7694#endif
7695            }
7696            ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7697         }
7698         if (ice->state.vs_needs_edge_flag) {
7699            for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
7700               ve_pack_dest[i] = cso->edgeflag_ve[i];
7701         }
7702
7703         crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7704                         (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7705      }
7706
7707#if GFX_VER == 8
7708      if (!ice->state.vs_needs_edge_flag) {
7709         crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7710                         entries * GENX(3DSTATE_VF_INSTANCING_length));
7711      } else {
7712         assert(cso->count > 0);
7713         const unsigned edgeflag_index = cso->count - 1;
7714         uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7715         memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7716                GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7717
7718         uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7719            edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7720         crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7721            vi.VertexElementIndex = edgeflag_index +
7722               ice->state.vs_needs_sgvs_element +
7723               ice->state.vs_uses_derived_draw_params;
7724         }
7725         for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length);  i++)
7726            vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7727
7728         crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7729                         entries * GENX(3DSTATE_VF_INSTANCING_length));
7730      }
7731#endif
7732   }
7733
7734#if GFX_VER == 8
7735   if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7736      const struct brw_vs_prog_data *vs_prog_data = (void *)
7737         ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7738      struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7739
7740      crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7741         if (vs_prog_data->uses_vertexid) {
7742            sgv.VertexIDEnable = true;
7743            sgv.VertexIDComponentNumber = 2;
7744            sgv.VertexIDElementOffset =
7745               cso->count - ice->state.vs_needs_edge_flag;
7746         }
7747
7748         if (vs_prog_data->uses_instanceid) {
7749            sgv.InstanceIDEnable = true;
7750            sgv.InstanceIDComponentNumber = 3;
7751            sgv.InstanceIDElementOffset =
7752               cso->count - ice->state.vs_needs_edge_flag;
7753         }
7754      }
7755   }
7756#endif
7757#if GFX_VERx10 >= 75
7758   if (dirty & CROCUS_DIRTY_GEN75_VF) {
7759      crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7760         if (draw->primitive_restart) {
7761            vf.IndexedDrawCutIndexEnable = true;
7762            vf.CutIndex = draw->restart_index;
7763         }
7764      }
7765   }
7766#endif
7767
7768#if GFX_VER == 8
7769   if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7770      bool enable = want_pma_fix(ice);
7771      genX(crocus_update_pma_fix)(ice, batch, enable);
7772   }
7773#endif
7774
7775#if GFX_VER <= 5
7776   if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7777      gen4_upload_curbe(batch);
7778   }
7779#endif
7780}
7781
/**
 * Upload render state and emit a draw.
 *
 * Flushes dirty render state, (re-)emits 3DSTATE_INDEX_BUFFER when needed,
 * programs the _3DPRIM_* command-streamer registers for indirect draws
 * (including MI_PREDICATE handling for GPU-side draw counts on Gen7+),
 * and finally emits the 3DPRIMITIVE command itself.
 */
static void
crocus_upload_render_state(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           const struct pipe_draw_info *draw,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
                           const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif

   /* State upload must not be split by a batch wrap. */
   batch->no_wrap = true;
   batch->contains_draw = true;

   crocus_update_surface_base_address(batch);

   crocus_upload_dirty_render_state(ice, batch, draw);

   batch->no_wrap = false;
   if (draw->index_size > 0) {
      unsigned offset;
      unsigned size;
      bool emit_index = false;

      if (draw->has_user_indices) {
         /* Copy user-memory indices into a GPU buffer.  The returned
          * offset is rewound by start_offset so that sc->start indexes
          * the GPU copy exactly as it indexed the user array.
          */
         unsigned start_offset = draw->index_size * sc->start;
         u_upload_data(ice->ctx.stream_uploader, 0,
                       sc->count * draw->index_size, 4,
                       (char *)draw->index.user + start_offset,
                       &offset, &ice->state.index_buffer.res);
         offset -= start_offset;
         size = start_offset + sc->count * draw->index_size;
         emit_index = true;
      } else {
         struct crocus_resource *res = (void *) draw->index.resource;

         if (ice->state.index_buffer.res != draw->index.resource) {
            res->bind_history |= PIPE_BIND_INDEX_BUFFER;
            pipe_resource_reference(&ice->state.index_buffer.res,
                                    draw->index.resource);
            emit_index = true;
         }
         offset = 0;
         size = draw->index.resource->width0;
      }

      /* Even with the same buffer bound, re-emit if the size, index
       * format, or (pre-Haswell) primitive-restart flag changed, since
       * all of those live in 3DSTATE_INDEX_BUFFER.
       */
      if (!emit_index &&
          (ice->state.index_buffer.size != size ||
           ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
           || ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
	   )
	  )
         emit_index = true;

      if (emit_index) {
         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);

         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
            /* Pre-Haswell, the cut index enable lives here; Haswell+
             * programs it via 3DSTATE_VF instead (see the GEN75_VF
             * handling in crocus_upload_dirty_render_state).
             */
            ib.CutIndexEnable = draw->primitive_restart;
#endif
            ib.IndexFormat = draw->index_size >> 1;
            ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
            ib.BufferSize = bo->size - offset;
#else
            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
#if GFX_VER >= 6
            ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
         }
         /* Cache what we emitted so redundant re-emission is skipped. */
         ice->state.index_buffer.size = size;
         ice->state.index_buffer.offset = offset;
         ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
         ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
      }
   }

/* Command-streamer registers holding the 3DPRIMITIVE parameters; loaded
 * directly from memory for indirect draws. */
#define _3DPRIM_END_OFFSET          0x2420
#define _3DPRIM_START_VERTEX        0x2430
#define _3DPRIM_VERTEX_COUNT        0x2434
#define _3DPRIM_INSTANCE_COUNT      0x2438
#define _3DPRIM_START_INSTANCE      0x243C
#define _3DPRIM_BASE_VERTEX         0x2440

#if GFX_VER >= 7
   if (indirect && !indirect->count_from_stream_output) {
      if (indirect->indirect_draw_count) {
         /* Multi-draw indirect with a GPU-side draw count: predicate
          * this 3DPRIMITIVE on (drawid_offset < draw count).
          */
         use_predicate = true;

         struct crocus_bo *draw_count_bo =
            crocus_resource_bo(indirect->indirect_draw_count);
         unsigned draw_count_offset =
            indirect->indirect_draw_count_offset;

         crocus_emit_pipe_control_flush(batch,
                                        "ensure indirect draw buffer is flushed",
                                        PIPE_CONTROL_FLUSH_ENABLE);
         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
            /* Conditional rendering is also active, so combine its
             * predicate (kept in CS_GPR(15)) with the draw-count check.
             */
#if GFX_VERx10 >= 75
            struct mi_builder b;
            mi_builder_init(&b, &batch->screen->devinfo, batch);

            /* comparison = draw id < draw count */
            struct mi_value comparison =
               mi_ult(&b, mi_imm(drawid_offset),
                      mi_mem32(ro_bo(draw_count_bo,
                                     draw_count_offset)));
#if GFX_VER == 8
            /* predicate = comparison & conditional rendering predicate */
            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
                         mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
            /* predicate = comparison & conditional rendering predicate */
            struct mi_value pred = mi_iand(&b, comparison,
                                           mi_reg32(CS_GPR(15)));

            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
               MI_PREDICATE_COMBINEOP_SET |
               MI_PREDICATE_COMPAREOP_SRCS_EQUAL;

            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
         } else {
            uint32_t mi_predicate;

            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
            /* Upload the current draw count from the draw parameters buffer
             * to MI_PREDICATE_SRC0.
             */
            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
                                       draw_count_bo, draw_count_offset);
            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);

            if (drawid_offset == 0) {
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                  MI_PREDICATE_COMBINEOP_SET |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            } else {
               /* While draw_index < draw_count the predicate's result will be
                *  (draw_index == draw_count) ^ TRUE = TRUE
                * When draw_index == draw_count the result is
                *  (TRUE) ^ TRUE = FALSE
                * After this all results will be:
                *  (FALSE) ^ FALSE = FALSE
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
                  MI_PREDICATE_COMBINEOP_XOR |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            }
            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
         }
      }

#if GFX_VER >= 7
      /* Load the 3DPRIMITIVE parameters straight from the indirect
       * buffer into the command-streamer registers.  The indexed layout
       * has base-vertex at +12 and start-instance at +16; non-indexed
       * has start-instance at +12, with base-vertex forced to zero.
       */
      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
      assert(bo);

      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
      }
      if (draw->index_size) {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
         }
      } else {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
            lri.DataDWord = 0;
         }
      }
#endif
   } else if (indirect && indirect->count_from_stream_output) {
#if GFX_VERx10 >= 75
      /* "Draw auto": derive the vertex count from the number of bytes
       * written to the stream-output buffer.
       */
      struct crocus_stream_output_target *so =
         (void *) indirect->count_from_stream_output;

      /* XXX: Replace with actual cache tracking */
      crocus_emit_pipe_control_flush(batch,
                                     "draw count from stream output stall",
                                     PIPE_CONTROL_CS_STALL);

      struct mi_builder b;
      mi_builder_init(&b, &batch->screen->devinfo, batch);

      struct crocus_address addr =
         ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
      /* vertex count = (bytes written - buffer offset) / stride */
      struct mi_value offset =
         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);

      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
               mi_udiv32_imm(&b, offset, so->stride));

      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
   }
#else
   assert(!indirect);
#endif

   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
      prim.PredicateEnable = use_predicate;
#endif

      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
      if (indirect) {
         // XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
         prim.IndirectParameterEnable = true;
#endif
      } else {
#if GFX_VER >= 5
         prim.StartInstanceLocation = draw->start_instance;
#endif
         prim.InstanceCount = draw->instance_count;
         prim.VertexCountPerInstance = sc->count;

         prim.StartVertexLocation = sc->start;

         if (draw->index_size) {
            prim.BaseVertexLocation += sc->index_bias;
         }
      }
   }
}
8042
8043#if GFX_VER >= 7
8044
8045static void
8046crocus_upload_compute_state(struct crocus_context *ice,
8047                            struct crocus_batch *batch,
8048                            const struct pipe_grid_info *grid)
8049{
8050   const uint64_t stage_dirty = ice->state.stage_dirty;
8051   struct crocus_screen *screen = batch->screen;
8052   const struct intel_device_info *devinfo = &screen->devinfo;
8053   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
8054   struct crocus_compiled_shader *shader =
8055      ice->shaders.prog[MESA_SHADER_COMPUTE];
8056   struct brw_stage_prog_data *prog_data = shader->prog_data;
8057   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
8058   const struct brw_cs_dispatch_info dispatch =
8059      brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);
8060
8061   crocus_update_surface_base_address(batch);
8062   if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
8063      upload_sysvals(ice, MESA_SHADER_COMPUTE);
8064
8065   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
8066      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
8067      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
8068         crocus_upload_binding_table(ice, batch,
8069                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
8070                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
8071   }
8072
8073   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
8074      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);
8075
8076   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
8077       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
8078      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
8079       *
8080       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
8081       *    the only bits that are changed are scoreboard related: Scoreboard
8082       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
8083       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
8084       *    sufficient."
8085       */
8086      crocus_emit_pipe_control_flush(batch,
8087                                     "workaround: stall before MEDIA_VFE_STATE",
8088                                     PIPE_CONTROL_CS_STALL);
8089
8090      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
8091         if (prog_data->total_scratch) {
8092            struct crocus_bo *bo =
8093               crocus_get_scratch_space(ice, prog_data->total_scratch,
8094                                        MESA_SHADER_COMPUTE);
8095#if GFX_VER == 8
8096            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
8097             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
8098             */
8099            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
8100#elif GFX_VERx10 == 75
8101            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
8102             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
8103             */
8104            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
8105#else
8106            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
8107             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
8108             */
8109            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
8110#endif
8111            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
8112         }
8113
8114         vfe.MaximumNumberofThreads =
8115            devinfo->max_cs_threads * devinfo->subslice_total - 1;
8116         vfe.ResetGatewayTimer =
8117            Resettingrelativetimerandlatchingtheglobaltimestamp;
8118         vfe.BypassGatewayControl = true;
8119#if GFX_VER == 7
8120         vfe.GPGPUMode = true;
8121#endif
8122#if GFX_VER == 8
8123         vfe.BypassGatewayControl = true;
8124#endif
8125         vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
8126         vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;
8127
8128         vfe.CURBEAllocationSize =
8129            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
8130                  cs_prog_data->push.cross_thread.regs, 2);
8131      }
8132   }
8133
8134   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
8135   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
8136       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
8137      uint32_t curbe_data_offset = 0;
8138      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
8139             cs_prog_data->push.per_thread.dwords == 1 &&
8140             cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
8141      const unsigned push_const_size =
8142         brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
8143      uint32_t *curbe_data_map =
8144         stream_state(batch,
8145                      ALIGN(push_const_size, 64), 64,
8146                      &curbe_data_offset);
8147      assert(curbe_data_map);
8148      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
8149      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
8150                                       curbe_data_map);
8151
8152      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
8153         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
8154         curbe.CURBEDataStartAddress = curbe_data_offset;
8155      }
8156   }
8157
8158   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
8159                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
8160                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
8161                      CROCUS_STAGE_DIRTY_CS)) {
8162      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
8163      const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
8164      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
8165         idd.KernelStartPointer = ksp;
8166         idd.SamplerStatePointer = shs->sampler_offset;
8167         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
8168         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
8169         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
8170         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
8171         idd.BarrierEnable = cs_prog_data->uses_barrier;
8172         idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
8173                                                     prog_data->total_shared);
8174#if GFX_VERx10 >= 75
8175         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
8176#endif
8177      }
8178
8179      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
8180         load.InterfaceDescriptorTotalLength =
8181            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
8182         load.InterfaceDescriptorDataStartAddress =
8183            emit_state(batch, desc, sizeof(desc), 64);
8184      }
8185   }
8186
8187#define GPGPU_DISPATCHDIMX 0x2500
8188#define GPGPU_DISPATCHDIMY 0x2504
8189#define GPGPU_DISPATCHDIMZ 0x2508
8190
8191   if (grid->indirect) {
8192      struct crocus_state_ref *grid_size = &ice->state.grid_size;
8193      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
8194      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8195         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
8196         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
8197      }
8198      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8199         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
8200         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
8201      }
8202      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8203         lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
8204         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
8205      }
8206
8207#if GFX_VER == 7
8208      /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
8209      _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
8210      crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);
8211
8212      /* Load compute_dispatch_indirect_x_size into SRC0 */
8213      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);
8214
8215      /* predicate = (compute_dispatch_indirect_x_size == 0); */
8216      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8217         mip.LoadOperation    = LOAD_LOAD;
8218         mip.CombineOperation = COMBINE_SET;
8219         mip.CompareOperation = COMPARE_SRCS_EQUAL;
8220      };
8221
8222      /* Load compute_dispatch_indirect_y_size into SRC0 */
8223      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);
8224
8225      /* predicate = (compute_dispatch_indirect_y_size == 0); */
8226      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8227         mip.LoadOperation    = LOAD_LOAD;
8228         mip.CombineOperation = COMBINE_OR;
8229         mip.CompareOperation = COMPARE_SRCS_EQUAL;
8230      };
8231
8232      /* Load compute_dispatch_indirect_z_size into SRC0 */
8233      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);
8234
8235      /* predicate = (compute_dispatch_indirect_z_size == 0); */
8236      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8237         mip.LoadOperation    = LOAD_LOAD;
8238         mip.CombineOperation = COMBINE_OR;
8239         mip.CompareOperation = COMPARE_SRCS_EQUAL;
8240      };
8241
8242      /* predicate = !predicate; */
8243#define COMPARE_FALSE                           1
8244      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8245         mip.LoadOperation    = LOAD_LOADINV;
8246         mip.CombineOperation = COMBINE_OR;
8247         mip.CompareOperation = COMPARE_FALSE;
8248      }
8249#endif
8250   }
8251
8252   crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
8253      ggw.IndirectParameterEnable    = grid->indirect != NULL;
8254      ggw.PredicateEnable            = GFX_VER <= 7 && grid->indirect != NULL;
8255      ggw.SIMDSize                   = dispatch.simd_size / 16;
8256      ggw.ThreadDepthCounterMaximum  = 0;
8257      ggw.ThreadHeightCounterMaximum = 0;
8258      ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
8259      ggw.ThreadGroupIDXDimension    = grid->grid[0];
8260      ggw.ThreadGroupIDYDimension    = grid->grid[1];
8261      ggw.ThreadGroupIDZDimension    = grid->grid[2];
8262      ggw.RightExecutionMask         = dispatch.right_mask;
8263      ggw.BottomExecutionMask        = 0xffffffff;
8264   }
8265
8266   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
8267
8268   batch->contains_draw = true;
8269}
8270
8271#endif /* GFX_VER >= 7 */
8272
8273/**
8274 * State module teardown.
8275 */
8276static void
8277crocus_destroy_state(struct crocus_context *ice)
8278{
8279   pipe_resource_reference(&ice->draw.draw_params.res, NULL);
8280   pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
8281
8282   free(ice->state.genx);
8283
8284   for (int i = 0; i < 4; i++) {
8285      pipe_so_target_reference(&ice->state.so_target[i], NULL);
8286   }
8287
8288   for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
8289      pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
8290   }
8291   pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
8292
8293   for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
8294      struct crocus_shader_state *shs = &ice->state.shaders[stage];
8295      for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
8296         pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
8297      }
8298      for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
8299         pipe_resource_reference(&shs->image[i].base.resource, NULL);
8300      }
8301      for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
8302         pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
8303      }
8304      for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
8305         pipe_sampler_view_reference((struct pipe_sampler_view **)
8306                                     &shs->textures[i], NULL);
8307      }
8308   }
8309
8310   for (int i = 0; i < 16; i++)
8311      pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
8312   pipe_resource_reference(&ice->state.grid_size.res, NULL);
8313
8314   pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8315}
8316
8317/* ------------------------------------------------------------------- */
8318
/**
 * Scan every binding point for references to the given buffer resource,
 * comparing against its current BO, and flag the matching state dirty
 * (or rebind it) so that fresh surface state / addresses are emitted on
 * the next draw.  Uses res->bind_history / res->bind_stages to limit the
 * scan to binding points the buffer was ever attached to.
 */
static void
crocus_rebind_buffer(struct crocus_context *ice,
                     struct crocus_resource *res)
{
   struct pipe_context *ctx = &ice->ctx;

   assert(res->base.b.target == PIPE_BUFFER);

   /* Buffers can't be framebuffer attachments, nor display related,
    * and we don't have upstream Clover support.
    */
   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
                                 PIPE_BIND_RENDER_TARGET |
                                 PIPE_BIND_BLENDABLE |
                                 PIPE_BIND_DISPLAY_TARGET |
                                 PIPE_BIND_CURSOR |
                                 PIPE_BIND_COMPUTE_RESOURCE |
                                 PIPE_BIND_GLOBAL)));

   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
      /* Re-emit vertex buffer packets for any slot bound to this buffer. */
      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
      while (bound_vbs) {
         const int i = u_bit_scan64(&bound_vbs);
         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];

         if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
      }
   }

   if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
       ice->state.index_buffer.res) {
      /* Dropping the cached reference forces 3DSTATE_INDEX_BUFFER to be
       * re-emitted on the next indexed draw.
       */
      if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
         pipe_resource_reference(&ice->state.index_buffer.res, NULL);
   }
   /* There is no need to handle these:
    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
    */

   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
      /* XXX: be careful about resetting vs appending... */
      for (int i = 0; i < 4; i++) {
         if (ice->state.so_target[i] &&
             (ice->state.so_target[i]->buffer == &res->base.b)) {
#if GFX_VER == 6
            /* Gen6 SO surfaces live in the GS binding table. */
            ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
            ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
         }
      }
   }

   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
      struct crocus_shader_state *shs = &ice->state.shaders[s];
      enum pipe_shader_type p_stage = stage_to_pipe(s);

      /* Skip stages that never bound this resource. */
      if (!(res->bind_stages & (1 << s)))
         continue;

      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
         while (bound_cbufs) {
            const int i = u_bit_scan(&bound_cbufs);
            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];

            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
         uint32_t bound_ssbos = shs->bound_ssbos;
         while (bound_ssbos) {
            const int i = u_bit_scan(&bound_ssbos);
            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];

            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
               /* Rebind the slot through the set_shader_buffers hook so
                * its surface state is rebuilt against the new storage.
                */
               struct pipe_shader_buffer buf = {
                  .buffer = &res->base.b,
                  .buffer_offset = ssbo->buffer_offset,
                  .buffer_size = ssbo->buffer_size,
               };
               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
                                         (shs->writable_ssbos >> i) & 1);
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
         uint32_t bound_sampler_views = shs->bound_sampler_views;
         while (bound_sampler_views) {
            const int i = u_bit_scan(&bound_sampler_views);
            struct crocus_sampler_view *isv = shs->textures[i];
            struct crocus_bo *bo = isv->res->bo;

            if (res->bo == bo) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
         uint32_t bound_image_views = shs->bound_image_views;
         while (bound_image_views) {
            const int i = u_bit_scan(&bound_image_views);
            struct crocus_image_view *iv = &shs->image[i];
            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);

            if (res->bo == bo)
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
         }
      }
   }
}
8437
8438/* ------------------------------------------------------------------- */
8439
8440static unsigned
8441flags_to_post_sync_op(uint32_t flags)
8442{
8443   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
8444      return WriteImmediateData;
8445
8446   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
8447      return WritePSDepthCount;
8448
8449   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
8450      return WriteTimestamp;
8451
8452   return 0;
8453}
8454
8455/*
8456 * Do the given flags have a Post Sync or LRI Post Sync operation?
8457 */
8458static enum pipe_control_flags
8459get_post_sync_flags(enum pipe_control_flags flags)
8460{
8461   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
8462            PIPE_CONTROL_WRITE_DEPTH_COUNT |
8463            PIPE_CONTROL_WRITE_TIMESTAMP |
8464            PIPE_CONTROL_LRI_POST_SYNC_OP;
8465
8466   /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
8467    * "LRI Post Sync Operation".  So more than one bit set would be illegal.
8468    */
8469   assert(util_bitcount(flags) <= 1);
8470
8471   return flags;
8472}
8473
8474#define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)
8475
8476/**
8477 * Emit a series of PIPE_CONTROL commands, taking into account any
8478 * workarounds necessary to actually accomplish the caller's request.
8479 *
8480 * Unless otherwise noted, spec quotations in this function come from:
8481 *
8482 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8483 * Restrictions for PIPE_CONTROL.
8484 *
8485 * You should not use this function directly.  Use the helpers in
8486 * crocus_pipe_control.c instead, which may split the pipe control further.
8487 */
8488static void
8489crocus_emit_raw_pipe_control(struct crocus_batch *batch,
8490                             const char *reason,
8491                             uint32_t flags,
8492                             struct crocus_bo *bo,
8493                             uint32_t offset,
8494                             uint64_t imm)
8495{
8496   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
8497   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
8498   UNUSED enum pipe_control_flags non_lri_post_sync_flags =
8499      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
8500
8501   /* Recursive PIPE_CONTROL workarounds --------------------------------
8502    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
8503    *
8504    * We do these first because we want to look at the original operation,
8505    * rather than any workarounds we set.
8506    */
8507
8508   /* "Flush Types" workarounds ---------------------------------------------
8509    * We do these now because they may add post-sync operations or CS stalls.
8510    */
8511
8512   if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
8513      /* Hardware workaround: SNB B-Spec says:
8514       *
8515       *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
8516       *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
8517       *     required."
8518       */
8519      crocus_emit_post_sync_nonzero_flush(batch);
8520   }
8521
8522#if GFX_VER == 8
8523   if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
8524      /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
8525       *
8526       * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
8527       *  'Write PS Depth Count' or 'Write Timestamp'."
8528       */
8529      if (!bo) {
8530         flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8531         post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8532         non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8533         bo = batch->ice->workaround_bo;
8534         offset = batch->ice->workaround_offset;
8535      }
8536   }
8537#endif
8538
8539#if GFX_VERx10 < 75
8540   if (flags & PIPE_CONTROL_DEPTH_STALL) {
8541      /* Project: PRE-HSW / Argument: Depth Stall
8542       *
8543       * "The following bits must be clear:
8544       *  - Render Target Cache Flush Enable ([12] of DW1)
8545       *  - Depth Cache Flush Enable ([0] of DW1)"
8546       */
8547      assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8548                        PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
8549   }
8550#endif
8551   if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
8552      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
8553       *
8554       *    "This bit must be DISABLED for operations other than writing
8555       *     PS_DEPTH_COUNT."
8556       *
8557       * This seems like nonsense.  An Ivybridge workaround requires us to
8558       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
8559       * operation.  Gen8+ requires us to emit depth stalls and depth cache
8560       * flushes together.  So, it's hard to imagine this means anything other
8561       * than "we originally intended this to be used for PS_DEPTH_COUNT".
8562       *
8563       * We ignore the supposed restriction and do nothing.
8564       */
8565   }
8566
8567   if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
8568      /* Project: PRE-HSW / Argument: Depth Cache Flush
8569       *
8570       * "Depth Stall must be clear ([13] of DW1)."
8571       */
8572      assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
8573   }
8574
8575   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8576                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
8577      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
8578       *
8579       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
8580       *     PS_DEPTH_COUNT or TIMESTAMP queries."
8581       *
8582       * TODO: Implement end-of-pipe checking.
8583       */
8584      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
8585                                  PIPE_CONTROL_WRITE_TIMESTAMP)));
8586   }
8587
8588   if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
8589      /* From the PIPE_CONTROL instruction table, bit 1:
8590       *
8591       *    "This bit is ignored if Depth Stall Enable is set.
8592       *     Further, the render cache is not flushed even if Write Cache
8593       *     Flush Enable bit is set."
8594       *
8595       * We assert that the caller doesn't do this combination, to try and
8596       * prevent mistakes.  It shouldn't hurt the GPU, though.
8597       *
8598       * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
8599       * and "Render Target Flush" combo is explicitly required for BTI
8600       * update workarounds.
8601       */
8602      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
8603                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));
8604   }
8605
8606   /* PIPE_CONTROL page workarounds ------------------------------------- */
8607
8608   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
8609      /* From the PIPE_CONTROL page itself:
8610       *
8611       *    "IVB, HSW, BDW
8612       *     Restriction: Pipe_control with CS-stall bit set must be issued
8613       *     before a pipe-control command that has the State Cache
8614       *     Invalidate bit set."
8615       */
8616      flags |= PIPE_CONTROL_CS_STALL;
8617   }
8618
8619   if ((GFX_VERx10 == 75)) {
8620      /* From the PIPE_CONTROL page itself:
8621       *
8622       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
8623       *     Prior to programming a PIPECONTROL command with any of the RO
8624       *     cache invalidation bit set, program a PIPECONTROL flush command
8625       *     with “CS stall” bit and “HDC Flush” bit set."
8626       *
8627       * TODO: Actually implement this.  What's an HDC Flush?
8628       */
8629   }
8630
8631   if (flags & PIPE_CONTROL_FLUSH_LLC) {
8632      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
8633       *
8634       *    "Project: ALL
8635       *     SW must always program Post-Sync Operation to "Write Immediate
8636       *     Data" when Flush LLC is set."
8637       *
8638       * For now, we just require the caller to do it.
8639       */
8640      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
8641   }
8642
8643   /* "Post-Sync Operation" workarounds -------------------------------- */
8644
8645   /* Project: All / Argument: Global Snapshot Count Reset [19]
8646    *
8647    * "This bit must not be exercised on any product.
8648    *  Requires stall bit ([20] of DW1) set."
8649    *
8650    * We don't use this, so we just assert that it isn't used.  The
8651    * PIPE_CONTROL instruction page indicates that they intended this
8652    * as a debug feature and don't think it is useful in production,
8653    * but it may actually be usable, should we ever want to.
8654    */
8655   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
8656
8657   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
8658                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
8659      /* Project: All / Arguments:
8660       *
8661       * - Generic Media State Clear [16]
8662       * - Indirect State Pointers Disable [16]
8663       *
8664       *    "Requires stall bit ([20] of DW1) set."
8665       *
8666       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
8667       * State Clear) says:
8668       *
8669       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
8670       *     programmed prior to programming a PIPECONTROL command with "Media
8671       *     State Clear" set in GPGPU mode of operation"
8672       *
8673       * This is a subset of the earlier rule, so there's nothing to do.
8674       */
8675      flags |= PIPE_CONTROL_CS_STALL;
8676   }
8677
8678   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
8679      /* Project: All / Argument: Store Data Index
8680       *
8681       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8682       *  than '0'."
8683       *
8684       * For now, we just assert that the caller does this.  We might want to
8685       * automatically add a write to the workaround BO...
8686       */
8687      assert(non_lri_post_sync_flags != 0);
8688   }
8689
8690   if (flags & PIPE_CONTROL_SYNC_GFDT) {
8691      /* Project: All / Argument: Sync GFDT
8692       *
8693       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8694       *  than '0' or 0x2520[13] must be set."
8695       *
8696       * For now, we just assert that the caller does this.
8697       */
8698      assert(non_lri_post_sync_flags != 0);
8699   }
8700
8701   if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8702      /* Project: SNB, IVB, HSW / Argument: TLB inv
8703       *
8704       * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
8705       *  must be set to something other than '0'."
8706       *
8707       * For now, we just assert that the caller does this.
8708       */
8709      assert(non_lri_post_sync_flags != 0);
8710   }
8711
8712   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8713      /* Project: IVB+ / Argument: TLB inv
8714       *
8715       *    "Requires stall bit ([20] of DW1) set."
8716       *
8717       * Also, from the PIPE_CONTROL instruction table:
8718       *
8719       *    "Project: SKL+
8720       *     Post Sync Operation or CS stall must be set to ensure a TLB
8721       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
8722       *     cache to invalidate."
8723       *
8724       * This is not a subset of the earlier rule, so there's nothing to do.
8725       */
8726      flags |= PIPE_CONTROL_CS_STALL;
8727   }
8728#if GFX_VER == 8
8729   if (IS_COMPUTE_PIPELINE(batch)) {
8730      if (post_sync_flags ||
8731          (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
8732                    PIPE_CONTROL_DEPTH_STALL |
8733                    PIPE_CONTROL_RENDER_TARGET_FLUSH |
8734                    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8735                    PIPE_CONTROL_DATA_CACHE_FLUSH))) {
8736         /* Project: BDW / Arguments:
8737          *
8738          * - LRI Post Sync Operation   [23]
8739          * - Post Sync Op              [15:14]
8740          * - Notify En                 [8]
8741          * - Depth Stall               [13]
8742          * - Render Target Cache Flush [12]
8743          * - Depth Cache Flush         [0]
8744          * - DC Flush Enable           [5]
8745          *
8746          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
8747          *     Workloads."
8748          *
8749          * (The docs have separate table rows for each bit, with essentially
8750          * the same workaround text.  We've combined them here.)
8751          */
8752         flags |= PIPE_CONTROL_CS_STALL;
8753
8754         /* Also, from the PIPE_CONTROL instruction table, bit 20:
8755          *
8756          *    "Project: BDW
8757          *     This bit must be always set when PIPE_CONTROL command is
8758          *     programmed by GPGPU and MEDIA workloads, except for the cases
8759          *     when only Read Only Cache Invalidation bits are set (State
8760          *     Cache Invalidation Enable, Instruction cache Invalidation
8761          *     Enable, Texture Cache Invalidation Enable, Constant Cache
8762          *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
8763          *     need not implemented when FF_DOP_CG is disable via "Fixed
8764          *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
8765          *
8766          * It sounds like we could avoid CS stalls in some cases, but we
8767          * don't currently bother.  This list isn't exactly the list above,
8768          * either...
8769          */
8770      }
8771   }
8772#endif
8773   /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
8774    *
8775    * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
8776    *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
8777    *
8778    * Note that the kernel does CS stalls between batches, so we only need
8779    * to count them within a batch.  We currently naively count every 4, and
8780    * don't skip the ones with only read-cache-invalidate bits set.  This
8781    * may or may not be a problem...
8782    */
8783   if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
8784      if (flags & PIPE_CONTROL_CS_STALL) {
8785         /* If we're doing a CS stall, reset the counter and carry on. */
8786         batch->pipe_controls_since_last_cs_stall = 0;
8787      }
8788
8789      /* If this is the fourth pipe control without a CS stall, do one now. */
8790      if (++batch->pipe_controls_since_last_cs_stall == 4) {
8791         batch->pipe_controls_since_last_cs_stall = 0;
8792         flags |= PIPE_CONTROL_CS_STALL;
8793      }
8794   }
8795
8796   /* "Stall" workarounds ----------------------------------------------
8797    * These have to come after the earlier ones because we may have added
8798    * some additional CS stalls above.
8799    */
8800
8801   if (flags & PIPE_CONTROL_CS_STALL) {
8802      /* Project: PRE-SKL, VLV, CHV
8803       *
8804       * "[All Stepping][All SKUs]:
8805       *
8806       *  One of the following must also be set:
8807       *
8808       *  - Render Target Cache Flush Enable ([12] of DW1)
8809       *  - Depth Cache Flush Enable ([0] of DW1)
8810       *  - Stall at Pixel Scoreboard ([1] of DW1)
8811       *  - Depth Stall ([13] of DW1)
8812       *  - Post-Sync Operation ([13] of DW1)
8813       *  - DC Flush Enable ([5] of DW1)"
8814       *
8815       * If we don't already have one of those bits set, we choose to add
8816       * "Stall at Pixel Scoreboard".  Some of the other bits require a
8817       * CS stall as a workaround (see above), which would send us into
8818       * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
8819       * appears to be safe, so we choose that.
8820       */
8821      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
8822                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8823                               PIPE_CONTROL_WRITE_IMMEDIATE |
8824                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
8825                               PIPE_CONTROL_WRITE_TIMESTAMP |
8826                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
8827                               PIPE_CONTROL_DEPTH_STALL |
8828                               PIPE_CONTROL_DATA_CACHE_FLUSH;
8829      if (!(flags & wa_bits))
8830         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
8831   }
8832
8833   /* Emit --------------------------------------------------------------- */
8834
8835   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
8836      fprintf(stderr,
8837              "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
8838              (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
8839              (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
8840              (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
8841              (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
8842              (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
8843              (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
8844              (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
8845              (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
8846              (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
8847              (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
8848              (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
8849              (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
8850              (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
8851              (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
8852              (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
8853              (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
8854              "SnapRes" : "",
8855              (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
8856              "ISPDis" : "",
8857              (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
8858              (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
8859              (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
8860              imm, reason);
8861   }
8862
8863   crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8864#if GFX_VER >= 7
8865      pc.LRIPostSyncOperation = NoLRIOperation;
8866      pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
8867      pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
8868#endif
8869#if GFX_VER >= 6
8870      pc.StoreDataIndex = 0;
8871      pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
8872      pc.GlobalSnapshotCountReset =
8873         flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
8874      pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
8875      pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
8876      pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
8877      pc.RenderTargetCacheFlushEnable =
8878         flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8879      pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
8880      pc.StateCacheInvalidationEnable =
8881         flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
8882      pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
8883      pc.ConstantCacheInvalidationEnable =
8884         flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
8885#else
8886      pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8887#endif
8888      pc.PostSyncOperation = flags_to_post_sync_op(flags);
8889      pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
8890      pc.InstructionCacheInvalidateEnable =
8891         flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
8892      pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
8893#if GFX_VER >= 5 || GFX_VERx10 == 45
8894      pc.IndirectStatePointersDisable =
8895         flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
8896#endif
8897#if GFX_VER >= 6
8898      pc.TextureCacheInvalidationEnable =
8899         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8900#elif GFX_VER == 5 || GFX_VERx10 == 45
8901      pc.TextureCacheFlushEnable =
8902         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8903#endif
8904      pc.Address = ggtt_bo(bo, offset);
8905      if (GFX_VER < 7 && bo)
8906         pc.DestinationAddressType = DAT_GGTT;
8907      pc.ImmediateData = imm;
8908   }
8909}
8910
8911#if GFX_VER == 6
8912void
8913genX(crocus_upload_urb)(struct crocus_batch *batch,
8914                        unsigned vs_size,
8915                        bool gs_present,
8916                        unsigned gs_size)
8917{
8918   struct crocus_context *ice = batch->ice;
8919   int nr_vs_entries, nr_gs_entries;
8920   int total_urb_size = ice->urb.size * 1024; /* in bytes */
8921   const struct intel_device_info *devinfo = &batch->screen->devinfo;
8922
8923   /* Calculate how many entries fit in each stage's section of the URB */
8924   if (gs_present) {
8925      nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
8926      nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
8927   } else {
8928      nr_vs_entries = total_urb_size / (vs_size * 128);
8929      nr_gs_entries = 0;
8930   }
8931
8932   /* Then clamp to the maximum allowed by the hardware */
8933   if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
8934      nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
8935
8936   if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
8937      nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
8938
8939   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
8940   ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
8941   ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
8942
8943   assert(ice->urb.nr_vs_entries >=
8944          devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
8945   assert(ice->urb.nr_vs_entries % 4 == 0);
8946   assert(ice->urb.nr_gs_entries % 4 == 0);
8947   assert(vs_size <= 5);
8948   assert(gs_size <= 5);
8949
8950   crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
8951      urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
8952      urb.VSURBEntryAllocationSize = vs_size - 1;
8953
8954      urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
8955      urb.GSURBEntryAllocationSize = gs_size - 1;
8956   };
8957   /* From the PRM Volume 2 part 1, section 1.4.7:
8958    *
8959    *   Because of a urb corruption caused by allocating a previous gsunit’s
8960    *   urb entry to vsunit software is required to send a "GS NULL
8961    *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
8962    *   a dummy DRAW call before any case where VS will be taking over GS URB
8963    *   space.
8964    *
8965    * It is not clear exactly what this means ("URB fence" is a command that
8966    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
8967    * a workaround.
8968    */
8969   if (ice->urb.gs_present && !gs_present)
8970      crocus_emit_mi_flush(batch);
8971   ice->urb.gs_present = gs_present;
8972}
8973#endif
8974
/* Vtbl hook called when GPU context state is lost.  Nothing
 * generation-specific needs re-emitting here, so this is deliberately
 * a no-op.
 */
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
}
8979
/* Emit an MI_REPORT_PERF_COUNT writing a performance counter snapshot
 * (tagged with \p report_id) to \p bo at \p offset_in_bytes.  The
 * command only exists on Gen7+; on older parts this is a no-op.
 */
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
#if GFX_VER >= 7
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
      mi_rpc.ReportID = report_id;
   }
#endif
}
8993
8994/**
8995 * From the PRM, Volume 2a:
8996 *
8997 *    "Indirect State Pointers Disable
8998 *
8999 *    At the completion of the post-sync operation associated with this pipe
9000 *    control packet, the indirect state pointers in the hardware are
9001 *    considered invalid; the indirect pointers are not saved in the context.
9002 *    If any new indirect state commands are executed in the command stream
9003 *    while the pipe control is pending, the new indirect state commands are
9004 *    preserved.
9005 *
9006 *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9007 *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9008 *    commands are only considered as Indirect State Pointers. Once ISP is
9009 *    issued in a context, SW must initialize by programming push constant
9010 *    commands for all the shaders (at least to zero length) before attempting
9011 *    any rendering operation for the same context."
9012 *
9013 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9014 * even though they point to a BO that has been already unreferenced at
9015 * the end of the previous batch buffer. This has been fine so far since
9016 * we are protected by these scratch page (every address not covered by
9017 * a BO should be pointing to the scratch page). But on CNL, it is
9018 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9019 * instruction.
9020 *
9021 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9022 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9023 * context restore, so the mentioned hang doesn't happen. However,
9024 * software must program push constant commands for all stages prior to
9025 * rendering anything, so we flag them as dirty.
9026 *
9027 * Finally, we also make sure to stall at pixel scoreboard to make sure the
9028 * constants have been loaded into the EUs prior to disable the push constants
9029 * so that it doesn't hang a previous 3DPRIMITIVE.
9030 */
9031#if GFX_VER >= 7
9032static void
9033gen7_emit_isp_disable(struct crocus_batch *batch)
9034{
9035   crocus_emit_raw_pipe_control(batch, "isp disable",
9036                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
9037                                PIPE_CONTROL_CS_STALL,
9038                                NULL, 0, 0);
9039   crocus_emit_raw_pipe_control(batch, "isp disable",
9040                                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
9041                                PIPE_CONTROL_CS_STALL,
9042                                NULL, 0, 0);
9043
9044   struct crocus_context *ice = batch->ice;
9045   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
9046                              CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
9047                              CROCUS_STAGE_DIRTY_CONSTANTS_TES |
9048                              CROCUS_STAGE_DIRTY_CONSTANTS_GS |
9049                              CROCUS_STAGE_DIRTY_CONSTANTS_FS);
9050}
9051#endif
9052
9053#if GFX_VER >= 7
/* Vtbl hook run just before a batch is submitted.  Emits end-of-batch
 * workarounds: a Haswell CC_STATE_POINTERS re-emit (render batch only),
 * then the indirect-state-pointers disable sequence.
 */
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
   if (batch->name == CROCUS_BATCH_RENDER) {
      /* HSW workaround: flush, re-emit the color calc state pointer, then
       * RT-flush + CS-stall.  Ordering here is deliberate.
       */
      crocus_emit_mi_flush(batch);
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
      }

      crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
   }
#endif
   gen7_emit_isp_disable(batch);
}
9070#endif
9071
9072static void
9073crocus_batch_reset_dirty(struct crocus_batch *batch)
9074{
9075   /* unreference any index buffer so it get reemitted. */
9076   pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9077
9078   /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch
9079    * as the old state batch won't still be available.
9080    */
9081   batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9082      CROCUS_DIRTY_COLOR_CALC_STATE;
9083
9084   batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9085
9086   batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9087   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9088   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9089   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9090   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9091   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9092   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9093
9094   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9095   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9096   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9097   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9098   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9099   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9100
9101   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9102   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9103   batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9104   batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9105
9106#if GFX_VER >= 6
9107   /* SCISSOR_STATE */
9108   batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9109   batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9110   batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9111
9112#endif
9113#if GFX_VER <= 5
9114   /* dirty the SF state on gen4/5 */
9115   batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9116   batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9117   batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9118   batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9119#endif
9120#if GFX_VER >= 7
9121   /* Streamout dirty */
9122   batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9123   batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9124   batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
9125#endif
9126}
9127
9128#if GFX_VERx10 == 75
/* Return the gallium rasterizer state embedded in the currently-bound
 * rasterizer CSO.  Assumes a rasterizer CSO is bound.
 */
struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
9133#endif
9134
9135#if GFX_VER >= 6
9136static void update_so_strides(struct crocus_context *ice,
9137                              uint16_t *strides)
9138{
9139   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
9140      struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
9141      if (so)
9142         so->stride = strides[i] * sizeof(uint32_t);
9143   }
9144}
9145#endif
9146
9147static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
9148                                   int s,
9149                                   uint32_t *clamp_mask)
9150{
9151#if GFX_VER < 8
9152   if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
9153       samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
9154      if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
9155         clamp_mask[0] |= (1 << s);
9156      if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
9157         clamp_mask[1] |= (1 << s);
9158      if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
9159         clamp_mask[2] |= (1 << s);
9160   }
9161#endif
9162}
9163
9164static void
9165crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
9166{
9167   struct crocus_context *ice = (struct crocus_context *) ctx;
9168
9169   if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
9170      ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
9171      ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
9172   }
9173
9174   if (ice->batch_count == 1)
9175      return;
9176
9177   if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
9178      ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
9179      ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
9180   }
9181}
9182
/* Fill in the per-generation screen vtbl with this generation's
 * implementations.  Entries guarded by GFX_VER checks are only available
 * on hardware that supports the corresponding commands.
 */
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
   /* This file is compiled once per generation; make sure the screen's
    * device matches the generation we were built for.
    */
   assert(screen->devinfo.verx10 == GFX_VERx10);
   assert(screen->devinfo.ver == GFX_VER);

   /* Context setup and state emission. */
   screen->vtbl.destroy_state = crocus_destroy_state;
   screen->vtbl.init_render_context = crocus_init_render_context;
   screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
   screen->vtbl.init_compute_context = crocus_init_compute_context;
   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = crocus_rebind_buffer;

   /* MI register/memory helpers (availability varies by generation). */
#if GFX_VERx10 >= 75
   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif

   /* Shader compiler key population. */
   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
   screen->vtbl.populate_cs_key = crocus_populate_cs_key;

   /* Batch lifecycle and miscellaneous hooks. */
   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
   screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
   screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
   screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
   screen->vtbl.update_so_strides = update_so_strides;
   screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif

   genX(crocus_init_blt)(screen);
}
9241
/**
 * Initialize the per-context state: install the Gallium pipe_context
 * CSO/state hooks (create/bind/set/delete — see the file header for the
 * CSO model) and set initial default state values.
 */
void
genX(crocus_init_state)(struct crocus_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;

   /* CSO creation hooks. */
   ctx->create_blend_state = crocus_create_blend_state;
   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
   ctx->create_sampler_state = crocus_create_sampler_state;
   ctx->create_sampler_view = crocus_create_sampler_view;
   ctx->create_surface = crocus_create_surface;
   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
   /* CSO bind hooks. */
   ctx->bind_blend_state = crocus_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
   ctx->bind_sampler_states = crocus_bind_sampler_states;
   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
   /* CSO destruction: most CSOs share a single generic destructor. */
   ctx->delete_blend_state = crocus_delete_state;
   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
   ctx->delete_rasterizer_state = crocus_delete_state;
   ctx->delete_sampler_state = crocus_delete_state;
   ctx->delete_vertex_elements_state = crocus_delete_state;
   /* Streamed ("set") state hooks for dynamic/inexpensive state. */
   ctx->set_blend_color = crocus_set_blend_color;
   ctx->set_clip_state = crocus_set_clip_state;
   ctx->set_constant_buffer = crocus_set_constant_buffer;
   ctx->set_shader_buffers = crocus_set_shader_buffers;
   ctx->set_shader_images = crocus_set_shader_images;
   ctx->set_sampler_views = crocus_set_sampler_views;
   ctx->set_tess_state = crocus_set_tess_state;
   ctx->set_patch_vertices = crocus_set_patch_vertices;
   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
   ctx->set_sample_mask = crocus_set_sample_mask;
   ctx->set_scissor_states = crocus_set_scissor_states;
   ctx->set_stencil_ref = crocus_set_stencil_ref;
   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
   ctx->set_viewport_states = crocus_set_viewport_states;
   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
   ctx->surface_destroy = crocus_surface_destroy;
   /* Draw and compute dispatch entry points. */
   ctx->draw_vbo = crocus_draw_vbo;
   ctx->launch_grid = crocus_launch_grid;

   ctx->set_frontend_noop = crocus_set_frontend_noop;

#if GFX_VER >= 6
   /* Stream output (transform feedback) hooks exist only on Gen6+. */
   ctx->create_stream_output_target = crocus_create_stream_output_target;
   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif

   /* Mark absolutely everything dirty so the first draw re-emits all state. */
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   /* All sample bits enabled (8 bits, i.e. up to 8x MSAA). */
   ice->state.sample_mask = 0xff;
   ice->state.num_viewports = 1;
   /* PIPE_PRIM_MAX is an out-of-range sentinel — presumably so the first
    * draw always registers as a primitive-type change; TODO confirm against
    * the draw path.
    */
   ice->state.prim_mode = PIPE_PRIM_MAX;
   ice->state.reduced_prim_mode = PIPE_PRIM_MAX;
   /* NOTE(review): calloc result is not checked; a NULL here would fault on
    * first use of genx state — verify callers treat OOM as fatal.
    */
   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
   ice->draw.derived_params.drawid = -1;

   /* Default all scissor rectangles to be empty regions. */
   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}
9311