/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * @file crocus_pipe_control.c
 *
 * PIPE_CONTROL is the main flushing and synchronization primitive on Intel
 * GPUs.  It can invalidate caches, stall until rendering reaches various
 * stages of completion, write to memory, and do other things.  In a way,
 * it's a Swiss Army knife command - it has all kinds of capabilities, but
 * some significant limitations as well.
 *
 * Unfortunately, it's notoriously complicated and difficult to use.  Many
 * sub-commands can't be used together.  Some are meant to be used at the
 * top of the pipeline (invalidating caches before drawing), while others
 * are meant to be used at the end (stalling or flushing after drawing).
 *
 * Also, there's a list of restrictions a mile long, which vary by generation.
 * Do this before doing that, or suffer the consequences (usually a GPU hang).
 *
 * This file contains helpers for emitting them safely.  You can simply call
 * crocus_emit_pipe_control_flush() with the desired operations (as logical
 * PIPE_CONTROL_* bits), and it will take care of splitting it into multiple
 * PIPE_CONTROL commands as necessary.  The per-generation workarounds are
 * applied in crocus_emit_raw_pipe_control() in crocus_state.c.
 */

#include "crocus_context.h"
#include "util/hash_table.h"
#include "util/set.h"

/**
 * Emit a PIPE_CONTROL with various flushing flags.
 *
 * The caller is responsible for deciding what flags are appropriate for the
 * given generation.
 */
void
crocus_emit_pipe_control_flush(struct crocus_batch *batch,
                               const char *reason,
                               uint32_t flags)
{
   const struct intel_device_info *devinfo = &batch->screen->devinfo;

   if (devinfo->ver >= 6 &&
       (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
       (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
      /* A PIPE_CONTROL with flush and invalidate bits set simultaneously
       * is an inherently racy operation on Gen6+ if the contents of the
       * flushed caches were intended to become visible from any of the
       * invalidated caches.  Split it into two PIPE_CONTROLs; the first
       * one should stall the pipeline to make sure that the flushed R/W
       * caches are coherent with memory once the specified R/O caches are
       * invalidated.  On pre-Gen6 hardware the (implicit) R/O cache
       * invalidation seems to happen at the bottom of the pipeline together
       * with any write cache flush, so this shouldn't be a concern.  In
       * order to ensure a full stall, we do an end-of-pipe sync.
       */
      crocus_emit_end_of_pipe_sync(batch, reason,
                                   flags & PIPE_CONTROL_CACHE_FLUSH_BITS);
      flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
   }

   batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, NULL, 0, 0);
}
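
/* A minimal usage sketch (hypothetical caller, not part of this file):
 * flushing the render target cache and invalidating the texture cache
 * before sampling from a just-rendered surface could look like:
 *
 *    crocus_emit_pipe_control_flush(batch, "example: RT -> texture",
 *                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
 *                                   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 *
 * Because this mixes flush and invalidate bits, the helper above would
 * split it on Gen6+: an end-of-pipe sync covering the flush bits, followed
 * by a second PIPE_CONTROL carrying the invalidate bits.
 */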

/**
 * Emit a PIPE_CONTROL that writes to a buffer object.
 *
 * \p flags should contain one of the following items:
 * - PIPE_CONTROL_WRITE_IMMEDIATE
 * - PIPE_CONTROL_WRITE_TIMESTAMP
 * - PIPE_CONTROL_WRITE_DEPTH_COUNT
 */
void
crocus_emit_pipe_control_write(struct crocus_batch *batch,
                               const char *reason, uint32_t flags,
                               struct crocus_bo *bo, uint32_t offset,
                               uint64_t imm)
{
   batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags,
                                             bo, offset, imm);
}
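
/* A minimal sketch of a caller (hypothetical; query_bo and offset_bytes are
 * illustrative names): recording a pipelined timestamp for a query might
 * look like:
 *
 *    crocus_emit_pipe_control_write(batch, "example: query timestamp",
 *                                   PIPE_CONTROL_WRITE_TIMESTAMP,
 *                                   query_bo, offset_bytes, 0);
 *
 * The immediate value argument is only meaningful with
 * PIPE_CONTROL_WRITE_IMMEDIATE; for timestamp and depth-count writes the
 * hardware supplies the data.
 */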

/**
 * Restriction [DevSNB, DevIVB]:
 *
 * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
 * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
 * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
 * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
 * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
 * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
 * unless SW can otherwise guarantee that the pipeline from WM onwards is
 * already flushed (e.g., via a preceding MI_FLUSH).
 */
void
crocus_emit_depth_stall_flushes(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   assert(devinfo->ver >= 6);

   /* Starting on BDW, these pipe controls are unnecessary.
    *
    * WM HW will internally manage the draining pipe and flushing of the
    * caches when this command is issued.  The PIPE_CONTROL restrictions
    * are removed.
    */
   if (devinfo->ver >= 8)
      return;

   crocus_emit_pipe_control_flush(batch, "depth stall",
                                  PIPE_CONTROL_DEPTH_STALL);
   crocus_emit_pipe_control_flush(batch, "depth stall",
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH);
   crocus_emit_pipe_control_flush(batch, "depth stall",
                                  PIPE_CONTROL_DEPTH_STALL);
}

/*
 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
 *
 *    Write synchronization is a special case of end-of-pipe
 *    synchronization that requires that the render cache and/or depth
 *    related caches are flushed to memory, where the data will become
 *    globally visible.  This type of synchronization is required prior to
 *    SW (CPU) actually reading the result data from memory, or initiating
 *    an operation that will use as a read surface (such as a texture
 *    surface) a previous render target and/or depth/stencil buffer.
 *
 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
 *
 *    Exercising the write cache flush bits (Render Target Cache Flush
 *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
 *    ensures the write caches are flushed and doesn't guarantee the data
 *    is globally visible.
 *
 *    SW can track the completion of the end-of-pipe-synchronization by
 *    using "Notify Enable" and "PostSync Operation - Write Immediate
 *    Data" in the PIPE_CONTROL command.
 */
void
crocus_emit_end_of_pipe_sync(struct crocus_batch *batch,
                             const char *reason, uint32_t flags)
{
   const struct intel_device_info *devinfo = &batch->screen->devinfo;

   if (devinfo->ver >= 6) {
      /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
       *
       *    "The most common action to perform upon reaching a
       *    synchronization point is to write a value out to memory.  An
       *    immediate value (included with the synchronization command) may
       *    be written."
       *
       * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
       *
       *    "In case the data flushed out by the render engine is to be
       *    read back in to the render engine in coherent manner, then the
       *    render engine has to wait for the fence completion before
       *    accessing the flushed data.  This can be achieved by following
       *    means on various products: PIPE_CONTROL command with CS Stall
       *    and the required write caches flushed with Post-Sync-Operation
       *    as Write Immediate Data.
       *
       *    Example:
       *       - Workload-1 (3D/GPGPU/MEDIA)
       *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
       *         Immediate Data, Required Write Cache Flush bits set)
       *       - Workload-2 (Can use the data produced or output by
       *         Workload-1)"
       */
      crocus_emit_pipe_control_write(batch, reason,
                                     flags | PIPE_CONTROL_CS_STALL |
                                     PIPE_CONTROL_WRITE_IMMEDIATE,
                                     batch->ice->workaround_bo,
                                     batch->ice->workaround_offset, 0);

      if (batch->screen->devinfo.platform == INTEL_PLATFORM_HSW) {
         /* On Haswell, reading the just-written value back into a scratch
          * register forces the command streamer to wait for the fence
          * write to actually land before subsequent work proceeds.
          */
#define GEN7_3DPRIM_START_INSTANCE 0x243C
         batch->screen->vtbl.load_register_mem32(batch,
                                                 GEN7_3DPRIM_START_INSTANCE,
                                                 batch->ice->workaround_bo,
                                                 batch->ice->workaround_offset);
      }
   } else {
      /* On gen4-5, a regular pipe control seems to suffice. */
      crocus_emit_pipe_control_flush(batch, reason, flags);
   }
}
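
/* A minimal sketch of when an end-of-pipe sync matters (hypothetical
 * caller): before the CPU maps a buffer the GPU just rendered to, a plain
 * flush is not enough per the Haswell PRM quote above, so a fenced flush
 * is used instead:
 *
 *    crocus_emit_end_of_pipe_sync(batch, "example: CPU readback",
 *                                 PIPE_CONTROL_RENDER_TARGET_FLUSH);
 *
 * This emits the flush with a CS stall and an immediate write to the
 * context's workaround BO, so the flushed data is globally visible once
 * the PIPE_CONTROL retires.
 */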

/* Emit a pipelined flush to either flush render and texture cache for
 * reading from a FBO-drawn texture, or flush so that frontbuffer
 * render appears on the screen in DRI1.
 *
 * This is also used for the always_flush_cache driconf debug option.
 */
void
crocus_emit_mi_flush(struct crocus_batch *batch)
{
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH;
   if (devinfo->ver >= 6) {
      flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE |
               PIPE_CONTROL_DATA_CACHE_FLUSH |
               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
               PIPE_CONTROL_VF_CACHE_INVALIDATE |
               PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CS_STALL;
   }
   crocus_emit_pipe_control_flush(batch, "mi flush", flags);
}

/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 *    [DevSNB-C+{W/A}] Before any depth stall flush (including those
 *    produced by non-pipelined state commands), software needs to first
 *    send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 *    0.
 *
 *    [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 *    = 1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 *    [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 *    BEFORE the pipe-control with a post-sync op and no write-cache
 *    flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *    "1 of the following must also be set:
 *     - Render Target Cache Flush Enable ([12] of DW1)
 *     - Depth Cache Flush Enable ([0] of DW1)
 *     - Stall at Pixel Scoreboard ([1] of DW1)
 *     - Depth Stall ([13] of DW1)
 *     - Post-Sync Operation ([13] of DW1)
 *     - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
void
crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch)
{
   crocus_emit_pipe_control_flush(batch, "nonzero",
                                  PIPE_CONTROL_CS_STALL |
                                  PIPE_CONTROL_STALL_AT_SCOREBOARD);

   crocus_emit_pipe_control_write(batch, "nonzero",
                                  PIPE_CONTROL_WRITE_IMMEDIATE,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
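
/* A minimal sketch of the intended call pattern on gen6 (hypothetical
 * caller): per the workarounds quoted above, the post-sync-nonzero flush
 * goes out before a depth-stall or write-cache-flush PIPE_CONTROL:
 *
 *    if (devinfo->ver == 6)
 *       crocus_emit_post_sync_nonzero_flush(batch);
 *    crocus_emit_pipe_control_flush(batch, "example: depth flush",
 *                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH);
 *
 * Note that the helper itself emits two PIPE_CONTROLs: a CS stall with
 * stall-at-scoreboard first, then the post-sync write, satisfying the
 * ordering requirement in the quoted errata.
 */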

/**
 * Flush and invalidate all caches (for debugging purposes).
 */
void
crocus_flush_all_caches(struct crocus_batch *batch)
{
   crocus_emit_pipe_control_flush(batch, "debug: flush all caches",
                                  PIPE_CONTROL_CS_STALL |
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE);
}

static void
crocus_texture_barrier(struct pipe_context *ctx, unsigned flags)
{
   struct crocus_context *ice = (void *) ctx;
   struct crocus_batch *render_batch = &ice->batches[CROCUS_BATCH_RENDER];
   struct crocus_batch *compute_batch = &ice->batches[CROCUS_BATCH_COMPUTE];
   const struct intel_device_info *devinfo = &render_batch->screen->devinfo;

   if (devinfo->ver < 6) {
      crocus_emit_mi_flush(render_batch);
      return;
   }

   if (render_batch->contains_draw) {
      crocus_batch_maybe_flush(render_batch, 48);
      crocus_emit_pipe_control_flush(render_batch,
                                     "API: texture barrier (1/2)",
                                     (flags == 1 ?
                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH : 0) |
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
      crocus_emit_pipe_control_flush(render_batch,
                                     "API: texture barrier (2/2)",
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
   }

   if (compute_batch->contains_draw) {
      crocus_batch_maybe_flush(compute_batch, 48);
      crocus_emit_pipe_control_flush(compute_batch,
                                     "API: texture barrier (1/2)",
                                     PIPE_CONTROL_CS_STALL);
      crocus_emit_pipe_control_flush(compute_batch,
                                     "API: texture barrier (2/2)",
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
   }
}

static void
crocus_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
   struct crocus_context *ice = (void *) ctx;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   const struct intel_device_info *devinfo = &ice->batches[0].screen->devinfo;

   assert(devinfo->ver >= 7);

   if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
                PIPE_BARRIER_INDEX_BUFFER |
                PIPE_BARRIER_INDIRECT_BUFFER)) {
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
   }

   if (flags & PIPE_BARRIER_CONSTANT_BUFFER) {
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
              PIPE_CONTROL_CONST_CACHE_INVALIDATE;
   }

   if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER)) {
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
              PIPE_CONTROL_RENDER_TARGET_FLUSH;
   }

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->verx10 < 75)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   for (int i = 0; i < ice->batch_count; i++) {
      if (ice->batches[i].contains_draw) {
         crocus_batch_maybe_flush(&ice->batches[i], 24);
         crocus_emit_pipe_control_flush(&ice->batches[i],
                                        "API: memory barrier", bits);
      }
   }
}

void
crocus_init_flush_functions(struct pipe_context *ctx)
{
   ctx->memory_barrier = crocus_memory_barrier;
   ctx->texture_barrier = crocus_texture_barrier;
}
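
/* A minimal sketch of how a state tracker reaches these hooks (hypothetical
 * caller; ctx is a gallium pipe_context created by crocus): a GL
 * glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT) call ends up as something
 * like:
 *
 *    ctx->memory_barrier(ctx, PIPE_BARRIER_SHADER_BUFFER);
 *
 * which lands in crocus_memory_barrier() above and emits a data cache
 * flush plus CS stall (plus whatever extra bits the flags require) on
 * every batch that contains unflushed work.
 */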