/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * @file crocus_batch.c
 *
 * Batchbuffer and command submission module.
 *
 * Every API draw call results in a number of GPU commands, which we
 * collect into a "batch buffer". Typically, many draw calls are grouped
 * into a single batch to amortize command submission overhead.
 *
 * We submit batches to the kernel using the I915_GEM_EXECBUFFER2 ioctl.
 * One critical piece of data is the "validation list", which contains a
 * list of the buffer objects (BOs) which the commands in the batch need.
 * The kernel will make sure these are resident and pinned at the correct
 * virtual memory address before executing our batch. If a BO is not in
 * the validation list, it effectively does not exist, so take care.
 */

#include "crocus_batch.h"
#include "crocus_bufmgr.h"
#include "crocus_context.h"
#include "crocus_fence.h"

#include "drm-uapi/i915_drm.h"

#include "intel/common/intel_gem.h"
#include "util/hash_table.h"
#include "util/set.h"
#include "util/u_upload_mgr.h"

#include <errno.h>
#include <xf86drm.h>

#if HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#else
#define VG(x)
#endif

#define FILE_DEBUG_FLAG DEBUG_BUFMGR

/* Terminating the batch takes either 4 bytes for MI_BATCH_BUFFER_END
 * or 12 bytes for MI_BATCH_BUFFER_START (when chaining). Plus, we may
 * need an extra 4 bytes to pad out to the nearest QWord. So reserve 16.
 */
#define BATCH_RESERVED(devinfo) ((devinfo)->platform == INTEL_PLATFORM_HSW ? 32 : 16)

static void crocus_batch_reset(struct crocus_batch *batch);

static unsigned
num_fences(struct crocus_batch *batch)
{
   return util_dynarray_num_elements(&batch->exec_fences,
                                     struct drm_i915_gem_exec_fence);
}

/**
 * Debugging code to dump the fence list, used by INTEL_DEBUG=submit.
 */
static void
dump_fence_list(struct crocus_batch *batch)
{
   fprintf(stderr, "Fence list (length %u): ", num_fences(batch));

   util_dynarray_foreach(&batch->exec_fences,
                         struct drm_i915_gem_exec_fence, f) {
      fprintf(stderr, "%s%u%s ",
              (f->flags & I915_EXEC_FENCE_WAIT) ? "..." : "",
              f->handle,
              (f->flags & I915_EXEC_FENCE_SIGNAL) ? "!" : "");
   }

   fprintf(stderr, "\n");
}

/**
 * Debugging code to dump the validation list, used by INTEL_DEBUG=submit.
 */
static void
dump_validation_list(struct crocus_batch *batch)
{
   fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);

   for (int i = 0; i < batch->exec_count; i++) {
      uint64_t flags = batch->validation_list[i].flags;
      assert(batch->validation_list[i].handle ==
             batch->exec_bos[i]->gem_handle);
      fprintf(stderr,
              "[%2d]: %2d %-14s @ 0x%"PRIx64" (%" PRIu64 "B)\t %2d refs %s\n", i,
              batch->validation_list[i].handle, batch->exec_bos[i]->name,
              (uint64_t)batch->validation_list[i].offset, batch->exec_bos[i]->size,
              batch->exec_bos[i]->refcount,
              (flags & EXEC_OBJECT_WRITE) ? " (write)" : "");
   }
}

/**
 * Return BO information to the batch decoder (for debugging).
 */
static struct intel_batch_decode_bo
decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
{
   struct crocus_batch *batch = v_batch;

   for (int i = 0; i < batch->exec_count; i++) {
      struct crocus_bo *bo = batch->exec_bos[i];
      /* The decoder zeroes out the top 16 bits, so we need to as well */
      uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);

      if (address >= bo_address && address < bo_address + bo->size) {
         return (struct intel_batch_decode_bo){
            .addr = address,
            .size = bo->size,
            .map = crocus_bo_map(batch->dbg, bo, MAP_READ) +
                   (address - bo_address),
         };
      }
   }

   return (struct intel_batch_decode_bo) { };
}
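
/**
 * Return the size of a piece of state to the batch decoder (for debugging),
 * looked up by its offset from the relevant base address.
 */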
static unsigned
decode_get_state_size(void *v_batch, uint64_t address,
                      uint64_t base_address)
{
   struct crocus_batch *batch = v_batch;

   /* The decoder gives us offsets from a base address, which is not great.
    * Binding tables are relative to surface state base address, and other
    * state is relative to dynamic state base address. These could alias,
    * but in practice it's unlikely because surface offsets are always in
    * the [0, 64K) range, and we assign dynamic state addresses starting at
    * the top of the 4GB range. We should fix this but it's likely good
    * enough for now.
    */
   unsigned size = (uintptr_t)
      _mesa_hash_table_u64_search(batch->state_sizes, address - base_address);

   return size;
}

/**
 * Decode the current batch.
 */
static void
decode_batch(struct crocus_batch *batch)
{
   void *map = crocus_bo_map(batch->dbg, batch->exec_bos[0], MAP_READ);
   intel_print_batch(&batch->decoder, map, batch->primary_batch_size,
                     batch->exec_bos[0]->gtt_offset, false);
}

static void
init_reloc_list(struct crocus_reloc_list *rlist, int count)
{
   rlist->reloc_count = 0;
   rlist->reloc_array_size = count;
   rlist->relocs = malloc(rlist->reloc_array_size *
                          sizeof(struct drm_i915_gem_relocation_entry));
}

void
crocus_init_batch(struct crocus_context *ice,
                  enum crocus_batch_name name,
                  int priority)
{
   struct crocus_batch *batch = &ice->batches[name];
   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
   struct intel_device_info *devinfo = &screen->devinfo;

   batch->ice = ice;
   batch->screen = screen;
   batch->dbg = &ice->dbg;
   batch->reset = &ice->reset;
   batch->name = name;
   batch->contains_fence_signal = false;

   if (devinfo->ver >= 7) {
      batch->fine_fences.uploader =
         u_upload_create(&ice->ctx, 4096, PIPE_BIND_CUSTOM,
                         PIPE_USAGE_STAGING, 0);
   }
   crocus_fine_fence_init(batch);

   batch->hw_ctx_id = crocus_create_hw_context(screen->bufmgr);
   assert(batch->hw_ctx_id);

   crocus_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);

   batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
   if (devinfo->ver == 6)
      batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      /* The shadow doesn't get relocs written so state decode fails. */
      batch->use_shadow_copy = false;
   } else
      batch->use_shadow_copy = !devinfo->has_llc;

   util_dynarray_init(&batch->exec_fences, ralloc_context(NULL));
   util_dynarray_init(&batch->syncobjs, ralloc_context(NULL));

   init_reloc_list(&batch->command.relocs, 250);
   init_reloc_list(&batch->state.relocs, 250);

   batch->exec_count = 0;
   batch->exec_array_size = 100;
   batch->exec_bos =
      malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
   batch->validation_list =
      malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));

   batch->cache.render = _mesa_hash_table_create(NULL, NULL,
                                                 _mesa_key_pointer_equal);
   batch->cache.depth = _mesa_set_create(NULL, NULL,
                                         _mesa_key_pointer_equal);

   memset(batch->other_batches, 0, sizeof(batch->other_batches));

   for (int i = 0, j = 0; i < ice->batch_count; i++) {
      if (i != name)
         batch->other_batches[j++] = &ice->batches[i];
   }

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      batch->state_sizes = _mesa_hash_table_u64_create(NULL);
      const unsigned decode_flags =
         INTEL_BATCH_DECODE_FULL |
         (INTEL_DEBUG(DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
         INTEL_BATCH_DECODE_OFFSETS | INTEL_BATCH_DECODE_FLOATS;

      intel_batch_decode_ctx_init(&batch->decoder, &screen->compiler->isa,
                                  &screen->devinfo, stderr,
                                  decode_flags, NULL, decode_get_bo,
                                  decode_get_state_size, batch);
      batch->decoder.max_vbo_decoded_lines = 32;
   }

   crocus_batch_reset(batch);
}
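
/**
 * Return the index of the given BO in the batch's validation list, or -1
 * if the batch does not reference it.
 */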
static int
find_exec_index(struct crocus_batch *batch, struct crocus_bo *bo)
{
   unsigned index = READ_ONCE(bo->index);

   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return index;

   /* May have been shared between multiple active batches */
   for (index = 0; index < batch->exec_count; index++) {
      if (batch->exec_bos[index] == bo)
         return index;
   }
   return -1;
}

static struct drm_i915_gem_exec_object2 *
find_validation_entry(struct crocus_batch *batch, struct crocus_bo *bo)
{
   int index = find_exec_index(batch, bo);

   if (index == -1)
      return NULL;
   return &batch->validation_list[index];
}

static void
ensure_exec_obj_space(struct crocus_batch *batch, uint32_t count)
{
   while (batch->exec_count + count > batch->exec_array_size) {
      batch->exec_array_size *= 2;
      batch->exec_bos = realloc(
         batch->exec_bos, batch->exec_array_size * sizeof(batch->exec_bos[0]));
      batch->validation_list =
         realloc(batch->validation_list,
                 batch->exec_array_size * sizeof(batch->validation_list[0]));
   }
}

static struct drm_i915_gem_exec_object2 *
crocus_use_bo(struct crocus_batch *batch, struct crocus_bo *bo, bool writable)
{
   assert(bo->bufmgr == batch->command.bo->bufmgr);

   struct drm_i915_gem_exec_object2 *existing_entry =
      find_validation_entry(batch, bo);

   if (existing_entry) {
      /* The BO is already in the validation list; mark it writable */
      if (writable)
         existing_entry->flags |= EXEC_OBJECT_WRITE;
      return existing_entry;
   }

   if (bo != batch->command.bo && bo != batch->state.bo) {
      /* This is the first time our batch has seen this BO. Before we use it,
       * we may need to flush and synchronize with other batches.
       */
      for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) {
         if (!batch->other_batches[b])
            continue;
         struct drm_i915_gem_exec_object2 *other_entry =
            find_validation_entry(batch->other_batches[b], bo);

         /* If the buffer is referenced by another batch, and either batch
          * intends to write it, then flush the other batch and synchronize.
          *
          * Consider these cases:
          *
          * 1. They read, we read   => No synchronization required.
          * 2. They read, we write  => Synchronize (they need the old value)
          * 3. They write, we read  => Synchronize (we need their new value)
          * 4. They write, we write => Synchronize (order writes)
          *
          * The read/read case is very common, as multiple batches usually
          * share a streaming state buffer or shader assembly buffer, and
          * we want to avoid synchronizing in this case.
          */
         if (other_entry &&
             ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) {
            crocus_batch_flush(batch->other_batches[b]);
            crocus_batch_add_syncobj(batch,
                                     batch->other_batches[b]->last_fence->syncobj,
                                     I915_EXEC_FENCE_WAIT);
         }
      }
   }

   /* Bump the ref count since the batch is now using this bo. */
   crocus_bo_reference(bo);

   ensure_exec_obj_space(batch, 1);

   batch->validation_list[batch->exec_count] =
      (struct drm_i915_gem_exec_object2) {
         .handle = bo->gem_handle,
         .offset = bo->gtt_offset,
         .flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0),
      };

   bo->index = batch->exec_count;
   batch->exec_bos[batch->exec_count] = bo;
   batch->aperture_space += bo->size;

   batch->exec_count++;

   return &batch->validation_list[batch->exec_count - 1];
}
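
/**
 * Append a relocation to the given relocation list, making sure the target
 * BO is in the validation list, and return the presumed address of the
 * target (its last known offset plus target_offset) so it can be written
 * into the buffer.
 */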
static uint64_t
emit_reloc(struct crocus_batch *batch,
           struct crocus_reloc_list *rlist, uint32_t offset,
           struct crocus_bo *target, int32_t target_offset,
           unsigned int reloc_flags)
{
   assert(target != NULL);

   if (target == batch->ice->workaround_bo)
      reloc_flags &= ~RELOC_WRITE;

   bool writable = reloc_flags & RELOC_WRITE;

   struct drm_i915_gem_exec_object2 *entry =
      crocus_use_bo(batch, target, writable);

   if (rlist->reloc_count == rlist->reloc_array_size) {
      rlist->reloc_array_size *= 2;
      rlist->relocs = realloc(rlist->relocs,
                              rlist->reloc_array_size *
                              sizeof(struct drm_i915_gem_relocation_entry));
   }

   if (reloc_flags & RELOC_32BIT) {
      /* Restrict this buffer to the low 32 bits of the address space.
       *
       * Altering the validation list flags restricts it for this batch,
       * but we also alter the BO's kflags to restrict it permanently
       * (until the BO is destroyed and put back in the cache). Buffers
       * may stay bound across batches, and we want to keep it constrained.
       */
      target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
      entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

      /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
      reloc_flags &= ~RELOC_32BIT;
   }

   if (reloc_flags)
      entry->flags |= reloc_flags & batch->valid_reloc_flags;

   rlist->relocs[rlist->reloc_count++] =
      (struct drm_i915_gem_relocation_entry) {
         .offset = offset,
         .delta = target_offset,
         .target_handle = find_exec_index(batch, target),
         .presumed_offset = entry->offset,
      };

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel.
    */
   return entry->offset + target_offset;
}

uint64_t
crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset,
                     struct crocus_bo *target, uint32_t target_offset,
                     unsigned int reloc_flags)
{
   assert(batch_offset <= batch->command.bo->size - sizeof(uint32_t));

   return emit_reloc(batch, &batch->command.relocs, batch_offset,
                     target, target_offset, reloc_flags);
}

uint64_t
crocus_state_reloc(struct crocus_batch *batch, uint32_t state_offset,
                   struct crocus_bo *target, uint32_t target_offset,
                   unsigned int reloc_flags)
{
   assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));

   return emit_reloc(batch, &batch->state.relocs, state_offset,
                     target, target_offset, reloc_flags);
}
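
/**
 * Allocate a new BO for a growing buffer (command or state), map it (or
 * reallocate the CPU shadow copy when one is in use), and reset the
 * partial-copy bookkeeping.
 */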
static void
recreate_growing_buffer(struct crocus_batch *batch,
                        struct crocus_growing_bo *grow,
                        const char *name, unsigned size)
{
   struct crocus_screen *screen = batch->screen;
   struct crocus_bufmgr *bufmgr = screen->bufmgr;
   grow->bo = crocus_bo_alloc(bufmgr, name, size);
   grow->bo->kflags |= EXEC_OBJECT_CAPTURE;
   grow->partial_bo = NULL;
   grow->partial_bo_map = NULL;
   grow->partial_bytes = 0;
   if (batch->use_shadow_copy)
      grow->map = realloc(grow->map, grow->bo->size);
   else
      grow->map = crocus_bo_map(NULL, grow->bo, MAP_READ | MAP_WRITE);
   grow->map_next = grow->map;
}

static void
create_batch(struct crocus_batch *batch)
{
   struct crocus_screen *screen = batch->screen;

   recreate_growing_buffer(batch, &batch->command,
                           "command buffer",
                           BATCH_SZ + BATCH_RESERVED(&screen->devinfo));

   crocus_use_bo(batch, batch->command.bo, false);

   /* Always add workaround_bo which contains a driver identifier to be
    * recorded in error states.
    */
   crocus_use_bo(batch, batch->ice->workaround_bo, false);

   recreate_growing_buffer(batch, &batch->state,
                           "state buffer",
                           STATE_SZ);

   batch->state.used = 1;
   crocus_use_bo(batch, batch->state.bo, false);
}

static void
crocus_batch_maybe_noop(struct crocus_batch *batch)
{
   /* We only insert the NOOP at the beginning of the batch. */
   assert(crocus_batch_bytes_used(batch) == 0);

   if (batch->noop_enabled) {
      /* Emit MI_BATCH_BUFFER_END to prevent any further commands from
       * being executed.
       */
      uint32_t *map = batch->command.map_next;

      map[0] = (0xA << 23);

      batch->command.map_next += 4;
   }
}
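
/**
 * Reset the batch after submission: drop the old command/state buffers,
 * allocate fresh ones, clear per-batch tracking (dirty state, state size
 * table, caches), and add a new syncobj that will signal when the batch
 * completes.
 */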
static void
crocus_batch_reset(struct crocus_batch *batch)
{
   struct crocus_screen *screen = batch->screen;

   crocus_bo_unreference(batch->command.bo);
   crocus_bo_unreference(batch->state.bo);
   batch->primary_batch_size = 0;
   batch->contains_draw = false;
   batch->contains_fence_signal = false;
   batch->state_base_address_emitted = false;
   batch->screen->vtbl.batch_reset_dirty(batch);

   create_batch(batch);
   assert(batch->command.bo->index == 0);

   if (batch->state_sizes)
      _mesa_hash_table_u64_clear(batch->state_sizes);

   struct crocus_syncobj *syncobj = crocus_create_syncobj(screen);
   crocus_batch_add_syncobj(batch, syncobj, I915_EXEC_FENCE_SIGNAL);
   crocus_syncobj_reference(screen, &syncobj, NULL);

   crocus_cache_sets_clear(batch);
}
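
/**
 * Tear down a batch: free the relocation lists, exec/validation arrays,
 * caches, fences, and shadow copies, and destroy the hardware context.
 */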
void
crocus_batch_free(struct crocus_batch *batch)
{
   struct crocus_screen *screen = batch->screen;
   struct crocus_bufmgr *bufmgr = screen->bufmgr;

   if (batch->use_shadow_copy) {
      free(batch->command.map);
      free(batch->state.map);
   }

   for (int i = 0; i < batch->exec_count; i++) {
      crocus_bo_unreference(batch->exec_bos[i]);
   }

   pipe_resource_reference(&batch->fine_fences.ref.res, NULL);

   free(batch->command.relocs.relocs);
   free(batch->state.relocs.relocs);
   free(batch->exec_bos);
   free(batch->validation_list);

   ralloc_free(batch->exec_fences.mem_ctx);

   util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
      crocus_syncobj_reference(screen, s, NULL);
   ralloc_free(batch->syncobjs.mem_ctx);

   crocus_fine_fence_reference(batch->screen, &batch->last_fence, NULL);
   if (batch_has_fine_fence(batch))
      u_upload_destroy(batch->fine_fences.uploader);

   crocus_bo_unreference(batch->command.bo);
   crocus_bo_unreference(batch->state.bo);
   batch->command.bo = NULL;
   batch->command.map = NULL;
   batch->command.map_next = NULL;

   crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);

   _mesa_hash_table_destroy(batch->cache.render, NULL);
   _mesa_set_destroy(batch->cache.depth, NULL);

   if (batch->state_sizes) {
      _mesa_hash_table_u64_destroy(batch->state_sizes);
      intel_batch_decode_ctx_finish(&batch->decoder);
   }
}

/**
 * If we've chained to a secondary batch, or are getting near to the end,
 * then flush. This should only be called between draws.
 */
void
crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate)
{
   if (batch->command.bo != batch->exec_bos[0] ||
       crocus_batch_bytes_used(batch) + estimate >= BATCH_SZ) {
      crocus_batch_flush(batch);
   }
}

/**
 * Finish copying the old batch/state buffer's contents to the new one
 * after we tried to "grow" the buffer in an earlier operation.
 */
static void
finish_growing_bos(struct crocus_growing_bo *grow)
{
   struct crocus_bo *old_bo = grow->partial_bo;
   if (!old_bo)
      return;

   memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);

   grow->partial_bo = NULL;
   grow->partial_bo_map = NULL;
   grow->partial_bytes = 0;

   crocus_bo_unreference(old_bo);
}
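
/**
 * Grow the command or state buffer: allocate a larger BO, swap its identity
 * with the existing one so pointers to the old struct remain valid, and
 * defer copying the old contents until the batch is submitted.
 */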
void
crocus_grow_buffer(struct crocus_batch *batch, bool grow_state,
                   unsigned used,
                   unsigned new_size)
{
   struct crocus_screen *screen = batch->screen;
   struct crocus_bufmgr *bufmgr = screen->bufmgr;
   struct crocus_growing_bo *grow = grow_state ? &batch->state : &batch->command;
   struct crocus_bo *bo = grow->bo;

   if (grow->partial_bo) {
      /* We've already grown once, and now we need to do it again.
       * Finish our last grow operation so we can start a new one.
       * This should basically never happen.
       */
      finish_growing_bos(grow);
   }

   struct crocus_bo *new_bo = crocus_bo_alloc(bufmgr, bo->name, new_size);

   /* Copy existing data to the new larger buffer */
   grow->partial_bo_map = grow->map;

   if (batch->use_shadow_copy) {
      /* We can't safely use realloc, as it may move the existing buffer,
       * breaking existing pointers the caller may still be using. Just
       * malloc a new copy and memcpy it like the normal BO path.
       *
       * Use bo->size rather than new_size because the bufmgr may have
       * rounded up the size, and we want the shadow size to match.
       */
      grow->map = malloc(new_bo->size);
   } else {
      grow->map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE);
   }
   /* Try to put the new BO at the same GTT offset as the old BO (which
    * we're throwing away, so it doesn't need to be there).
    *
    * This guarantees that our relocations continue to work: values we've
    * already written into the buffer, values we're going to write into the
    * buffer, and the validation/relocation lists all will match.
    *
    * Also preserve kflags for EXEC_OBJECT_CAPTURE.
    */
   new_bo->gtt_offset = bo->gtt_offset;
   new_bo->index = bo->index;
   new_bo->kflags = bo->kflags;

   /* Batch/state buffers are per-context, and if we've run out of space,
    * we must have actually used them before, so...they will be in the list.
    */
   assert(bo->index < batch->exec_count);
   assert(batch->exec_bos[bo->index] == bo);

   /* Update the validation list to use the new BO. */
   batch->validation_list[bo->index].handle = new_bo->gem_handle;

   /* Exchange the two BOs...without breaking pointers to the old BO.
    *
    * Consider this scenario:
    *
    * 1. Somebody calls brw_state_batch() to get a region of memory, and
    *    then creates a brw_address pointing to brw->batch.state.bo.
    * 2. They then call brw_state_batch() a second time, which happens to
    *    grow and replace the state buffer. They then try to emit a
    *    relocation to their first section of memory.
    *
    * If we replace the brw->batch.state.bo pointer at step 2, we would
    * break the address created in step 1. They'd have a pointer to the
    * old destroyed BO. Emitting a relocation would add this dead BO to
    * the validation list...causing /both/ statebuffers to be in the list,
    * and all kinds of disasters.
    *
    * This is not a contrived case - BLORP vertex data upload hits this.
    *
    * There are worse scenarios too. Fences for GL sync objects reference
    * brw->batch.batch.bo. If we replaced the batch pointer when growing,
    * we'd need to chase down every fence and update it to point to the
    * new BO. Otherwise, it would refer to a "batch" that never actually
    * gets submitted, and would fail to trigger.
    *
    * To work around both of these issues, we transmute the buffers in
    * place, making the existing struct brw_bo represent the new buffer,
    * and "new_bo" represent the old BO. This is highly unusual, but it
    * seems like a necessary evil.
    *
    * We also defer the memcpy of the existing batch's contents. Callers
    * may make multiple brw_state_batch calls, and retain pointers to the
    * old BO's map. We'll perform the memcpy in finish_growing_bos() when
    * we finally submit the batch, at which point we've finished uploading
    * state, and nobody should have any old references anymore.
    *
    * To do that, we keep a reference to the old BO in grow->partial_bo,
    * and store the number of bytes to copy in grow->partial_bytes. We
    * can monkey with the refcounts directly without atomics because these
    * are per-context BOs and they can only be touched by this thread.
    */
   assert(new_bo->refcount == 1);
   new_bo->refcount = bo->refcount;
   bo->refcount = 1;

   struct crocus_bo tmp;
   memcpy(&tmp, bo, sizeof(struct crocus_bo));
   memcpy(bo, new_bo, sizeof(struct crocus_bo));
   memcpy(new_bo, &tmp, sizeof(struct crocus_bo));

   grow->partial_bo = new_bo; /* the one reference of the OLD bo */
   grow->partial_bytes = used;
}
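
/**
 * Create a fine fence marking the end of the batch and record it as the
 * batch's last_fence.
 */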
static void
finish_seqno(struct crocus_batch *batch)
{
   struct crocus_fine_fence *sq = crocus_fine_fence_new(batch, CROCUS_FENCE_END);
   if (!sq)
      return;

   crocus_fine_fence_reference(batch->screen, &batch->last_fence, sq);
   crocus_fine_fence_reference(batch->screen, &sq, NULL);
}

/**
 * Terminate a batch with MI_BATCH_BUFFER_END.
 */
static void
crocus_finish_batch(struct crocus_batch *batch)
{
   batch->no_wrap = true;
   if (batch->screen->vtbl.finish_batch)
      batch->screen->vtbl.finish_batch(batch);

   finish_seqno(batch);

   /* Emit MI_BATCH_BUFFER_END to finish our batch. */
   uint32_t *map = batch->command.map_next;

   map[0] = (0xA << 23);

   batch->command.map_next += 4;
   VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->command.map, crocus_batch_bytes_used(batch)));

   if (batch->command.bo == batch->exec_bos[0])
      batch->primary_batch_size = crocus_batch_bytes_used(batch);
   batch->no_wrap = false;
}

/**
 * Replace our current GEM context with a new one (in case it got banned).
 */
static bool
replace_hw_ctx(struct crocus_batch *batch)
{
   struct crocus_screen *screen = batch->screen;
   struct crocus_bufmgr *bufmgr = screen->bufmgr;

   uint32_t new_ctx = crocus_clone_hw_context(bufmgr, batch->hw_ctx_id);
   if (!new_ctx)
      return false;

   crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
   batch->hw_ctx_id = new_ctx;

   /* Notify the context that state must be re-initialized. */
   crocus_lost_context_state(batch);

   return true;
}
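
/**
 * Query the kernel's reset statistics for our GEM context and report whether
 * this context was guilty of, or innocent in, a GPU reset. If a reset was
 * detected, throw the context away and replace it with a fresh one.
 */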
enum pipe_reset_status
crocus_batch_check_for_reset(struct crocus_batch *batch)
{
   struct crocus_screen *screen = batch->screen;
   enum pipe_reset_status status = PIPE_NO_RESET;
   struct drm_i915_reset_stats stats = { .ctx_id = batch->hw_ctx_id };

   if (drmIoctl(screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
      DBG("DRM_IOCTL_I915_GET_RESET_STATS failed: %s\n", strerror(errno));

   if (stats.batch_active != 0) {
      /* A reset was observed while a batch from this hardware context was
       * executing. Assume that this context was at fault.
       */
      status = PIPE_GUILTY_CONTEXT_RESET;
   } else if (stats.batch_pending != 0) {
      /* A reset was observed while a batch from this context was in progress,
       * but the batch was not executing. In this case, assume that the
       * context was not at fault.
       */
      status = PIPE_INNOCENT_CONTEXT_RESET;
   }

   if (status != PIPE_NO_RESET) {
      /* Our context is likely banned, or at least in an unknown state.
       * Throw it away and start with a fresh context. Hopefully this
       * catches the problem before our next execbuf fails with -EIO.
       */
      replace_hw_ctx(batch);
   }

   return status;
}

/**
 * Submit the batch to the GPU via execbuffer2.
 */
static int
submit_batch(struct crocus_batch *batch)
{
   if (batch->use_shadow_copy) {
      void *bo_map = crocus_bo_map(batch->dbg, batch->command.bo, MAP_WRITE);
      memcpy(bo_map, batch->command.map, crocus_batch_bytes_used(batch));

      bo_map = crocus_bo_map(batch->dbg, batch->state.bo, MAP_WRITE);
      memcpy(bo_map, batch->state.map, batch->state.used);
   }

   crocus_bo_unmap(batch->command.bo);
   crocus_bo_unmap(batch->state.bo);

   /* The requirements for using I915_EXEC_NO_RELOC are:
    *
    *   The addresses written in the objects must match the corresponding
    *   reloc.gtt_offset which in turn must match the corresponding
    *   execobject.offset.
    *
    *   Any render targets written to in the batch must be flagged with
    *   EXEC_OBJECT_WRITE.
    *
    *   To avoid stalling, execobject.offset should match the current
    *   address of that object within the active context.
    */

   /* Set statebuffer relocations */
   const unsigned state_index = batch->state.bo->index;
   if (state_index < batch->exec_count &&
       batch->exec_bos[state_index] == batch->state.bo) {
      struct drm_i915_gem_exec_object2 *entry =
         &batch->validation_list[state_index];
      assert(entry->handle == batch->state.bo->gem_handle);
      entry->relocation_count = batch->state.relocs.reloc_count;
      entry->relocs_ptr = (uintptr_t)batch->state.relocs.relocs;
   }

   /* Set batchbuffer relocations */
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
   assert(entry->handle == batch->command.bo->gem_handle);
   entry->relocation_count = batch->command.relocs.reloc_count;
   entry->relocs_ptr = (uintptr_t)batch->command.relocs.relocs;

   struct drm_i915_gem_execbuffer2 execbuf = {
      .buffers_ptr = (uintptr_t)batch->validation_list,
      .buffer_count = batch->exec_count,
      .batch_start_offset = 0,
      /* This must be QWord aligned. */
      .batch_len = ALIGN(batch->primary_batch_size, 8),
      .flags = I915_EXEC_RENDER |
               I915_EXEC_NO_RELOC |
               I915_EXEC_BATCH_FIRST |
               I915_EXEC_HANDLE_LUT,
      .rsvd1 = batch->hw_ctx_id, /* rsvd1 is actually the context ID */
   };

   if (num_fences(batch)) {
      execbuf.flags |= I915_EXEC_FENCE_ARRAY;
      execbuf.num_cliprects = num_fences(batch);
      execbuf.cliprects_ptr =
         (uintptr_t)util_dynarray_begin(&batch->exec_fences);
   }

   int ret = 0;
   if (!batch->screen->devinfo.no_hw &&
       intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
      ret = -errno;

   for (int i = 0; i < batch->exec_count; i++) {
      struct crocus_bo *bo = batch->exec_bos[i];

      bo->idle = false;
      bo->index = -1;

      /* Update brw_bo::gtt_offset */
      if (batch->validation_list[i].offset != bo->gtt_offset) {
         DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
             bo->gem_handle, bo->gtt_offset,
             (uint64_t)batch->validation_list[i].offset);
         assert(!(bo->kflags & EXEC_OBJECT_PINNED));
         bo->gtt_offset = batch->validation_list[i].offset;
      }
   }

   return ret;
}
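
/**
 * Return a human-readable name for the given batch, for debug output.
 */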
static const char *
batch_name_to_string(enum crocus_batch_name name)
{
   const char *names[CROCUS_BATCH_COUNT] = {
      [CROCUS_BATCH_RENDER] = "render",
      [CROCUS_BATCH_COMPUTE] = "compute",
   };
   return names[name];
}

/**
 * Flush the batch buffer, submitting it to the GPU and resetting it so
 * we're ready to emit the next batch.
 *
 * \param in_fence_fd is ignored if -1. Otherwise, this function takes
 * ownership of the fd.
 *
 * \param out_fence_fd is ignored if NULL. Otherwise, the caller must
 * take ownership of the returned fd.
 */
void
_crocus_batch_flush(struct crocus_batch *batch, const char *file, int line)
{
   struct crocus_screen *screen = batch->screen;

   /* An empty batch can be skipped, unless a fence is waiting to signal. */
   if (crocus_batch_bytes_used(batch) == 0 && !batch->contains_fence_signal)
      return;

   assert(!batch->no_wrap);
   crocus_finish_batch(batch);

   finish_growing_bos(&batch->command);
   finish_growing_bos(&batch->state);
   int ret = submit_batch(batch);

   if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL)) {
      int bytes_for_commands = crocus_batch_bytes_used(batch);
      int second_bytes = 0;
      if (batch->command.bo != batch->exec_bos[0]) {
         second_bytes = bytes_for_commands;
         bytes_for_commands += batch->primary_batch_size;
      }
      fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5d+%5db (%0.1f%%) "
              "(cmds), %4d BOs (%0.1fMb aperture),"
              " %4d command relocs, %4d state relocs\n",
              file, line, batch_name_to_string(batch->name), batch->hw_ctx_id,
              batch->primary_batch_size, second_bytes,
              100.0f * bytes_for_commands / BATCH_SZ,
              batch->exec_count,
              (float) batch->aperture_space / (1024 * 1024),
              batch->command.relocs.reloc_count,
              batch->state.relocs.reloc_count);

      if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT)) {
         dump_fence_list(batch);
         dump_validation_list(batch);
      }

      if (INTEL_DEBUG(DEBUG_BATCH)) {
         decode_batch(batch);
      }
   }

   for (int i = 0; i < batch->exec_count; i++) {
      struct crocus_bo *bo = batch->exec_bos[i];
      crocus_bo_unreference(bo);
   }

   batch->command.relocs.reloc_count = 0;
   batch->state.relocs.reloc_count = 0;
   batch->exec_count = 0;
   batch->aperture_space = 0;

   util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
      crocus_syncobj_reference(screen, s, NULL);
   util_dynarray_clear(&batch->syncobjs);

   util_dynarray_clear(&batch->exec_fences);

   if (INTEL_DEBUG(DEBUG_SYNC)) {
      dbg_printf("waiting for idle\n");
      crocus_bo_wait_rendering(batch->command.bo); /* if execbuf failed, this is a no-op */
   }

   /* Start a new batch buffer. */
   crocus_batch_reset(batch);

   /* EIO means our context is banned. In this case, try and replace it
    * with a new logical context, and inform crocus_context that all state
    * has been lost and needs to be re-initialized. If this succeeds,
    * dubiously claim success...
    */
   if (ret == -EIO && replace_hw_ctx(batch)) {
      if (batch->reset->reset) {
         /* Tell the state tracker the device is lost and it was our fault. */
         batch->reset->reset(batch->reset->data, PIPE_GUILTY_CONTEXT_RESET);
      }

      ret = 0;
   }

   if (ret < 0) {
#ifdef DEBUG
      const bool color = INTEL_DEBUG(DEBUG_COLOR);
      fprintf(stderr, "%scrocus: Failed to submit batchbuffer: %-80s%s\n",
              color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : "");
#endif
      abort();
   }
}

/**
 * Does the current batch refer to the given BO?
 *
 * (In other words, is the BO in the current batch's validation list?)
 */
bool
crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo)
{
   return find_validation_entry(batch, bo) != NULL;
}

/**
 * Updates the state of the noop feature. Returns true if there was a noop
 * transition that led to state invalidation.
 */
bool
crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable)
{
   if (batch->noop_enabled == noop_enable)
      return false;

   batch->noop_enabled = noop_enable;

   crocus_batch_flush(batch);

   /* If the batch was empty, flush had no effect, so insert our noop. */
   if (crocus_batch_bytes_used(batch) == 0)
      crocus_batch_maybe_noop(batch);

   /* We only need to update the entire state if we transition from noop ->
    * not-noop.
    */
   return !batch->noop_enabled;
}