1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include <assert.h> 25#include <stdbool.h> 26#include <string.h> 27#include <unistd.h> 28#include <fcntl.h> 29 30#include <xf86drm.h> 31 32#include "anv_private.h" 33#include "anv_measure.h" 34 35#include "genxml/gen8_pack.h" 36#include "genxml/genX_bits.h" 37#include "perf/intel_perf.h" 38 39#include "util/debug.h" 40#include "util/perf/u_trace.h" 41 42/** \file anv_batch_chain.c 43 * 44 * This file contains functions related to anv_cmd_buffer as a data 45 * structure. This involves everything required to create and destroy 46 * the actual batch buffers as well as link them together and handle 47 * relocations and surface state. It specifically does *not* contain any 48 * handling of actual vkCmd calls beyond vkCmdExecuteCommands. 49 */ 50 51/*-----------------------------------------------------------------------* 52 * Functions related to anv_reloc_list 53 *-----------------------------------------------------------------------*/ 54 55VkResult 56anv_reloc_list_init(struct anv_reloc_list *list, 57 const VkAllocationCallbacks *alloc) 58{ 59 memset(list, 0, sizeof(*list)); 60 return VK_SUCCESS; 61} 62 63static VkResult 64anv_reloc_list_init_clone(struct anv_reloc_list *list, 65 const VkAllocationCallbacks *alloc, 66 const struct anv_reloc_list *other_list) 67{ 68 list->num_relocs = other_list->num_relocs; 69 list->array_length = other_list->array_length; 70 71 if (list->num_relocs > 0) { 72 list->relocs = 73 vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, 74 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 75 if (list->relocs == NULL) 76 return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); 77 78 list->reloc_bos = 79 vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, 80 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 81 if (list->reloc_bos == NULL) { 82 vk_free(alloc, list->relocs); 83 return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); 84 } 85 86 memcpy(list->relocs, other_list->relocs, 87 list->array_length * sizeof(*list->relocs)); 88 memcpy(list->reloc_bos, other_list->reloc_bos, 89 list->array_length * sizeof(*list->reloc_bos)); 90 } else { 91 list->relocs = NULL; 92 list->reloc_bos = NULL; 93 } 94 95 list->dep_words = other_list->dep_words; 96 97 if (list->dep_words > 0) { 98 list->deps = 99 vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8, 100 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 101 memcpy(list->deps, other_list->deps, 102 list->dep_words * sizeof(BITSET_WORD)); 103 } else { 104 list->deps = NULL; 105 } 106 107 return VK_SUCCESS; 108} 109 110void 111anv_reloc_list_finish(struct anv_reloc_list *list, 112 const VkAllocationCallbacks *alloc) 113{ 114 vk_free(alloc, list->relocs); 115 vk_free(alloc, list->reloc_bos); 116 vk_free(alloc, list->deps); 117} 118 119static VkResult 120anv_reloc_list_grow(struct anv_reloc_list *list, 121 const VkAllocationCallbacks *alloc, 122 size_t num_additional_relocs) 123{ 124 if (list->num_relocs + num_additional_relocs <= list->array_length) 125 return VK_SUCCESS; 126 127 size_t new_length = MAX2(16, list->array_length * 2); 128 while (new_length < list->num_relocs + num_additional_relocs) 129 new_length *= 2; 130 131 struct drm_i915_gem_relocation_entry *new_relocs = 132 vk_realloc(alloc, list->relocs, 133 new_length * sizeof(*list->relocs), 8, 134 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 135 if (new_relocs == NULL) 136 return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); 137 list->relocs = new_relocs; 138 139 struct anv_bo **new_reloc_bos = 140 vk_realloc(alloc, list->reloc_bos, 141 new_length * sizeof(*list->reloc_bos), 8, 142 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 143 if (new_reloc_bos == NULL) 144 return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); 145 list->reloc_bos = new_reloc_bos; 146 147 list->array_length = new_length; 148 149 return VK_SUCCESS; 150} 151 152static VkResult 153anv_reloc_list_grow_deps(struct anv_reloc_list *list, 154 const VkAllocationCallbacks *alloc, 155 uint32_t min_num_words) 156{ 157 if (min_num_words <= list->dep_words) 158 return VK_SUCCESS; 159 160 uint32_t new_length = MAX2(32, list->dep_words * 2); 161 while (new_length < min_num_words) 162 new_length *= 2; 163 164 BITSET_WORD *new_deps = 165 vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8, 166 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 167 if (new_deps == NULL) 168 return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); 169 list->deps = new_deps; 170 171 /* Zero out the new data */ 172 memset(list->deps + list->dep_words, 0, 173 (new_length - list->dep_words) * sizeof(BITSET_WORD)); 174 list->dep_words = new_length; 175 176 return VK_SUCCESS; 177} 178 179#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) 180 181VkResult 182anv_reloc_list_add_bo(struct anv_reloc_list *list, 183 const VkAllocationCallbacks *alloc, 184 struct anv_bo *target_bo) 185{ 186 assert(!target_bo->is_wrapper); 187 assert(anv_bo_is_pinned(target_bo)); 188 189 uint32_t idx = target_bo->gem_handle; 190 VkResult result = anv_reloc_list_grow_deps(list, alloc, 191 (idx / BITSET_WORDBITS) + 1); 192 if (unlikely(result != VK_SUCCESS)) 193 return result; 194 195 BITSET_SET(list->deps, idx); 196 197 return VK_SUCCESS; 198} 199 200VkResult 201anv_reloc_list_add(struct anv_reloc_list *list, 202 const VkAllocationCallbacks *alloc, 203 uint32_t offset, struct anv_bo *target_bo, uint32_t delta, 204 uint64_t *address_u64_out) 205{ 206 struct drm_i915_gem_relocation_entry *entry; 207 int index; 208 209 struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo); 210 uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset); 211 if (address_u64_out) 212 *address_u64_out = target_bo_offset + delta; 213 214 assert(unwrapped_target_bo->gem_handle > 0); 215 assert(unwrapped_target_bo->refcount > 0); 216 217 if (anv_bo_is_pinned(unwrapped_target_bo)) 218 return anv_reloc_list_add_bo(list, alloc, unwrapped_target_bo); 219 220 VkResult result = anv_reloc_list_grow(list, alloc, 1); 221 if (result != VK_SUCCESS) 222 return result; 223 224 /* XXX: Can we use I915_EXEC_HANDLE_LUT? */ 225 index = list->num_relocs++; 226 list->reloc_bos[index] = target_bo; 227 entry = &list->relocs[index]; 228 entry->target_handle = -1; /* See also anv_cmd_buffer_process_relocs() */ 229 entry->delta = delta; 230 entry->offset = offset; 231 entry->presumed_offset = target_bo_offset; 232 entry->read_domains = 0; 233 entry->write_domain = 0; 234 VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry))); 235 236 return VK_SUCCESS; 237} 238 239static void 240anv_reloc_list_clear(struct anv_reloc_list *list) 241{ 242 list->num_relocs = 0; 243 if (list->dep_words > 0) 244 memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD)); 245} 246 247static VkResult 248anv_reloc_list_append(struct anv_reloc_list *list, 249 const VkAllocationCallbacks *alloc, 250 struct anv_reloc_list *other, uint32_t offset) 251{ 252 VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs); 253 if (result != VK_SUCCESS) 254 return result; 255 256 if (other->num_relocs > 0) { 257 memcpy(&list->relocs[list->num_relocs], &other->relocs[0], 258 other->num_relocs * sizeof(other->relocs[0])); 259 memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], 260 other->num_relocs * sizeof(other->reloc_bos[0])); 261 262 for (uint32_t i = 0; i < other->num_relocs; i++) 263 list->relocs[i + list->num_relocs].offset += offset; 264 265 list->num_relocs += other->num_relocs; 266 } 267 268 anv_reloc_list_grow_deps(list, alloc, other->dep_words); 269 for (uint32_t w = 0; w < other->dep_words; w++) 270 list->deps[w] |= other->deps[w]; 271 272 return VK_SUCCESS; 273} 274 275/*-----------------------------------------------------------------------* 276 * Functions related to anv_batch 277 *-----------------------------------------------------------------------*/ 278 279void * 280anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords) 281{ 282 if (batch->next + num_dwords * 4 > batch->end) { 283 VkResult result = batch->extend_cb(batch, batch->user_data); 284 if (result != VK_SUCCESS) { 285 anv_batch_set_error(batch, result); 286 return NULL; 287 } 288 } 289 290 void *p = batch->next; 291 292 batch->next += num_dwords * 4; 293 assert(batch->next <= batch->end); 294 295 return p; 296} 297 298struct anv_address 299anv_batch_address(struct anv_batch *batch, void *batch_location) 300{ 301 assert(batch->start <= batch_location); 302 303 /* Allow a jump at the current location of the batch. */ 304 assert(batch->next >= batch_location); 305 306 return anv_address_add(batch->start_addr, batch_location - batch->start); 307} 308 309void 310anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other) 311{ 312 uint32_t size, offset; 313 314 size = other->next - other->start; 315 assert(size % 4 == 0); 316 317 if (batch->next + size > batch->end) { 318 VkResult result = batch->extend_cb(batch, batch->user_data); 319 if (result != VK_SUCCESS) { 320 anv_batch_set_error(batch, result); 321 return; 322 } 323 } 324 325 assert(batch->next + size <= batch->end); 326 327 VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size)); 328 memcpy(batch->next, other->start, size); 329 330 offset = batch->next - batch->start; 331 VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc, 332 other->relocs, offset); 333 if (result != VK_SUCCESS) { 334 anv_batch_set_error(batch, result); 335 return; 336 } 337 338 batch->next += size; 339} 340 341/*-----------------------------------------------------------------------* 342 * Functions related to anv_batch_bo 343 *-----------------------------------------------------------------------*/ 344 345static VkResult 346anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer, 347 uint32_t size, 348 struct anv_batch_bo **bbo_out) 349{ 350 VkResult result; 351 352 struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo), 353 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 354 if (bbo == NULL) 355 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); 356 357 result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, 358 size, &bbo->bo); 359 if (result != VK_SUCCESS) 360 goto fail_alloc; 361 362 result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc); 363 if (result != VK_SUCCESS) 364 goto fail_bo_alloc; 365 366 *bbo_out = bbo; 367 368 return VK_SUCCESS; 369 370 fail_bo_alloc: 371 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); 372 fail_alloc: 373 vk_free(&cmd_buffer->vk.pool->alloc, bbo); 374 375 return result; 376} 377 378static VkResult 379anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer, 380 const struct anv_batch_bo *other_bbo, 381 struct anv_batch_bo **bbo_out) 382{ 383 VkResult result; 384 385 struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo), 386 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 387 if (bbo == NULL) 388 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); 389 390 result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, 391 other_bbo->bo->size, &bbo->bo); 392 if (result != VK_SUCCESS) 393 goto fail_alloc; 394 395 result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->vk.pool->alloc, 396 &other_bbo->relocs); 397 if (result != VK_SUCCESS) 398 goto fail_bo_alloc; 399 400 bbo->length = other_bbo->length; 401 memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length); 402 *bbo_out = bbo; 403 404 return VK_SUCCESS; 405 406 fail_bo_alloc: 407 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); 408 fail_alloc: 409 vk_free(&cmd_buffer->vk.pool->alloc, bbo); 410 411 return result; 412} 413 414static void 415anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch, 416 size_t batch_padding) 417{ 418 anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, }, 419 bbo->bo->map, bbo->bo->size - batch_padding); 420 batch->relocs = &bbo->relocs; 421 anv_reloc_list_clear(&bbo->relocs); 422} 423 424static void 425anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch, 426 size_t batch_padding) 427{ 428 batch->start_addr = (struct anv_address) { .bo = bbo->bo, }; 429 batch->start = bbo->bo->map; 430 batch->next = bbo->bo->map + bbo->length; 431 batch->end = bbo->bo->map + bbo->bo->size - batch_padding; 432 batch->relocs = &bbo->relocs; 433} 434 435static void 436anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch) 437{ 438 assert(batch->start == bbo->bo->map); 439 bbo->length = batch->next - batch->start; 440 VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length)); 441} 442 443static VkResult 444anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo, 445 struct anv_batch *batch, size_t additional, 446 size_t batch_padding) 447{ 448 assert(batch->start == bbo->bo->map); 449 bbo->length = batch->next - batch->start; 450 451 size_t new_size = bbo->bo->size; 452 while (new_size <= bbo->length + additional + batch_padding) 453 new_size *= 2; 454 455 if (new_size == bbo->bo->size) 456 return VK_SUCCESS; 457 458 struct anv_bo *new_bo; 459 VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, 460 new_size, &new_bo); 461 if (result != VK_SUCCESS) 462 return result; 463 464 memcpy(new_bo->map, bbo->bo->map, bbo->length); 465 466 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); 467 468 bbo->bo = new_bo; 469 anv_batch_bo_continue(bbo, batch, batch_padding); 470 471 return VK_SUCCESS; 472} 473 474static void 475anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer, 476 struct anv_batch_bo *prev_bbo, 477 struct anv_batch_bo *next_bbo, 478 uint32_t next_bbo_offset) 479{ 480 const uint32_t bb_start_offset = 481 prev_bbo->length - GFX8_MI_BATCH_BUFFER_START_length * 4; 482 ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset; 483 484 /* Make sure we're looking at a MI_BATCH_BUFFER_START */ 485 assert(((*bb_start >> 29) & 0x07) == 0); 486 assert(((*bb_start >> 23) & 0x3f) == 49); 487 488 if (anv_use_relocations(cmd_buffer->device->physical)) { 489 uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1; 490 assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4); 491 492 prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo; 493 prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset; 494 495 /* Use a bogus presumed offset to force a relocation */ 496 prev_bbo->relocs.relocs[reloc_idx].presumed_offset = -1; 497 } else { 498 assert(anv_bo_is_pinned(prev_bbo->bo)); 499 assert(anv_bo_is_pinned(next_bbo->bo)); 500 501 write_reloc(cmd_buffer->device, 502 prev_bbo->bo->map + bb_start_offset + 4, 503 next_bbo->bo->offset + next_bbo_offset, true); 504 } 505} 506 507static void 508anv_batch_bo_destroy(struct anv_batch_bo *bbo, 509 struct anv_cmd_buffer *cmd_buffer) 510{ 511 anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->vk.pool->alloc); 512 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); 513 vk_free(&cmd_buffer->vk.pool->alloc, bbo); 514} 515 516static VkResult 517anv_batch_bo_list_clone(const struct list_head *list, 518 struct anv_cmd_buffer *cmd_buffer, 519 struct list_head *new_list) 520{ 521 VkResult result = VK_SUCCESS; 522 523 list_inithead(new_list); 524 525 struct anv_batch_bo *prev_bbo = NULL; 526 list_for_each_entry(struct anv_batch_bo, bbo, list, link) { 527 struct anv_batch_bo *new_bbo = NULL; 528 result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo); 529 if (result != VK_SUCCESS) 530 break; 531 list_addtail(&new_bbo->link, new_list); 532 533 if (prev_bbo) 534 anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0); 535 536 prev_bbo = new_bbo; 537 } 538 539 if (result != VK_SUCCESS) { 540 list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) { 541 list_del(&bbo->link); 542 anv_batch_bo_destroy(bbo, cmd_buffer); 543 } 544 } 545 546 return result; 547} 548 549/*-----------------------------------------------------------------------* 550 * Functions related to anv_batch_bo 551 *-----------------------------------------------------------------------*/ 552 553static struct anv_batch_bo * 554anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer) 555{ 556 return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link); 557} 558 559struct anv_address 560anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer) 561{ 562 struct anv_state_pool *pool = anv_binding_table_pool(cmd_buffer->device); 563 struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); 564 return (struct anv_address) { 565 .bo = pool->block_pool.bo, 566 .offset = bt_block->offset - pool->start_offset, 567 }; 568} 569 570static void 571emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer, 572 struct anv_bo *bo, uint32_t offset) 573{ 574 /* In gfx8+ the address field grew to two dwords to accommodate 48 bit 575 * offsets. The high 16 bits are in the last dword, so we can use the gfx8 576 * version in either case, as long as we set the instruction length in the 577 * header accordingly. This means that we always emit three dwords here 578 * and all the padding and adjustment we do in this file works for all 579 * gens. 580 */ 581 582#define GFX7_MI_BATCH_BUFFER_START_length 2 583#define GFX7_MI_BATCH_BUFFER_START_length_bias 2 584 585 const uint32_t gfx7_length = 586 GFX7_MI_BATCH_BUFFER_START_length - GFX7_MI_BATCH_BUFFER_START_length_bias; 587 const uint32_t gfx8_length = 588 GFX8_MI_BATCH_BUFFER_START_length - GFX8_MI_BATCH_BUFFER_START_length_bias; 589 590 anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_START, bbs) { 591 bbs.DWordLength = cmd_buffer->device->info.ver < 8 ? 592 gfx7_length : gfx8_length; 593 bbs.SecondLevelBatchBuffer = Firstlevelbatch; 594 bbs.AddressSpaceIndicator = ASI_PPGTT; 595 bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset }; 596 } 597} 598 599static void 600cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer, 601 struct anv_batch_bo *bbo) 602{ 603 struct anv_batch *batch = &cmd_buffer->batch; 604 struct anv_batch_bo *current_bbo = 605 anv_cmd_buffer_current_batch_bo(cmd_buffer); 606 607 /* We set the end of the batch a little short so we would be sure we 608 * have room for the chaining command. Since we're about to emit the 609 * chaining command, let's set it back where it should go. 610 */ 611 batch->end += GFX8_MI_BATCH_BUFFER_START_length * 4; 612 assert(batch->end == current_bbo->bo->map + current_bbo->bo->size); 613 614 emit_batch_buffer_start(cmd_buffer, bbo->bo, 0); 615 616 anv_batch_bo_finish(current_bbo, batch); 617} 618 619static void 620anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from, 621 struct anv_cmd_buffer *cmd_buffer_to) 622{ 623 assert(!anv_use_relocations(cmd_buffer_from->device->physical)); 624 625 uint32_t *bb_start = cmd_buffer_from->batch_end; 626 627 struct anv_batch_bo *last_bbo = 628 list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link); 629 struct anv_batch_bo *first_bbo = 630 list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link); 631 632 struct GFX8_MI_BATCH_BUFFER_START gen_bb_start = { 633 __anv_cmd_header(GFX8_MI_BATCH_BUFFER_START), 634 .SecondLevelBatchBuffer = Firstlevelbatch, 635 .AddressSpaceIndicator = ASI_PPGTT, 636 .BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 }, 637 }; 638 struct anv_batch local_batch = { 639 .start = last_bbo->bo->map, 640 .end = last_bbo->bo->map + last_bbo->bo->size, 641 .relocs = &last_bbo->relocs, 642 .alloc = &cmd_buffer_from->vk.pool->alloc, 643 }; 644 645 __anv_cmd_pack(GFX8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start); 646 647 last_bbo->chained = true; 648} 649 650static void 651anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer) 652{ 653 assert(!anv_use_relocations(cmd_buffer->device->physical)); 654 655 struct anv_batch_bo *last_bbo = 656 list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link); 657 last_bbo->chained = false; 658 659 uint32_t *batch = cmd_buffer->batch_end; 660 anv_pack_struct(batch, GFX8_MI_BATCH_BUFFER_END, 661 __anv_cmd_header(GFX8_MI_BATCH_BUFFER_END)); 662} 663 664static VkResult 665anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data) 666{ 667 struct anv_cmd_buffer *cmd_buffer = _data; 668 struct anv_batch_bo *new_bbo = NULL; 669 /* Cap reallocation to chunk. */ 670 uint32_t alloc_size = MIN2(cmd_buffer->total_batch_size, 671 ANV_MAX_CMD_BUFFER_BATCH_SIZE); 672 673 VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo); 674 if (result != VK_SUCCESS) 675 return result; 676 677 cmd_buffer->total_batch_size += alloc_size; 678 679 struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos); 680 if (seen_bbo == NULL) { 681 anv_batch_bo_destroy(new_bbo, cmd_buffer); 682 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); 683 } 684 *seen_bbo = new_bbo; 685 686 cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo); 687 688 list_addtail(&new_bbo->link, &cmd_buffer->batch_bos); 689 690 anv_batch_bo_start(new_bbo, batch, GFX8_MI_BATCH_BUFFER_START_length * 4); 691 692 return VK_SUCCESS; 693} 694 695static VkResult 696anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data) 697{ 698 struct anv_cmd_buffer *cmd_buffer = _data; 699 struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); 700 701 anv_batch_bo_grow(cmd_buffer, bbo, &cmd_buffer->batch, 4096, 702 GFX8_MI_BATCH_BUFFER_START_length * 4); 703 704 return VK_SUCCESS; 705} 706 707/** Allocate a binding table 708 * 709 * This function allocates a binding table. This is a bit more complicated 710 * than one would think due to a combination of Vulkan driver design and some 711 * unfortunate hardware restrictions. 712 * 713 * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for 714 * the binding table pointer which means that all binding tables need to live 715 * in the bottom 64k of surface state base address. The way the GL driver has 716 * classically dealt with this restriction is to emit all surface states 717 * on-the-fly into the batch and have a batch buffer smaller than 64k. This 718 * isn't really an option in Vulkan for a couple of reasons: 719 * 720 * 1) In Vulkan, we have growing (or chaining) batches so surface states have 721 * to live in their own buffer and we have to be able to re-emit 722 * STATE_BASE_ADDRESS as needed which requires a full pipeline stall. In 723 * order to avoid emitting STATE_BASE_ADDRESS any more often than needed 724 * (it's not that hard to hit 64k of just binding tables), we allocate 725 * surface state objects up-front when VkImageView is created. In order 726 * for this to work, surface state objects need to be allocated from a 727 * global buffer. 728 * 729 * 2) We tried to design the surface state system in such a way that it's 730 * already ready for bindless texturing. The way bindless texturing works 731 * on our hardware is that you have a big pool of surface state objects 732 * (with its own state base address) and the bindless handles are simply 733 * offsets into that pool. With the architecture we chose, we already 734 * have that pool and it's exactly the same pool that we use for regular 735 * surface states so we should already be ready for bindless. 736 * 737 * 3) For render targets, we need to be able to fill out the surface states 738 * later in vkBeginRenderPass so that we can assign clear colors 739 * correctly. One way to do this would be to just create the surface 740 * state data and then repeatedly copy it into the surface state BO every 741 * time we have to re-emit STATE_BASE_ADDRESS. While this works, it's 742 * rather annoying and just being able to allocate them up-front and 743 * re-use them for the entire render pass. 744 * 745 * While none of these are technically blockers for emitting state on the fly 746 * like we do in GL, the ability to have a single surface state pool is 747 * simplifies things greatly. Unfortunately, it comes at a cost... 748 * 749 * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't 750 * place the binding tables just anywhere in surface state base address. 751 * Because 64k isn't a whole lot of space, we can't simply restrict the 752 * surface state buffer to 64k, we have to be more clever. The solution we've 753 * chosen is to have a block pool with a maximum size of 2G that starts at 754 * zero and grows in both directions. All surface states are allocated from 755 * the top of the pool (positive offsets) and we allocate blocks (< 64k) of 756 * binding tables from the bottom of the pool (negative offsets). Every time 757 * we allocate a new binding table block, we set surface state base address to 758 * point to the bottom of the binding table block. This way all of the 759 * binding tables in the block are in the bottom 64k of surface state base 760 * address. When we fill out the binding table, we add the distance between 761 * the bottom of our binding table block and zero of the block pool to the 762 * surface state offsets so that they are correct relative to out new surface 763 * state base address at the bottom of the binding table block. 764 * 765 * \see adjust_relocations_from_block_pool() 766 * \see adjust_relocations_too_block_pool() 767 * 768 * \param[in] entries The number of surface state entries the binding 769 * table should be able to hold. 770 * 771 * \param[out] state_offset The offset surface surface state base address 772 * where the surface states live. This must be 773 * added to the surface state offset when it is 774 * written into the binding table entry. 775 * 776 * \return An anv_state representing the binding table 777 */ 778struct anv_state 779anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer, 780 uint32_t entries, uint32_t *state_offset) 781{ 782 struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); 783 784 uint32_t bt_size = align_u32(entries * 4, 32); 785 786 struct anv_state state = cmd_buffer->bt_next; 787 if (bt_size > state.alloc_size) 788 return (struct anv_state) { 0 }; 789 790 state.alloc_size = bt_size; 791 cmd_buffer->bt_next.offset += bt_size; 792 cmd_buffer->bt_next.map += bt_size; 793 cmd_buffer->bt_next.alloc_size -= bt_size; 794 795 if (cmd_buffer->device->info.verx10 >= 125) { 796 /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding 797 * table address independently from surface state base address. We no 798 * longer need any sort of offsetting. 799 */ 800 *state_offset = 0; 801 } else { 802 assert(bt_block->offset < 0); 803 *state_offset = -bt_block->offset; 804 } 805 806 return state; 807} 808 809struct anv_state 810anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer) 811{ 812 struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; 813 return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 814 isl_dev->ss.size, isl_dev->ss.align); 815} 816 817struct anv_state 818anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer, 819 uint32_t size, uint32_t alignment) 820{ 821 return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream, 822 size, alignment); 823} 824 825VkResult 826anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer) 827{ 828 struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states); 829 if (bt_block == NULL) { 830 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); 831 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); 832 } 833 834 *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device); 835 836 /* The bt_next state is a rolling state (we update it as we suballocate 837 * from it) which is relative to the start of the binding table block. 838 */ 839 cmd_buffer->bt_next = *bt_block; 840 cmd_buffer->bt_next.offset = 0; 841 842 return VK_SUCCESS; 843} 844 845VkResult 846anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) 847{ 848 struct anv_batch_bo *batch_bo = NULL; 849 VkResult result; 850 851 list_inithead(&cmd_buffer->batch_bos); 852 853 cmd_buffer->total_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE; 854 855 result = anv_batch_bo_create(cmd_buffer, 856 cmd_buffer->total_batch_size, 857 &batch_bo); 858 if (result != VK_SUCCESS) 859 return result; 860 861 list_addtail(&batch_bo->link, &cmd_buffer->batch_bos); 862 863 cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc; 864 cmd_buffer->batch.user_data = cmd_buffer; 865 866 if (cmd_buffer->device->can_chain_batches) { 867 cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch; 868 } else { 869 cmd_buffer->batch.extend_cb = anv_cmd_buffer_grow_batch; 870 } 871 872 anv_batch_bo_start(batch_bo, &cmd_buffer->batch, 873 GFX8_MI_BATCH_BUFFER_START_length * 4); 874 875 int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8, 876 sizeof(struct anv_bo *)); 877 if (!success) 878 goto fail_batch_bo; 879 880 *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo; 881 882 success = u_vector_init(&cmd_buffer->bt_block_states, 8, 883 sizeof(struct anv_state)); 884 if (!success) 885 goto fail_seen_bbos; 886 887 result = anv_reloc_list_init(&cmd_buffer->surface_relocs, 888 &cmd_buffer->vk.pool->alloc); 889 if (result != VK_SUCCESS) 890 goto fail_bt_blocks; 891 cmd_buffer->last_ss_pool_center = 0; 892 893 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); 894 if (result != VK_SUCCESS) 895 goto fail_bt_blocks; 896 897 return VK_SUCCESS; 898 899 fail_bt_blocks: 900 u_vector_finish(&cmd_buffer->bt_block_states); 901 fail_seen_bbos: 902 u_vector_finish(&cmd_buffer->seen_bbos); 903 fail_batch_bo: 904 anv_batch_bo_destroy(batch_bo, cmd_buffer); 905 906 return result; 907} 908 909void 910anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) 911{ 912 struct anv_state *bt_block; 913 u_vector_foreach(bt_block, &cmd_buffer->bt_block_states) 914 anv_binding_table_pool_free(cmd_buffer->device, *bt_block); 915 u_vector_finish(&cmd_buffer->bt_block_states); 916 917 anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->vk.pool->alloc); 918 919 u_vector_finish(&cmd_buffer->seen_bbos); 920 921 /* Destroy all of the batch buffers */ 922 list_for_each_entry_safe(struct anv_batch_bo, bbo, 923 &cmd_buffer->batch_bos, link) { 924 list_del(&bbo->link); 925 anv_batch_bo_destroy(bbo, cmd_buffer); 926 } 927} 928 929void 930anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) 931{ 932 /* Delete all but the first batch bo */ 933 assert(!list_is_empty(&cmd_buffer->batch_bos)); 934 while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) { 935 struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); 936 list_del(&bbo->link); 937 anv_batch_bo_destroy(bbo, cmd_buffer); 938 } 939 assert(!list_is_empty(&cmd_buffer->batch_bos)); 940 941 anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer), 942 &cmd_buffer->batch, 943 GFX8_MI_BATCH_BUFFER_START_length * 4); 944 945 while (u_vector_length(&cmd_buffer->bt_block_states) > 1) { 946 struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states); 947 anv_binding_table_pool_free(cmd_buffer->device, *bt_block); 948 } 949 assert(u_vector_length(&cmd_buffer->bt_block_states) == 1); 950 cmd_buffer->bt_next = *(struct anv_state *)u_vector_head(&cmd_buffer->bt_block_states); 951 cmd_buffer->bt_next.offset = 0; 952 953 anv_reloc_list_clear(&cmd_buffer->surface_relocs); 954 cmd_buffer->last_ss_pool_center = 0; 955 956 /* Reset the list of seen buffers */ 957 cmd_buffer->seen_bbos.head = 0; 958 cmd_buffer->seen_bbos.tail = 0; 959 960 struct anv_batch_bo *first_bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); 961 962 *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo; 963 964 965 assert(!cmd_buffer->device->can_chain_batches || 966 first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE); 967 cmd_buffer->total_batch_size = first_bbo->bo->size; 968} 969 970void 971anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer) 972{ 973 struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer); 974 975 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { 976 /* When we start a batch buffer, we subtract a certain amount of 977 * padding from the end to ensure that we always have room to emit a 978 * BATCH_BUFFER_START to chain to the next BO. We need to remove 979 * that padding before we end the batch; otherwise, we may end up 980 * with our BATCH_BUFFER_END in another BO. 981 */ 982 cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4; 983 assert(cmd_buffer->batch.start == batch_bo->bo->map); 984 assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); 985 986 /* Save end instruction location to override it later. */ 987 cmd_buffer->batch_end = cmd_buffer->batch.next; 988 989 /* If we can chain this command buffer to another one, leave some place 990 * for the jump instruction. 991 */ 992 batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer); 993 if (batch_bo->chained) 994 emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); 995 else 996 anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_END, bbe); 997 998 /* Round batch up to an even number of dwords. */ 999 if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4) 1000 anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop); 1001 1002 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY; 1003 } else { 1004 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); 1005 /* If this is a secondary command buffer, we need to determine the 1006 * mode in which it will be executed with vkExecuteCommands. We 1007 * determine this statically here so that this stays in sync with the 1008 * actual ExecuteCommands implementation. 1009 */ 1010 const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start; 1011 if (!cmd_buffer->device->can_chain_batches) { 1012 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT; 1013 } else if (cmd_buffer->device->physical->use_call_secondary) { 1014 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN; 1015 /* If the secondary command buffer begins & ends in the same BO and 1016 * its length is less than the length of CS prefetch, add some NOOPs 1017 * instructions so the last MI_BATCH_BUFFER_START is outside the CS 1018 * prefetch. 1019 */ 1020 if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) { 1021 const struct intel_device_info *devinfo = &cmd_buffer->device->info; 1022 /* Careful to have everything in signed integer. */ 1023 int32_t prefetch_len = devinfo->cs_prefetch_size; 1024 int32_t batch_len = 1025 cmd_buffer->batch.next - cmd_buffer->batch.start; 1026 1027 for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4) 1028 anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop); 1029 } 1030 1031 void *jump_addr = 1032 anv_batch_emitn(&cmd_buffer->batch, 1033 GFX8_MI_BATCH_BUFFER_START_length, 1034 GFX8_MI_BATCH_BUFFER_START, 1035 .AddressSpaceIndicator = ASI_PPGTT, 1036 .SecondLevelBatchBuffer = Firstlevelbatch) + 1037 (GFX8_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8); 1038 cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr); 1039 1040 /* The emit above may have caused us to chain batch buffers which 1041 * would mean that batch_bo is no longer valid. 1042 */ 1043 batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer); 1044 } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) && 1045 (length < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2)) { 1046 /* If the secondary has exactly one batch buffer in its list *and* 1047 * that batch buffer is less than half of the maximum size, we're 1048 * probably better of simply copying it into our batch. 1049 */ 1050 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT; 1051 } else if (!(cmd_buffer->usage_flags & 1052 VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) { 1053 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN; 1054 1055 /* In order to chain, we need this command buffer to contain an 1056 * MI_BATCH_BUFFER_START which will jump back to the calling batch. 1057 * It doesn't matter where it points now so long as has a valid 1058 * relocation. We'll adjust it later as part of the chaining 1059 * process. 1060 * 1061 * We set the end of the batch a little short so we would be sure we 1062 * have room for the chaining command. Since we're about to emit the 1063 * chaining command, let's set it back where it should go. 1064 */ 1065 cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4; 1066 assert(cmd_buffer->batch.start == batch_bo->bo->map); 1067 assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); 1068 1069 emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); 1070 assert(cmd_buffer->batch.start == batch_bo->bo->map); 1071 } else { 1072 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN; 1073 } 1074 } 1075 1076 anv_batch_bo_finish(batch_bo, &cmd_buffer->batch); 1077} 1078 1079static VkResult 1080anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer, 1081 struct list_head *list) 1082{ 1083 list_for_each_entry(struct anv_batch_bo, bbo, list, link) { 1084 struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos); 1085 if (bbo_ptr == NULL) 1086 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); 1087 1088 *bbo_ptr = bbo; 1089 } 1090 1091 return VK_SUCCESS; 1092} 1093 1094void 1095anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, 1096 struct anv_cmd_buffer *secondary) 1097{ 1098 anv_measure_add_secondary(primary, secondary); 1099 switch (secondary->exec_mode) { 1100 case ANV_CMD_BUFFER_EXEC_MODE_EMIT: 1101 anv_batch_emit_batch(&primary->batch, &secondary->batch); 1102 break; 1103 case ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT: { 1104 struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(primary); 1105 unsigned length = secondary->batch.end - secondary->batch.start; 1106 anv_batch_bo_grow(primary, bbo, &primary->batch, length, 1107 GFX8_MI_BATCH_BUFFER_START_length * 4); 1108 anv_batch_emit_batch(&primary->batch, &secondary->batch); 1109 break; 1110 } 1111 case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: { 1112 struct anv_batch_bo *first_bbo = 1113 list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link); 1114 struct anv_batch_bo *last_bbo = 1115 list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link); 1116 1117 emit_batch_buffer_start(primary, first_bbo->bo, 0); 1118 1119 struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary); 1120 assert(primary->batch.start == this_bbo->bo->map); 1121 uint32_t offset = primary->batch.next - primary->batch.start; 1122 1123 /* Make the tail of the secondary point back to right after the 1124 * MI_BATCH_BUFFER_START in the primary batch. 1125 */ 1126 anv_batch_bo_link(primary, last_bbo, this_bbo, offset); 1127 1128 anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); 1129 break; 1130 } 1131 case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: { 1132 struct list_head copy_list; 1133 VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos, 1134 secondary, 1135 ©_list); 1136 if (result != VK_SUCCESS) 1137 return; /* FIXME */ 1138 1139 anv_cmd_buffer_add_seen_bbos(primary, ©_list); 1140 1141 struct anv_batch_bo *first_bbo = 1142 list_first_entry(©_list, struct anv_batch_bo, link); 1143 struct anv_batch_bo *last_bbo = 1144 list_last_entry(©_list, struct anv_batch_bo, link); 1145 1146 cmd_buffer_chain_to_batch_bo(primary, first_bbo); 1147 1148 list_splicetail(©_list, &primary->batch_bos); 1149 1150 anv_batch_bo_continue(last_bbo, &primary->batch, 1151 GFX8_MI_BATCH_BUFFER_START_length * 4); 1152 break; 1153 } 1154 case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: { 1155 struct anv_batch_bo *first_bbo = 1156 list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link); 1157 1158 uint64_t *write_return_addr = 1159 anv_batch_emitn(&primary->batch, 1160 GFX8_MI_STORE_DATA_IMM_length + 1 /* QWord write */, 1161 GFX8_MI_STORE_DATA_IMM, 1162 .Address = secondary->return_addr) 1163 + (GFX8_MI_STORE_DATA_IMM_ImmediateData_start / 8); 1164 1165 emit_batch_buffer_start(primary, first_bbo->bo, 0); 1166 1167 *write_return_addr = 1168 anv_address_physical(anv_batch_address(&primary->batch, 1169 primary->batch.next)); 1170 1171 anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); 1172 break; 1173 } 1174 default: 1175 assert(!"Invalid execution mode"); 1176 } 1177 1178 anv_reloc_list_append(&primary->surface_relocs, &primary->vk.pool->alloc, 1179 &secondary->surface_relocs, 0); 1180} 1181 1182struct anv_execbuf { 1183 struct drm_i915_gem_execbuffer2 execbuf; 1184 1185 struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences; 1186 1187 struct drm_i915_gem_exec_object2 * objects; 1188 uint32_t bo_count; 1189 struct anv_bo ** bos; 1190 1191 /* Allocated length of the 'objects' and 'bos' arrays */ 1192 uint32_t array_length; 1193 1194 uint32_t syncobj_count; 1195 uint32_t syncobj_array_length; 1196 struct drm_i915_gem_exec_fence * syncobjs; 1197 uint64_t * syncobj_values; 1198 1199 /* List of relocations for surface states, only used with platforms not 1200 * using softpin. 1201 */ 1202 void * surface_states_relocs; 1203 1204 uint32_t cmd_buffer_count; 1205 struct anv_query_pool *perf_query_pool; 1206 1207 /* Indicates whether any of the command buffers have relocations. This 1208 * doesn't not necessarily mean we'll need the kernel to process them. It 1209 * might be that a previous execbuf has already placed things in the VMA 1210 * and we can make i915 skip the relocations. 1211 */ 1212 bool has_relocs; 1213 1214 const VkAllocationCallbacks * alloc; 1215 VkSystemAllocationScope alloc_scope; 1216 1217 int perf_query_pass; 1218}; 1219 1220static void 1221anv_execbuf_init(struct anv_execbuf *exec) 1222{ 1223 memset(exec, 0, sizeof(*exec)); 1224} 1225 1226static void 1227anv_execbuf_finish(struct anv_execbuf *exec) 1228{ 1229 vk_free(exec->alloc, exec->syncobjs); 1230 vk_free(exec->alloc, exec->syncobj_values); 1231 vk_free(exec->alloc, exec->surface_states_relocs); 1232 vk_free(exec->alloc, exec->objects); 1233 vk_free(exec->alloc, exec->bos); 1234} 1235 1236static void 1237anv_execbuf_add_ext(struct anv_execbuf *exec, 1238 uint32_t ext_name, 1239 struct i915_user_extension *ext) 1240{ 1241 __u64 *iter = &exec->execbuf.cliprects_ptr; 1242 1243 exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS; 1244 1245 while (*iter != 0) { 1246 iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension; 1247 } 1248 1249 ext->name = ext_name; 1250 1251 *iter = (uintptr_t) ext; 1252} 1253 1254static VkResult 1255anv_execbuf_add_bo_bitset(struct anv_device *device, 1256 struct anv_execbuf *exec, 1257 uint32_t dep_words, 1258 BITSET_WORD *deps, 1259 uint32_t extra_flags); 1260 1261static VkResult 1262anv_execbuf_add_bo(struct anv_device *device, 1263 struct anv_execbuf *exec, 1264 struct anv_bo *bo, 1265 struct anv_reloc_list *relocs, 1266 uint32_t extra_flags) 1267{ 1268 struct drm_i915_gem_exec_object2 *obj = NULL; 1269 1270 bo = anv_bo_unwrap(bo); 1271 1272 if (bo->exec_obj_index < exec->bo_count && 1273 exec->bos[bo->exec_obj_index] == bo) 1274 obj = &exec->objects[bo->exec_obj_index]; 1275 1276 if (obj == NULL) { 1277 /* We've never seen this one before. Add it to the list and assign 1278 * an id that we can use later. 1279 */ 1280 if (exec->bo_count >= exec->array_length) { 1281 uint32_t new_len = exec->objects ? exec->array_length * 2 : 64; 1282 1283 struct drm_i915_gem_exec_object2 *new_objects = 1284 vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope); 1285 if (new_objects == NULL) 1286 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 1287 1288 struct anv_bo **new_bos = 1289 vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope); 1290 if (new_bos == NULL) { 1291 vk_free(exec->alloc, new_objects); 1292 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 1293 } 1294 1295 if (exec->objects) { 1296 memcpy(new_objects, exec->objects, 1297 exec->bo_count * sizeof(*new_objects)); 1298 memcpy(new_bos, exec->bos, 1299 exec->bo_count * sizeof(*new_bos)); 1300 } 1301 1302 vk_free(exec->alloc, exec->objects); 1303 vk_free(exec->alloc, exec->bos); 1304 1305 exec->objects = new_objects; 1306 exec->bos = new_bos; 1307 exec->array_length = new_len; 1308 } 1309 1310 assert(exec->bo_count < exec->array_length); 1311 1312 bo->exec_obj_index = exec->bo_count++; 1313 obj = &exec->objects[bo->exec_obj_index]; 1314 exec->bos[bo->exec_obj_index] = bo; 1315 1316 obj->handle = bo->gem_handle; 1317 obj->relocation_count = 0; 1318 obj->relocs_ptr = 0; 1319 obj->alignment = 0; 1320 obj->offset = bo->offset; 1321 obj->flags = bo->flags | extra_flags; 1322 obj->rsvd1 = 0; 1323 obj->rsvd2 = 0; 1324 } 1325 1326 if (extra_flags & EXEC_OBJECT_WRITE) { 1327 obj->flags |= EXEC_OBJECT_WRITE; 1328 obj->flags &= ~EXEC_OBJECT_ASYNC; 1329 } 1330 1331 if (relocs != NULL) { 1332 assert(obj->relocation_count == 0); 1333 1334 if (relocs->num_relocs > 0) { 1335 /* This is the first time we've ever seen a list of relocations for 1336 * this BO. Go ahead and set the relocations and then walk the list 1337 * of relocations and add them all. 1338 */ 1339 exec->has_relocs = true; 1340 obj->relocation_count = relocs->num_relocs; 1341 obj->relocs_ptr = (uintptr_t) relocs->relocs; 1342 1343 for (size_t i = 0; i < relocs->num_relocs; i++) { 1344 VkResult result; 1345 1346 /* A quick sanity check on relocations */ 1347 assert(relocs->relocs[i].offset < bo->size); 1348 result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i], 1349 NULL, extra_flags); 1350 if (result != VK_SUCCESS) 1351 return result; 1352 } 1353 } 1354 1355 return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words, 1356 relocs->deps, extra_flags); 1357 } 1358 1359 return VK_SUCCESS; 1360} 1361 1362/* Add BO dependencies to execbuf */ 1363static VkResult 1364anv_execbuf_add_bo_bitset(struct anv_device *device, 1365 struct anv_execbuf *exec, 1366 uint32_t dep_words, 1367 BITSET_WORD *deps, 1368 uint32_t extra_flags) 1369{ 1370 for (uint32_t w = 0; w < dep_words; w++) { 1371 BITSET_WORD mask = deps[w]; 1372 while (mask) { 1373 int i = u_bit_scan(&mask); 1374 uint32_t gem_handle = w * BITSET_WORDBITS + i; 1375 struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); 1376 assert(bo->refcount > 0); 1377 VkResult result = 1378 anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags); 1379 if (result != VK_SUCCESS) 1380 return result; 1381 } 1382 } 1383 1384 return VK_SUCCESS; 1385} 1386 1387static void 1388anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer, 1389 struct anv_reloc_list *list) 1390{ 1391 for (size_t i = 0; i < list->num_relocs; i++) { 1392 list->relocs[i].target_handle = 1393 anv_bo_unwrap(list->reloc_bos[i])->exec_obj_index; 1394 } 1395} 1396 1397static void 1398adjust_relocations_from_state_pool(struct anv_state_pool *pool, 1399 struct anv_reloc_list *relocs, 1400 uint32_t last_pool_center_bo_offset) 1401{ 1402 assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); 1403 uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; 1404 1405 for (size_t i = 0; i < relocs->num_relocs; i++) { 1406 /* All of the relocations from this block pool to other BO's should 1407 * have been emitted relative to the surface block pool center. We 1408 * need to add the center offset to make them relative to the 1409 * beginning of the actual GEM bo. 1410 */ 1411 relocs->relocs[i].offset += delta; 1412 } 1413} 1414 1415static void 1416adjust_relocations_to_state_pool(struct anv_state_pool *pool, 1417 struct anv_bo *from_bo, 1418 struct anv_reloc_list *relocs, 1419 uint32_t last_pool_center_bo_offset) 1420{ 1421 assert(!from_bo->is_wrapper); 1422 assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); 1423 uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; 1424 1425 /* When we initially emit relocations into a block pool, we don't 1426 * actually know what the final center_bo_offset will be so we just emit 1427 * it as if center_bo_offset == 0. Now that we know what the center 1428 * offset is, we need to walk the list of relocations and adjust any 1429 * relocations that point to the pool bo with the correct offset. 1430 */ 1431 for (size_t i = 0; i < relocs->num_relocs; i++) { 1432 if (relocs->reloc_bos[i] == pool->block_pool.bo) { 1433 /* Adjust the delta value in the relocation to correctly 1434 * correspond to the new delta. Initially, this value may have 1435 * been negative (if treated as unsigned), but we trust in 1436 * uint32_t roll-over to fix that for us at this point. 1437 */ 1438 relocs->relocs[i].delta += delta; 1439 1440 /* Since the delta has changed, we need to update the actual 1441 * relocated value with the new presumed value. This function 1442 * should only be called on batch buffers, so we know it isn't in 1443 * use by the GPU at the moment. 1444 */ 1445 assert(relocs->relocs[i].offset < from_bo->size); 1446 write_reloc(pool->block_pool.device, 1447 from_bo->map + relocs->relocs[i].offset, 1448 relocs->relocs[i].presumed_offset + 1449 relocs->relocs[i].delta, false); 1450 } 1451 } 1452} 1453 1454static void 1455anv_reloc_list_apply(struct anv_device *device, 1456 struct anv_reloc_list *list, 1457 struct anv_bo *bo, 1458 bool always_relocate) 1459{ 1460 bo = anv_bo_unwrap(bo); 1461 1462 for (size_t i = 0; i < list->num_relocs; i++) { 1463 struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]); 1464 if (list->relocs[i].presumed_offset == target_bo->offset && 1465 !always_relocate) 1466 continue; 1467 1468 void *p = bo->map + list->relocs[i].offset; 1469 write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true); 1470 list->relocs[i].presumed_offset = target_bo->offset; 1471 } 1472} 1473 1474/** 1475 * This function applies the relocation for a command buffer and writes the 1476 * actual addresses into the buffers as per what we were told by the kernel on 1477 * the previous execbuf2 call. This should be safe to do because, for each 1478 * relocated address, we have two cases: 1479 * 1480 * 1) The target BO is inactive (as seen by the kernel). In this case, it is 1481 * not in use by the GPU so updating the address is 100% ok. It won't be 1482 * in-use by the GPU (from our context) again until the next execbuf2 1483 * happens. If the kernel decides to move it in the next execbuf2, it 1484 * will have to do the relocations itself, but that's ok because it should 1485 * have all of the information needed to do so. 1486 * 1487 * 2) The target BO is active (as seen by the kernel). In this case, it 1488 * hasn't moved since the last execbuffer2 call because GTT shuffling 1489 * *only* happens when the BO is idle. (From our perspective, it only 1490 * happens inside the execbuffer2 ioctl, but the shuffling may be 1491 * triggered by another ioctl, with full-ppgtt this is limited to only 1492 * execbuffer2 ioctls on the same context, or memory pressure.) Since the 1493 * target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT 1494 * address and the relocated value we are writing into the BO will be the 1495 * same as the value that is already there. 1496 * 1497 * There is also a possibility that the target BO is active but the exact 1498 * RENDER_SURFACE_STATE object we are writing the relocation into isn't in 1499 * use. In this case, the address currently in the RENDER_SURFACE_STATE 1500 * may be stale but it's still safe to write the relocation because that 1501 * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and 1502 * won't be until the next execbuf2 call. 1503 * 1504 * By doing relocations on the CPU, we can tell the kernel that it doesn't 1505 * need to bother. We want to do this because the surface state buffer is 1506 * used by every command buffer so, if the kernel does the relocations, it 1507 * will always be busy and the kernel will always stall. This is also 1508 * probably the fastest mechanism for doing relocations since the kernel would 1509 * have to make a full copy of all the relocations lists. 1510 */ 1511static bool 1512execbuf_can_skip_relocations(struct anv_execbuf *exec) 1513{ 1514 if (!exec->has_relocs) 1515 return true; 1516 1517 static int userspace_relocs = -1; 1518 if (userspace_relocs < 0) 1519 userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true); 1520 if (!userspace_relocs) 1521 return false; 1522 1523 /* First, we have to check to see whether or not we can even do the 1524 * relocation. New buffers which have never been submitted to the kernel 1525 * don't have a valid offset so we need to let the kernel do relocations so 1526 * that we can get offsets for them. On future execbuf2 calls, those 1527 * buffers will have offsets and we will be able to skip relocating. 1528 * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1. 1529 */ 1530 for (uint32_t i = 0; i < exec->bo_count; i++) { 1531 assert(!exec->bos[i]->is_wrapper); 1532 if (exec->bos[i]->offset == (uint64_t)-1) 1533 return false; 1534 } 1535 1536 return true; 1537} 1538 1539static void 1540relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, 1541 struct anv_execbuf *exec) 1542{ 1543 /* Since surface states are shared between command buffers and we don't 1544 * know what order they will be submitted to the kernel, we don't know 1545 * what address is actually written in the surface state object at any 1546 * given time. The only option is to always relocate them. 1547 */ 1548 struct anv_bo *surface_state_bo = 1549 anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo); 1550 anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs, 1551 surface_state_bo, 1552 true /* always relocate surface states */); 1553 1554 /* Since we own all of the batch buffers, we know what values are stored 1555 * in the relocated addresses and only have to update them if the offsets 1556 * have changed. 1557 */ 1558 struct anv_batch_bo **bbo; 1559 u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { 1560 anv_reloc_list_apply(cmd_buffer->device, 1561 &(*bbo)->relocs, (*bbo)->bo, false); 1562 } 1563 1564 for (uint32_t i = 0; i < exec->bo_count; i++) 1565 exec->objects[i].offset = exec->bos[i]->offset; 1566} 1567 1568static void 1569reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer) 1570{ 1571 /* In the case where we fall back to doing kernel relocations, we need to 1572 * ensure that the relocation list is valid. All relocations on the batch 1573 * buffers are already valid and kept up-to-date. Since surface states are 1574 * shared between command buffers and we don't know what order they will be 1575 * submitted to the kernel, we don't know what address is actually written 1576 * in the surface state object at any given time. The only option is to set 1577 * a bogus presumed offset and let the kernel relocate them. 1578 */ 1579 for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++) 1580 cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1; 1581} 1582 1583static VkResult 1584anv_execbuf_add_syncobj(struct anv_device *device, 1585 struct anv_execbuf *exec, 1586 uint32_t syncobj, 1587 uint32_t flags, 1588 uint64_t timeline_value) 1589{ 1590 if (exec->syncobj_count >= exec->syncobj_array_length) { 1591 uint32_t new_len = MAX2(exec->syncobj_array_length * 2, 16); 1592 1593 struct drm_i915_gem_exec_fence *new_syncobjs = 1594 vk_alloc(exec->alloc, new_len * sizeof(*new_syncobjs), 1595 8, exec->alloc_scope); 1596 if (!new_syncobjs) 1597 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 1598 1599 if (exec->syncobjs) 1600 typed_memcpy(new_syncobjs, exec->syncobjs, exec->syncobj_count); 1601 1602 exec->syncobjs = new_syncobjs; 1603 1604 if (exec->syncobj_values) { 1605 uint64_t *new_syncobj_values = 1606 vk_alloc(exec->alloc, new_len * sizeof(*new_syncobj_values), 1607 8, exec->alloc_scope); 1608 if (!new_syncobj_values) 1609 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 1610 1611 typed_memcpy(new_syncobj_values, exec->syncobj_values, 1612 exec->syncobj_count); 1613 1614 exec->syncobj_values = new_syncobj_values; 1615 } 1616 1617 exec->syncobj_array_length = new_len; 1618 } 1619 1620 if (timeline_value && !exec->syncobj_values) { 1621 exec->syncobj_values = 1622 vk_zalloc(exec->alloc, exec->syncobj_array_length * 1623 sizeof(*exec->syncobj_values), 1624 8, exec->alloc_scope); 1625 if (!exec->syncobj_values) 1626 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); 1627 } 1628 1629 exec->syncobjs[exec->syncobj_count] = (struct drm_i915_gem_exec_fence) { 1630 .handle = syncobj, 1631 .flags = flags, 1632 }; 1633 if (timeline_value) 1634 exec->syncobj_values[exec->syncobj_count] = timeline_value; 1635 1636 exec->syncobj_count++; 1637 1638 return VK_SUCCESS; 1639} 1640 1641static VkResult 1642anv_execbuf_add_sync(struct anv_device *device, 1643 struct anv_execbuf *execbuf, 1644 struct vk_sync *sync, 1645 bool is_signal, 1646 uint64_t value) 1647{ 1648 /* It's illegal to signal a timeline with value 0 because that's never 1649 * higher than the current value. A timeline wait on value 0 is always 1650 * trivial because 0 <= uint64_t always. 1651 */ 1652 if ((sync->flags & VK_SYNC_IS_TIMELINE) && value == 0) 1653 return VK_SUCCESS; 1654 1655 if (vk_sync_is_anv_bo_sync(sync)) { 1656 struct anv_bo_sync *bo_sync = 1657 container_of(sync, struct anv_bo_sync, sync); 1658 1659 assert(is_signal == (bo_sync->state == ANV_BO_SYNC_STATE_RESET)); 1660 1661 return anv_execbuf_add_bo(device, execbuf, bo_sync->bo, NULL, 1662 is_signal ? EXEC_OBJECT_WRITE : 0); 1663 } else if (vk_sync_type_is_drm_syncobj(sync->type)) { 1664 struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(sync); 1665 1666 if (!(sync->flags & VK_SYNC_IS_TIMELINE)) 1667 value = 0; 1668 1669 return anv_execbuf_add_syncobj(device, execbuf, syncobj->syncobj, 1670 is_signal ? I915_EXEC_FENCE_SIGNAL : 1671 I915_EXEC_FENCE_WAIT, 1672 value); 1673 } 1674 1675 unreachable("Invalid sync type"); 1676} 1677 1678static VkResult 1679setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, 1680 struct anv_cmd_buffer *cmd_buffer) 1681{ 1682 struct anv_state_pool *ss_pool = 1683 &cmd_buffer->device->surface_state_pool; 1684 1685 adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs, 1686 cmd_buffer->last_ss_pool_center); 1687 VkResult result; 1688 if (anv_use_relocations(cmd_buffer->device->physical)) { 1689 /* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs 1690 * will get added automatically by processing relocations on the batch 1691 * buffer. We have to add the surface state BO manually because it has 1692 * relocations of its own that we need to be sure are processed. 1693 */ 1694 result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, 1695 ss_pool->block_pool.bo, 1696 &cmd_buffer->surface_relocs, 0); 1697 if (result != VK_SUCCESS) 1698 return result; 1699 } else { 1700 /* Add surface dependencies (BOs) to the execbuf */ 1701 anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf, 1702 cmd_buffer->surface_relocs.dep_words, 1703 cmd_buffer->surface_relocs.deps, 0); 1704 } 1705 1706 /* First, we walk over all of the bos we've seen and add them and their 1707 * relocations to the validate list. 1708 */ 1709 struct anv_batch_bo **bbo; 1710 u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { 1711 adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs, 1712 cmd_buffer->last_ss_pool_center); 1713 1714 result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, 1715 (*bbo)->bo, &(*bbo)->relocs, 0); 1716 if (result != VK_SUCCESS) 1717 return result; 1718 } 1719 1720 /* Now that we've adjusted all of the surface state relocations, we need to 1721 * record the surface state pool center so future executions of the command 1722 * buffer can adjust correctly. 1723 */ 1724 cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset; 1725 1726 return VK_SUCCESS; 1727} 1728 1729static void 1730chain_command_buffers(struct anv_cmd_buffer **cmd_buffers, 1731 uint32_t num_cmd_buffers) 1732{ 1733 if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) { 1734 assert(num_cmd_buffers == 1); 1735 return; 1736 } 1737 1738 /* Chain the N-1 first batch buffers */ 1739 for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) 1740 anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]); 1741 1742 /* Put an end to the last one */ 1743 anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]); 1744} 1745 1746static VkResult 1747setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, 1748 struct anv_queue *queue, 1749 struct anv_cmd_buffer **cmd_buffers, 1750 uint32_t num_cmd_buffers) 1751{ 1752 struct anv_device *device = queue->device; 1753 struct anv_state_pool *ss_pool = &device->surface_state_pool; 1754 VkResult result; 1755 1756 /* Edit the tail of the command buffers to chain them all together if they 1757 * can be. 1758 */ 1759 chain_command_buffers(cmd_buffers, num_cmd_buffers); 1760 1761 for (uint32_t i = 0; i < num_cmd_buffers; i++) { 1762 anv_measure_submit(cmd_buffers[i]); 1763 result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]); 1764 if (result != VK_SUCCESS) 1765 return result; 1766 } 1767 1768 /* Add all the global BOs to the object list for softpin case. */ 1769 if (!anv_use_relocations(device->physical)) { 1770 anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) { 1771 result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); 1772 if (result != VK_SUCCESS) 1773 return result; 1774 } 1775 1776 struct anv_block_pool *pool; 1777 pool = &device->dynamic_state_pool.block_pool; 1778 anv_block_pool_foreach_bo(bo, pool) { 1779 result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); 1780 if (result != VK_SUCCESS) 1781 return result; 1782 } 1783 1784 pool = &device->general_state_pool.block_pool; 1785 anv_block_pool_foreach_bo(bo, pool) { 1786 result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); 1787 if (result != VK_SUCCESS) 1788 return result; 1789 } 1790 1791 pool = &device->instruction_state_pool.block_pool; 1792 anv_block_pool_foreach_bo(bo, pool) { 1793 result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); 1794 if (result != VK_SUCCESS) 1795 return result; 1796 } 1797 1798 pool = &device->binding_table_pool.block_pool; 1799 anv_block_pool_foreach_bo(bo, pool) { 1800 result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); 1801 if (result != VK_SUCCESS) 1802 return result; 1803 } 1804 1805 /* Add the BOs for all user allocated memory objects because we can't 1806 * track after binding updates of VK_EXT_descriptor_indexing. 1807 */ 1808 list_for_each_entry(struct anv_device_memory, mem, 1809 &device->memory_objects, link) { 1810 result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0); 1811 if (result != VK_SUCCESS) 1812 return result; 1813 } 1814 } else { 1815 /* We do not support chaining primary command buffers without 1816 * softpin. 1817 */ 1818 assert(num_cmd_buffers == 1); 1819 } 1820 1821 bool no_reloc = true; 1822 if (execbuf->has_relocs) { 1823 no_reloc = execbuf_can_skip_relocations(execbuf); 1824 if (no_reloc) { 1825 /* If we were able to successfully relocate everything, tell the 1826 * kernel that it can skip doing relocations. The requirement for 1827 * using NO_RELOC is: 1828 * 1829 * 1) The addresses written in the objects must match the 1830 * corresponding reloc.presumed_offset which in turn must match 1831 * the corresponding execobject.offset. 1832 * 1833 * 2) To avoid stalling, execobject.offset should match the current 1834 * address of that object within the active context. 1835 * 1836 * In order to satisfy all of the invariants that make userspace 1837 * relocations to be safe (see relocate_cmd_buffer()), we need to 1838 * further ensure that the addresses we use match those used by the 1839 * kernel for the most recent execbuf2. 1840 * 1841 * The kernel may still choose to do relocations anyway if something 1842 * has moved in the GTT. In this case, the relocation list still 1843 * needs to be valid. All relocations on the batch buffers are 1844 * already valid and kept up-to-date. For surface state relocations, 1845 * by applying the relocations in relocate_cmd_buffer, we ensured 1846 * that the address in the RENDER_SURFACE_STATE matches 1847 * presumed_offset, so it should be safe for the kernel to relocate 1848 * them as needed. 1849 */ 1850 for (uint32_t i = 0; i < num_cmd_buffers; i++) { 1851 relocate_cmd_buffer(cmd_buffers[i], execbuf); 1852 1853 anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs, 1854 device->surface_state_pool.block_pool.bo, 1855 true /* always relocate surface states */); 1856 } 1857 } else { 1858 /* In the case where we fall back to doing kernel relocations, we 1859 * need to ensure that the relocation list is valid. All relocations 1860 * on the batch buffers are already valid and kept up-to-date. Since 1861 * surface states are shared between command buffers and we don't 1862 * know what order they will be submitted to the kernel, we don't 1863 * know what address is actually written in the surface state object 1864 * at any given time. The only option is to set a bogus presumed 1865 * offset and let the kernel relocate them. 1866 */ 1867 for (uint32_t i = 0; i < num_cmd_buffers; i++) 1868 reset_cmd_buffer_surface_offsets(cmd_buffers[i]); 1869 } 1870 } 1871 1872 struct anv_batch_bo *first_batch_bo = 1873 list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link); 1874 1875 /* The kernel requires that the last entry in the validation list be the 1876 * batch buffer to execute. We can simply swap the element 1877 * corresponding to the first batch_bo in the chain with the last 1878 * element in the list. 1879 */ 1880 if (first_batch_bo->bo->exec_obj_index != execbuf->bo_count - 1) { 1881 uint32_t idx = first_batch_bo->bo->exec_obj_index; 1882 uint32_t last_idx = execbuf->bo_count - 1; 1883 1884 struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; 1885 assert(execbuf->bos[idx] == first_batch_bo->bo); 1886 1887 execbuf->objects[idx] = execbuf->objects[last_idx]; 1888 execbuf->bos[idx] = execbuf->bos[last_idx]; 1889 execbuf->bos[idx]->exec_obj_index = idx; 1890 1891 execbuf->objects[last_idx] = tmp_obj; 1892 execbuf->bos[last_idx] = first_batch_bo->bo; 1893 first_batch_bo->bo->exec_obj_index = last_idx; 1894 } 1895 1896 /* If we are pinning our BOs, we shouldn't have to relocate anything */ 1897 if (!anv_use_relocations(device->physical)) 1898 assert(!execbuf->has_relocs); 1899 1900 /* Now we go through and fixup all of the relocation lists to point to the 1901 * correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to 1902 * do this after we reorder the list above as some of the indices may have 1903 * changed. 1904 */ 1905 struct anv_batch_bo **bbo; 1906 if (execbuf->has_relocs) { 1907 assert(num_cmd_buffers == 1); 1908 u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos) 1909 anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs); 1910 1911 anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs); 1912 } 1913 1914 if (device->physical->memory.need_clflush) { 1915 __builtin_ia32_mfence(); 1916 for (uint32_t i = 0; i < num_cmd_buffers; i++) { 1917 u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) { 1918 for (uint32_t l = 0; l < (*bbo)->length; l += CACHELINE_SIZE) 1919 __builtin_ia32_clflush((*bbo)->bo->map + l); 1920 } 1921 } 1922 } 1923 1924 struct anv_batch *batch = &cmd_buffers[0]->batch; 1925 execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { 1926 .buffers_ptr = (uintptr_t) execbuf->objects, 1927 .buffer_count = execbuf->bo_count, 1928 .batch_start_offset = 0, 1929 /* On platforms that cannot chain batch buffers because of the i915 1930 * command parser, we have to provide the batch length. Everywhere else 1931 * we'll chain batches so no point in passing a length. 1932 */ 1933 .batch_len = device->can_chain_batches ? 0 : batch->next - batch->start, 1934 .cliprects_ptr = 0, 1935 .num_cliprects = 0, 1936 .DR1 = 0, 1937 .DR4 = 0, 1938 .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? I915_EXEC_NO_RELOC : 0), 1939 .rsvd1 = device->context_id, 1940 .rsvd2 = 0, 1941 }; 1942 1943 return VK_SUCCESS; 1944} 1945 1946static VkResult 1947setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue) 1948{ 1949 struct anv_device *device = queue->device; 1950 VkResult result = anv_execbuf_add_bo(device, execbuf, 1951 device->trivial_batch_bo, 1952 NULL, 0); 1953 if (result != VK_SUCCESS) 1954 return result; 1955 1956 execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { 1957 .buffers_ptr = (uintptr_t) execbuf->objects, 1958 .buffer_count = execbuf->bo_count, 1959 .batch_start_offset = 0, 1960 .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */ 1961 .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC, 1962 .rsvd1 = device->context_id, 1963 .rsvd2 = 0, 1964 }; 1965 1966 return VK_SUCCESS; 1967} 1968 1969static VkResult 1970setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue, 1971 struct anv_utrace_flush_copy *flush) 1972{ 1973 struct anv_device *device = queue->device; 1974 VkResult result = anv_execbuf_add_bo(device, execbuf, 1975 flush->batch_bo, 1976 &flush->relocs, 0); 1977 if (result != VK_SUCCESS) 1978 return result; 1979 1980 result = anv_execbuf_add_sync(device, execbuf, flush->sync, 1981 true /* is_signal */, 0 /* value */); 1982 if (result != VK_SUCCESS) 1983 return result; 1984 1985 if (flush->batch_bo->exec_obj_index != execbuf->bo_count - 1) { 1986 uint32_t idx = flush->batch_bo->exec_obj_index; 1987 uint32_t last_idx = execbuf->bo_count - 1; 1988 1989 struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; 1990 assert(execbuf->bos[idx] == flush->batch_bo); 1991 1992 execbuf->objects[idx] = execbuf->objects[last_idx]; 1993 execbuf->bos[idx] = execbuf->bos[last_idx]; 1994 execbuf->bos[idx]->exec_obj_index = idx; 1995 1996 execbuf->objects[last_idx] = tmp_obj; 1997 execbuf->bos[last_idx] = flush->batch_bo; 1998 flush->batch_bo->exec_obj_index = last_idx; 1999 } 2000 2001 if (device->physical->memory.need_clflush) 2002 intel_flush_range(flush->batch_bo->map, flush->batch_bo->size); 2003 2004 execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { 2005 .buffers_ptr = (uintptr_t) execbuf->objects, 2006 .buffer_count = execbuf->bo_count, 2007 .batch_start_offset = 0, 2008 .batch_len = flush->batch.next - flush->batch.start, 2009 .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_FENCE_ARRAY | queue->exec_flags | 2010 (execbuf->has_relocs ? 0 : I915_EXEC_NO_RELOC), 2011 .rsvd1 = device->context_id, 2012 .rsvd2 = 0, 2013 .num_cliprects = execbuf->syncobj_count, 2014 .cliprects_ptr = (uintptr_t)execbuf->syncobjs, 2015 }; 2016 2017 return VK_SUCCESS; 2018} 2019 2020static VkResult 2021anv_queue_exec_utrace_locked(struct anv_queue *queue, 2022 struct anv_utrace_flush_copy *flush) 2023{ 2024 assert(flush->batch_bo); 2025 2026 struct anv_device *device = queue->device; 2027 struct anv_execbuf execbuf; 2028 anv_execbuf_init(&execbuf); 2029 execbuf.alloc = &device->vk.alloc; 2030 execbuf.alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE; 2031 2032 VkResult result = setup_utrace_execbuf(&execbuf, queue, flush); 2033 if (result != VK_SUCCESS) 2034 goto error; 2035 2036 int ret = queue->device->info.no_hw ? 0 : 2037 anv_gem_execbuffer(queue->device, &execbuf.execbuf); 2038 if (ret) 2039 result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); 2040 2041 struct drm_i915_gem_exec_object2 *objects = execbuf.objects; 2042 for (uint32_t k = 0; k < execbuf.bo_count; k++) { 2043 if (anv_bo_is_pinned(execbuf.bos[k])) 2044 assert(execbuf.bos[k]->offset == objects[k].offset); 2045 execbuf.bos[k]->offset = objects[k].offset; 2046 } 2047 2048 error: 2049 anv_execbuf_finish(&execbuf); 2050 2051 return result; 2052} 2053 2054/* We lock around execbuf for three main reasons: 2055 * 2056 * 1) When a block pool is resized, we create a new gem handle with a 2057 * different size and, in the case of surface states, possibly a different 2058 * center offset but we re-use the same anv_bo struct when we do so. If 2059 * this happens in the middle of setting up an execbuf, we could end up 2060 * with our list of BOs out of sync with our list of gem handles. 2061 * 2062 * 2) The algorithm we use for building the list of unique buffers isn't 2063 * thread-safe. While the client is supposed to synchronize around 2064 * QueueSubmit, this would be extremely difficult to debug if it ever came 2065 * up in the wild due to a broken app. It's better to play it safe and 2066 * just lock around QueueSubmit. 2067 * 2068 * 3) The anv_cmd_buffer_execbuf function may perform relocations in 2069 * userspace. Due to the fact that the surface state buffer is shared 2070 * between batches, we can't afford to have that happen from multiple 2071 * threads at the same time. Even though the user is supposed to ensure 2072 * this doesn't happen, we play it safe as in (2) above. 2073 * 2074 * Since the only other things that ever take the device lock such as block 2075 * pool resize only rarely happen, this will almost never be contended so 2076 * taking a lock isn't really an expensive operation in this case. 2077 */ 2078static VkResult 2079anv_queue_exec_locked(struct anv_queue *queue, 2080 uint32_t wait_count, 2081 const struct vk_sync_wait *waits, 2082 uint32_t cmd_buffer_count, 2083 struct anv_cmd_buffer **cmd_buffers, 2084 uint32_t signal_count, 2085 const struct vk_sync_signal *signals, 2086 struct anv_query_pool *perf_query_pool, 2087 uint32_t perf_query_pass) 2088{ 2089 struct anv_device *device = queue->device; 2090 struct anv_utrace_flush_copy *utrace_flush_data = NULL; 2091 struct anv_execbuf execbuf; 2092 anv_execbuf_init(&execbuf); 2093 execbuf.alloc = &queue->device->vk.alloc; 2094 execbuf.alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE; 2095 execbuf.perf_query_pass = perf_query_pass; 2096 2097 /* Flush the trace points first, they need to be moved */ 2098 VkResult result = 2099 anv_device_utrace_flush_cmd_buffers(queue, 2100 cmd_buffer_count, 2101 cmd_buffers, 2102 &utrace_flush_data); 2103 if (result != VK_SUCCESS) 2104 goto error; 2105 2106 if (utrace_flush_data && !utrace_flush_data->batch_bo) { 2107 result = anv_execbuf_add_sync(device, &execbuf, 2108 utrace_flush_data->sync, 2109 true /* is_signal */, 2110 0); 2111 if (result != VK_SUCCESS) 2112 goto error; 2113 2114 utrace_flush_data = NULL; 2115 } 2116 2117 /* Always add the workaround BO as it includes a driver identifier for the 2118 * error_state. 2119 */ 2120 result = 2121 anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0); 2122 if (result != VK_SUCCESS) 2123 goto error; 2124 2125 for (uint32_t i = 0; i < wait_count; i++) { 2126 result = anv_execbuf_add_sync(device, &execbuf, 2127 waits[i].sync, 2128 false /* is_signal */, 2129 waits[i].wait_value); 2130 if (result != VK_SUCCESS) 2131 goto error; 2132 } 2133 2134 for (uint32_t i = 0; i < signal_count; i++) { 2135 result = anv_execbuf_add_sync(device, &execbuf, 2136 signals[i].sync, 2137 true /* is_signal */, 2138 signals[i].signal_value); 2139 if (result != VK_SUCCESS) 2140 goto error; 2141 } 2142 2143 if (queue->sync) { 2144 result = anv_execbuf_add_sync(device, &execbuf, 2145 queue->sync, 2146 true /* is_signal */, 2147 0 /* signal_value */); 2148 if (result != VK_SUCCESS) 2149 goto error; 2150 } 2151 2152 if (cmd_buffer_count) { 2153 result = setup_execbuf_for_cmd_buffers(&execbuf, queue, 2154 cmd_buffers, 2155 cmd_buffer_count); 2156 } else { 2157 result = setup_empty_execbuf(&execbuf, queue); 2158 } 2159 2160 if (result != VK_SUCCESS) 2161 goto error; 2162 2163 const bool has_perf_query = 2164 perf_query_pool && perf_query_pass >= 0 && cmd_buffer_count; 2165 2166 if (INTEL_DEBUG(DEBUG_SUBMIT)) { 2167 fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0\n", 2168 execbuf.execbuf.batch_start_offset, execbuf.execbuf.batch_len); 2169 for (uint32_t i = 0; i < execbuf.bo_count; i++) { 2170 const struct anv_bo *bo = execbuf.bos[i]; 2171 2172 fprintf(stderr, " BO: addr=0x%016"PRIx64"-0x%016"PRIx64" size=0x%010"PRIx64 2173 " handle=%05u name=%s\n", 2174 bo->offset, bo->offset + bo->size - 1, bo->size, bo->gem_handle, bo->name); 2175 } 2176 } 2177 2178 if (INTEL_DEBUG(DEBUG_BATCH)) { 2179 fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues)); 2180 if (cmd_buffer_count) { 2181 if (has_perf_query) { 2182 struct anv_bo *pass_batch_bo = perf_query_pool->bo; 2183 uint64_t pass_batch_offset = 2184 khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass); 2185 2186 intel_print_batch(&device->decoder_ctx, 2187 pass_batch_bo->map + pass_batch_offset, 64, 2188 pass_batch_bo->offset + pass_batch_offset, false); 2189 } 2190 2191 for (uint32_t i = 0; i < cmd_buffer_count; i++) { 2192 struct anv_batch_bo **bo = 2193 u_vector_tail(&cmd_buffers[i]->seen_bbos); 2194 device->cmd_buffer_being_decoded = cmd_buffers[i]; 2195 intel_print_batch(&device->decoder_ctx, (*bo)->bo->map, 2196 (*bo)->bo->size, (*bo)->bo->offset, false); 2197 device->cmd_buffer_being_decoded = NULL; 2198 } 2199 } else { 2200 intel_print_batch(&device->decoder_ctx, 2201 device->trivial_batch_bo->map, 2202 device->trivial_batch_bo->size, 2203 device->trivial_batch_bo->offset, false); 2204 } 2205 } 2206 2207 if (execbuf.syncobj_values) { 2208 execbuf.timeline_fences.fence_count = execbuf.syncobj_count; 2209 execbuf.timeline_fences.handles_ptr = (uintptr_t)execbuf.syncobjs; 2210 execbuf.timeline_fences.values_ptr = (uintptr_t)execbuf.syncobj_values; 2211 anv_execbuf_add_ext(&execbuf, 2212 DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES, 2213 &execbuf.timeline_fences.base); 2214 } else if (execbuf.syncobjs) { 2215 execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; 2216 execbuf.execbuf.num_cliprects = execbuf.syncobj_count; 2217 execbuf.execbuf.cliprects_ptr = (uintptr_t)execbuf.syncobjs; 2218 } 2219 2220 if (has_perf_query) { 2221 assert(perf_query_pass < perf_query_pool->n_passes); 2222 struct intel_perf_query_info *query_info = 2223 perf_query_pool->pass_query[perf_query_pass]; 2224 2225 /* Some performance queries just the pipeline statistic HW, no need for 2226 * OA in that case, so no need to reconfigure. 2227 */ 2228 if (!INTEL_DEBUG(DEBUG_NO_OACONFIG) && 2229 (query_info->kind == INTEL_PERF_QUERY_TYPE_OA || 2230 query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) { 2231 int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, 2232 (void *)(uintptr_t) query_info->oa_metrics_set_id); 2233 if (ret < 0) { 2234 result = vk_device_set_lost(&device->vk, 2235 "i915-perf config failed: %s", 2236 strerror(errno)); 2237 } 2238 } 2239 2240 struct anv_bo *pass_batch_bo = perf_query_pool->bo; 2241 2242 struct drm_i915_gem_exec_object2 query_pass_object = { 2243 .handle = pass_batch_bo->gem_handle, 2244 .offset = pass_batch_bo->offset, 2245 .flags = pass_batch_bo->flags, 2246 }; 2247 struct drm_i915_gem_execbuffer2 query_pass_execbuf = { 2248 .buffers_ptr = (uintptr_t) &query_pass_object, 2249 .buffer_count = 1, 2250 .batch_start_offset = khr_perf_query_preamble_offset(perf_query_pool, 2251 perf_query_pass), 2252 .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags, 2253 .rsvd1 = device->context_id, 2254 }; 2255 2256 int ret = queue->device->info.no_hw ? 0 : 2257 anv_gem_execbuffer(queue->device, &query_pass_execbuf); 2258 if (ret) 2259 result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); 2260 } 2261 2262 int ret = queue->device->info.no_hw ? 0 : 2263 anv_gem_execbuffer(queue->device, &execbuf.execbuf); 2264 if (ret) 2265 result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); 2266 2267 if (queue->sync) { 2268 VkResult result = vk_sync_wait(&device->vk, 2269 queue->sync, 0, 2270 VK_SYNC_WAIT_COMPLETE, 2271 UINT64_MAX); 2272 if (result != VK_SUCCESS) 2273 result = vk_queue_set_lost(&queue->vk, "sync wait failed"); 2274 } 2275 2276 struct drm_i915_gem_exec_object2 *objects = execbuf.objects; 2277 for (uint32_t k = 0; k < execbuf.bo_count; k++) { 2278 if (anv_bo_is_pinned(execbuf.bos[k])) 2279 assert(execbuf.bos[k]->offset == objects[k].offset); 2280 execbuf.bos[k]->offset = objects[k].offset; 2281 } 2282 2283 error: 2284 anv_execbuf_finish(&execbuf); 2285 2286 if (result == VK_SUCCESS && utrace_flush_data) 2287 result = anv_queue_exec_utrace_locked(queue, utrace_flush_data); 2288 2289 return result; 2290} 2291 2292static inline bool 2293can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2) 2294{ 2295 return (!p1 || !p2 || p1 == p2); 2296} 2297 2298static VkResult 2299anv_queue_submit_locked(struct anv_queue *queue, 2300 struct vk_queue_submit *submit) 2301{ 2302 VkResult result; 2303 2304 if (submit->command_buffer_count == 0) { 2305 result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits, 2306 0 /* cmd_buffer_count */, 2307 NULL /* cmd_buffers */, 2308 submit->signal_count, submit->signals, 2309 NULL /* perf_query_pool */, 2310 0 /* perf_query_pass */); 2311 if (result != VK_SUCCESS) 2312 return result; 2313 } else { 2314 /* Everything's easier if we don't have to bother with container_of() */ 2315 STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0); 2316 struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers; 2317 struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers; 2318 uint32_t start = 0; 2319 uint32_t end = submit->command_buffer_count; 2320 struct anv_query_pool *perf_query_pool = 2321 cmd_buffers[start]->perf_query_pool; 2322 for (uint32_t n = 0; n < end; n++) { 2323 bool can_chain = false; 2324 uint32_t next = n + 1; 2325 /* Can we chain the last buffer into the next one? */ 2326 if (next < end && 2327 anv_cmd_buffer_is_chainable(cmd_buffers[next]) && 2328 can_chain_query_pools 2329 (cmd_buffers[next]->perf_query_pool, perf_query_pool)) { 2330 can_chain = true; 2331 perf_query_pool = 2332 perf_query_pool ? perf_query_pool : 2333 cmd_buffers[next]->perf_query_pool; 2334 } 2335 if (!can_chain) { 2336 /* The next buffer cannot be chained, or we have reached the 2337 * last buffer, submit what have been chained so far. 2338 */ 2339 VkResult result = 2340 anv_queue_exec_locked(queue, 2341 start == 0 ? submit->wait_count : 0, 2342 start == 0 ? submit->waits : NULL, 2343 next - start, &cmd_buffers[start], 2344 next == end ? submit->signal_count : 0, 2345 next == end ? submit->signals : NULL, 2346 perf_query_pool, 2347 submit->perf_pass_index); 2348 if (result != VK_SUCCESS) 2349 return result; 2350 if (next < end) { 2351 start = next; 2352 perf_query_pool = cmd_buffers[start]->perf_query_pool; 2353 } 2354 } 2355 } 2356 } 2357 for (uint32_t i = 0; i < submit->signal_count; i++) { 2358 if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync)) 2359 continue; 2360 2361 struct anv_bo_sync *bo_sync = 2362 container_of(submit->signals[i].sync, struct anv_bo_sync, sync); 2363 2364 /* Once the execbuf has returned, we need to set the fence state to 2365 * SUBMITTED. We can't do this before calling execbuf because 2366 * anv_GetFenceStatus does take the global device lock before checking 2367 * fence->state. 2368 * 2369 * We set the fence state to SUBMITTED regardless of whether or not the 2370 * execbuf succeeds because we need to ensure that vkWaitForFences() and 2371 * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or 2372 * VK_SUCCESS) in a finite amount of time even if execbuf fails. 2373 */ 2374 assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET); 2375 bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED; 2376 } 2377 2378 pthread_cond_broadcast(&queue->device->queue_submit); 2379 2380 return VK_SUCCESS; 2381} 2382 2383VkResult 2384anv_queue_submit(struct vk_queue *vk_queue, 2385 struct vk_queue_submit *submit) 2386{ 2387 struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk); 2388 struct anv_device *device = queue->device; 2389 VkResult result; 2390 2391 if (queue->device->info.no_hw) { 2392 for (uint32_t i = 0; i < submit->signal_count; i++) { 2393 result = vk_sync_signal(&device->vk, 2394 submit->signals[i].sync, 2395 submit->signals[i].signal_value); 2396 if (result != VK_SUCCESS) 2397 return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed"); 2398 } 2399 return VK_SUCCESS; 2400 } 2401 2402 uint64_t start_ts = intel_ds_begin_submit(queue->ds); 2403 2404 pthread_mutex_lock(&device->mutex); 2405 result = anv_queue_submit_locked(queue, submit); 2406 /* Take submission ID under lock */ 2407 pthread_mutex_unlock(&device->mutex); 2408 2409 intel_ds_end_submit(queue->ds, start_ts); 2410 2411 return result; 2412} 2413 2414VkResult 2415anv_queue_submit_simple_batch(struct anv_queue *queue, 2416 struct anv_batch *batch) 2417{ 2418 struct anv_device *device = queue->device; 2419 VkResult result = VK_SUCCESS; 2420 int err; 2421 2422 if (queue->device->info.no_hw) 2423 return VK_SUCCESS; 2424 2425 /* This is only used by device init so we can assume the queue is empty and 2426 * we aren't fighting with a submit thread. 2427 */ 2428 assert(vk_queue_is_empty(&queue->vk)); 2429 2430 uint32_t batch_size = align_u32(batch->next - batch->start, 8); 2431 2432 struct anv_bo *batch_bo = NULL; 2433 result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo); 2434 if (result != VK_SUCCESS) 2435 return result; 2436 2437 memcpy(batch_bo->map, batch->start, batch_size); 2438 if (device->physical->memory.need_clflush) 2439 intel_flush_range(batch_bo->map, batch_size); 2440 2441 struct anv_execbuf execbuf; 2442 anv_execbuf_init(&execbuf); 2443 execbuf.alloc = &queue->device->vk.alloc; 2444 execbuf.alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE; 2445 2446 result = anv_execbuf_add_bo(device, &execbuf, batch_bo, NULL, 0); 2447 if (result != VK_SUCCESS) 2448 goto fail; 2449 2450 if (INTEL_DEBUG(DEBUG_BATCH)) { 2451 intel_print_batch(&device->decoder_ctx, 2452 batch_bo->map, 2453 batch_bo->size, 2454 batch_bo->offset, false); 2455 } 2456 2457 execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { 2458 .buffers_ptr = (uintptr_t) execbuf.objects, 2459 .buffer_count = execbuf.bo_count, 2460 .batch_start_offset = 0, 2461 .batch_len = batch_size, 2462 .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC, 2463 .rsvd1 = device->context_id, 2464 .rsvd2 = 0, 2465 }; 2466 2467 err = anv_gem_execbuffer(device, &execbuf.execbuf); 2468 if (err) { 2469 result = vk_device_set_lost(&device->vk, "anv_gem_execbuffer failed: %m"); 2470 goto fail; 2471 } 2472 2473 result = anv_device_wait(device, batch_bo, INT64_MAX); 2474 if (result != VK_SUCCESS) { 2475 result = vk_device_set_lost(&device->vk, 2476 "anv_device_wait failed: %m"); 2477 goto fail; 2478 } 2479 2480fail: 2481 anv_execbuf_finish(&execbuf); 2482 anv_bo_pool_free(&device->batch_bo_pool, batch_bo); 2483 2484 return result; 2485} 2486