/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * Copyright © 2015 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

#include "amdgpu_cs.h"
#include "util/os_time.h"
#include <inttypes.h>
#include <stdio.h>

#include "amd/common/sid.h"

/* FENCES */

static struct pipe_fence_handle *
amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type)
{
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);

   fence->reference.count = 1;
   fence->ws = ctx->ws;
   fence->ctx = ctx;
   fence->fence.context = ctx->ctx;
   fence->fence.ip_type = ip_type;
   util_queue_fence_init(&fence->submitted);
   util_queue_fence_reset(&fence->submitted);
   p_atomic_inc(&ctx->refcount);
   return (struct pipe_fence_handle *)fence;
}

static struct pipe_fence_handle *
amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
   int r;

   if (!fence)
      return NULL;

   pipe_reference_init(&fence->reference, 1);
   fence->ws = ws;

   r = amdgpu_cs_import_syncobj(ws->dev, fd, &fence->syncobj);
   if (r) {
      FREE(fence);
      return NULL;
   }

   util_queue_fence_init(&fence->submitted);

   assert(amdgpu_fence_is_syncobj(fence));
   return (struct pipe_fence_handle*)fence;
}
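
/* Note (illustrative): fences that wrap a DRM syncobj, like the one created
 * above, never get an amdgpu_ctx; the "fence->ctx == NULL" convention noted
 * in amdgpu_fence_import_sync_file() below is what amdgpu_fence_is_syncobj()
 * relies on. A minimal usage sketch, assuming the radeon_winsys vtable that
 * amdgpu_cs_init_functions() sets up at the end of this file:
 *
 *    struct pipe_fence_handle *f = rws->fence_import_syncobj(rws, fd);
 *    if (f && rws->fence_wait(rws, f, OS_TIMEOUT_INFINITE))
 *       ;  // the imported syncobj has signalled
 *    rws->fence_reference(&f, NULL);
 */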

static struct pipe_fence_handle *
amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);

   if (!fence)
      return NULL;

   pipe_reference_init(&fence->reference, 1);
   fence->ws = ws;
   /* fence->ctx == NULL means that the fence is syncobj-based. */

   /* Convert sync_file into syncobj. */
   int r = amdgpu_cs_create_syncobj(ws->dev, &fence->syncobj);
   if (r) {
      FREE(fence);
      return NULL;
   }

   r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
   if (r) {
      amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
      FREE(fence);
      return NULL;
   }

   util_queue_fence_init(&fence->submitted);

   return (struct pipe_fence_handle*)fence;
}

static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
                                         struct pipe_fence_handle *pfence)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;

   if (amdgpu_fence_is_syncobj(fence)) {
      int fd, r;

      /* Convert syncobj into sync_file. */
      r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
      return r ? -1 : fd;
   }

   util_queue_fence_wait(&fence->submitted);

   /* Convert the amdgpu fence into a fence FD. */
   int fd;
   if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence,
                                 AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD,
                                 (uint32_t*)&fd))
      return -1;

   return fd;
}

static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   uint32_t syncobj;
   int fd = -1;

   int r = amdgpu_cs_create_syncobj2(ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED,
                                     &syncobj);
   if (r) {
      return -1;
   }

   r = amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, &fd);
   if (r) {
      fd = -1;
   }

   amdgpu_cs_destroy_syncobj(ws->dev, syncobj);
   return fd;
}

static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
                                   uint64_t seq_no,
                                   uint64_t *user_fence_cpu_address)
{
   struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;

   afence->fence.fence = seq_no;
   afence->user_fence_cpu_address = user_fence_cpu_address;
   util_queue_fence_signal(&afence->submitted);
}

static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
{
   struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;

   afence->signalled = true;
   util_queue_fence_signal(&afence->submitted);
}

bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                       bool absolute)
{
   struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
   uint32_t expired;
   int64_t abs_timeout;
   uint64_t *user_fence_cpu;
   int r;

   if (afence->signalled)
      return true;

   if (absolute)
      abs_timeout = timeout;
   else
      abs_timeout = os_time_get_absolute_timeout(timeout);

   /* Handle syncobjs. */
   if (amdgpu_fence_is_syncobj(afence)) {
      if (abs_timeout == OS_TIMEOUT_INFINITE)
         abs_timeout = INT64_MAX;

      if (amdgpu_cs_syncobj_wait(afence->ws->dev, &afence->syncobj, 1,
                                 abs_timeout, 0, NULL))
         return false;

      afence->signalled = true;
      return true;
   }

   /* The fence might not have a number assigned if its IB is being
    * submitted in the other thread right now. Wait until the submission
    * is done. */
   if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
      return false;

   user_fence_cpu = afence->user_fence_cpu_address;
   if (user_fence_cpu) {
      if (*user_fence_cpu >= afence->fence.fence) {
         afence->signalled = true;
         return true;
      }

      /* No timeout, just query: no need for the ioctl. */
      if (!absolute && !timeout)
         return false;
   }

   /* Now use the libdrm query. */
   r = amdgpu_cs_query_fence_status(&afence->fence,
                                    abs_timeout,
                                    AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE,
                                    &expired);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_cs_query_fence_status failed.\n");
      return false;
   }

   if (expired) {
      /* This variable can only transition from false to true, so it doesn't
       * matter if threads race for it. */
      afence->signalled = true;
      return true;
   }
   return false;
}
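
/* Note (summary of the path above): waiting first checks the user fence, the
 * sequence number the GPU writes back into ctx->user_fence_bo when the IB
 * completes, and only falls back to the amdgpu_cs_query_fence_status() ioctl
 * when that quick check is not conclusive. An illustrative non-blocking poll
 * on a fence obtained from a previous flush:
 *
 *    if (amdgpu_fence_wait(fence, 0, false))
 *       ;  // already signalled; no blocking wait was needed
 */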

static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
                                          struct pipe_fence_handle *fence,
                                          uint64_t timeout)
{
   return amdgpu_fence_wait(fence, timeout, false);
}

static struct pipe_fence_handle *
amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->noop)
      return NULL;

   if (cs->next_fence) {
      amdgpu_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = amdgpu_fence_create(cs->ctx,
                               cs->csc->ib[IB_MAIN].ip_type);
   if (!fence)
      return NULL;

   amdgpu_fence_reference(&cs->next_fence, fence);
   return fence;
}

/* CONTEXTS */

static uint32_t
radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
{
   switch (radeon_priority) {
   case RADEON_CTX_PRIORITY_REALTIME:
      return AMDGPU_CTX_PRIORITY_VERY_HIGH;
   case RADEON_CTX_PRIORITY_HIGH:
      return AMDGPU_CTX_PRIORITY_HIGH;
   case RADEON_CTX_PRIORITY_MEDIUM:
      return AMDGPU_CTX_PRIORITY_NORMAL;
   case RADEON_CTX_PRIORITY_LOW:
      return AMDGPU_CTX_PRIORITY_LOW;
   default:
      unreachable("Invalid context priority");
   }
}

static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws,
                                                   enum radeon_ctx_priority priority)
{
   struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
   int r;
   struct amdgpu_bo_alloc_request alloc_buffer = {};
   uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
   amdgpu_bo_handle buf_handle;

   if (!ctx)
      return NULL;

   ctx->ws = amdgpu_winsys(ws);
   ctx->refcount = 1;
   ctx->initial_num_total_rejected_cs = ctx->ws->num_total_rejected_cs;

   r = amdgpu_cs_ctx_create2(ctx->ws->dev, amdgpu_priority, &ctx->ctx);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
      goto error_create;
   }

   alloc_buffer.alloc_size = ctx->ws->info.gart_page_size;
   alloc_buffer.phys_alignment = ctx->ws->info.gart_page_size;
   alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;

   r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
      goto error_user_fence_alloc;
   }

   r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
      goto error_user_fence_map;
   }

   memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
   ctx->user_fence_bo = buf_handle;

   return (struct radeon_winsys_ctx*)ctx;

error_user_fence_map:
   amdgpu_bo_free(buf_handle);
error_user_fence_alloc:
   amdgpu_cs_ctx_free(ctx->ctx);
error_create:
   FREE(ctx);
   return NULL;
}
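
/* Note (illustrative): the GTT page allocated above is the per-context user
 * fence buffer. amdgpu_cs_create() points each command stream at a slot in it
 * (offset ip_type * 4), and amdgpu_cs_submit_ib() reads the same slot through
 * user_fence_cpu_address_base, which is why the whole page is zeroed here
 * before any sequence number can be compared against it.
 */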

static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
{
   amdgpu_ctx_unref((struct amdgpu_ctx*)rwctx);
}

static enum pipe_reset_status
amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
                              bool *needs_reset)
{
   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
   int r;

   if (needs_reset)
      *needs_reset = false;

   /* Return a failure due to a GPU hang. */
   if (ctx->ws->info.drm_minor >= 24) {
      uint64_t flags;

      if (full_reset_only &&
          ctx->initial_num_total_rejected_cs == ctx->ws->num_total_rejected_cs) {
         /* If the caller is only interested in full reset (= wants to ignore soft
          * recoveries), we can use the rejected cs count as a quick first check.
          */
         return PIPE_NO_RESET;
      }

      r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
      if (r) {
         fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r);
         return PIPE_NO_RESET;
      }

      if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
         if (needs_reset)
            *needs_reset = flags & AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
         if (flags & AMDGPU_CTX_QUERY2_FLAGS_GUILTY)
            return PIPE_GUILTY_CONTEXT_RESET;
         else
            return PIPE_INNOCENT_CONTEXT_RESET;
      }
   } else {
      uint32_t result, hangs;

      r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs);
      if (r) {
         fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r);
         return PIPE_NO_RESET;
      }

      if (needs_reset)
         *needs_reset = true;
      switch (result) {
      case AMDGPU_CTX_GUILTY_RESET:
         return PIPE_GUILTY_CONTEXT_RESET;
      case AMDGPU_CTX_INNOCENT_RESET:
         return PIPE_INNOCENT_CONTEXT_RESET;
      case AMDGPU_CTX_UNKNOWN_RESET:
         return PIPE_UNKNOWN_CONTEXT_RESET;
      }
   }

   /* Return a failure due to a rejected command submission. */
   if (ctx->ws->num_total_rejected_cs > ctx->initial_num_total_rejected_cs) {
      if (needs_reset)
         *needs_reset = true;
      return ctx->num_rejected_cs ? PIPE_GUILTY_CONTEXT_RESET :
                                    PIPE_INNOCENT_CONTEXT_RESET;
   }
   if (needs_reset)
      *needs_reset = false;
   return PIPE_NO_RESET;
}

/* COMMAND SUBMISSION */

static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
{
   return cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD &&
          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE &&
          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD_ENC &&
          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC &&
          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC &&
          cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_JPEG;
}

static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
{
   if (cs->has_chaining)
      return 4; /* for chaining */

   return 0;
}

static int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
                                struct amdgpu_cs_buffer *buffers, unsigned num_buffers)
{
   unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
   int i = cs->buffer_indices_hashlist[hash];

   /* not found or found */
   if (i < 0 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of buffers linearly. */
   for (int i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this buffer in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of buffers:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         cs->buffer_indices_hashlist[hash] = i & 0x7fff;
         return i;
      }
   }
   return -1;
}

int amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
{
   struct amdgpu_cs_buffer *buffers;
   int num_buffers;

   if (bo->bo) {
      buffers = cs->real_buffers;
      num_buffers = cs->num_real_buffers;
   } else if (!(bo->base.usage & RADEON_FLAG_SPARSE)) {
      buffers = cs->slab_buffers;
      num_buffers = cs->num_slab_buffers;
   } else {
      buffers = cs->sparse_buffers;
      num_buffers = cs->num_sparse_buffers;
   }

   return amdgpu_lookup_buffer(cs, bo, buffers, num_buffers);
}

static int
amdgpu_do_add_real_buffer(struct amdgpu_cs_context *cs,
                          struct amdgpu_winsys_bo *bo)
{
   struct amdgpu_cs_buffer *buffer;
   int idx;

   /* New buffer, check if the backing array is large enough. */
   if (cs->num_real_buffers >= cs->max_real_buffers) {
      unsigned new_max =
         MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3));
      struct amdgpu_cs_buffer *new_buffers;

      new_buffers = MALLOC(new_max * sizeof(*new_buffers));

      if (!new_buffers) {
         fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
         FREE(new_buffers);
         return -1;
      }

      memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers));

      FREE(cs->real_buffers);

      cs->max_real_buffers = new_max;
      cs->real_buffers = new_buffers;
   }

   idx = cs->num_real_buffers;
   buffer = &cs->real_buffers[idx];

   memset(buffer, 0, sizeof(*buffer));
   amdgpu_winsys_bo_reference(cs->ws, &buffer->bo, bo);
   cs->num_real_buffers++;

   return idx;
}

static int
amdgpu_lookup_or_add_real_buffer(struct radeon_cmdbuf *rcs, struct amdgpu_cs_context *cs,
                                 struct amdgpu_winsys_bo *bo)
{
   unsigned hash;
   int idx = amdgpu_lookup_buffer(cs, bo, cs->real_buffers, cs->num_real_buffers);

   if (idx >= 0)
      return idx;

   idx = amdgpu_do_add_real_buffer(cs, bo);

   hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
   cs->buffer_indices_hashlist[hash] = idx & 0x7fff;

   if (bo->base.placement & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (bo->base.placement & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return idx;
}

static int amdgpu_lookup_or_add_slab_buffer(struct radeon_cmdbuf *rcs,
                                            struct amdgpu_cs_context *cs,
                                            struct amdgpu_winsys_bo *bo)
{
   struct amdgpu_cs_buffer *buffer;
   unsigned hash;
   int idx = amdgpu_lookup_buffer(cs, bo, cs->slab_buffers, cs->num_slab_buffers);
   int real_idx;

   if (idx >= 0)
      return idx;

   real_idx = amdgpu_lookup_or_add_real_buffer(rcs, cs, bo->u.slab.real);
   if (real_idx < 0)
      return -1;

   /* New buffer, check if the backing array is large enough. */
   if (cs->num_slab_buffers >= cs->max_slab_buffers) {
      unsigned new_max =
         MAX2(cs->max_slab_buffers + 16, (unsigned)(cs->max_slab_buffers * 1.3));
      struct amdgpu_cs_buffer *new_buffers;

      new_buffers = REALLOC(cs->slab_buffers,
                            cs->max_slab_buffers * sizeof(*new_buffers),
                            new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "amdgpu_lookup_or_add_slab_buffer: allocation failed\n");
         return -1;
      }

      cs->max_slab_buffers = new_max;
      cs->slab_buffers = new_buffers;
   }

   idx = cs->num_slab_buffers;
   buffer = &cs->slab_buffers[idx];

   memset(buffer, 0, sizeof(*buffer));
   amdgpu_winsys_bo_reference(cs->ws, &buffer->bo, bo);
   buffer->slab_real_idx = real_idx;
   cs->num_slab_buffers++;

   hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
   cs->buffer_indices_hashlist[hash] = idx & 0x7fff;

   return idx;
}

static int amdgpu_lookup_or_add_sparse_buffer(struct radeon_cmdbuf *rcs,
                                              struct amdgpu_cs_context *cs,
                                              struct amdgpu_winsys_bo *bo)
{
   struct amdgpu_cs_buffer *buffer;
   unsigned hash;
   int idx = amdgpu_lookup_buffer(cs, bo, cs->sparse_buffers, cs->num_sparse_buffers);

   if (idx >= 0)
      return idx;

   /* New buffer, check if the backing array is large enough. */
   if (cs->num_sparse_buffers >= cs->max_sparse_buffers) {
      unsigned new_max =
         MAX2(cs->max_sparse_buffers + 16, (unsigned)(cs->max_sparse_buffers * 1.3));
      struct amdgpu_cs_buffer *new_buffers;

      new_buffers = REALLOC(cs->sparse_buffers,
                            cs->max_sparse_buffers * sizeof(*new_buffers),
                            new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "amdgpu_lookup_or_add_sparse_buffer: allocation failed\n");
         return -1;
      }

      cs->max_sparse_buffers = new_max;
      cs->sparse_buffers = new_buffers;
   }

   idx = cs->num_sparse_buffers;
   buffer = &cs->sparse_buffers[idx];

   memset(buffer, 0, sizeof(*buffer));
   amdgpu_winsys_bo_reference(cs->ws, &buffer->bo, bo);
   cs->num_sparse_buffers++;

   hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
   cs->buffer_indices_hashlist[hash] = idx & 0x7fff;

   /* We delay adding the backing buffers until we really have to. However,
    * we cannot delay accounting for memory use.
    */
   simple_mtx_lock(&bo->lock);

   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
      if (bo->base.placement & RADEON_DOMAIN_VRAM)
         rcs->used_vram_kb += backing->bo->base.size / 1024;
      else if (bo->base.placement & RADEON_DOMAIN_GTT)
         rcs->used_gart_kb += backing->bo->base.size / 1024;
   }

   simple_mtx_unlock(&bo->lock);

   return idx;
}

static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                     struct pb_buffer *buf,
                                     unsigned usage,
                                     enum radeon_bo_domain domains)
{
   /* Don't use the "domains" parameter. Amdgpu doesn't support changing
    * the buffer placement during command submission.
    */
   struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   struct amdgpu_cs_buffer *buffer;
   int index;

   /* Fast exit for no-op calls.
    * This is very effective with suballocators and linear uploaders that
    * are outside of the winsys.
    */
   if (bo == cs->last_added_bo &&
       (usage & cs->last_added_bo_usage) == usage)
      return cs->last_added_bo_index;

   if (!(bo->base.usage & RADEON_FLAG_SPARSE)) {
      if (!bo->bo) {
         index = amdgpu_lookup_or_add_slab_buffer(rcs, cs, bo);
         if (index < 0)
            return 0;

         buffer = &cs->slab_buffers[index];
         buffer->usage |= usage;
         cs->last_added_bo_usage = buffer->usage;

         index = buffer->slab_real_idx;
         buffer = &cs->real_buffers[index];
         buffer->usage |= usage & ~RADEON_USAGE_SYNCHRONIZED;
      } else {
         index = amdgpu_lookup_or_add_real_buffer(rcs, cs, bo);
         if (index < 0)
            return 0;

         buffer = &cs->real_buffers[index];
         buffer->usage |= usage;
         cs->last_added_bo_usage = buffer->usage;
      }
   } else {
      index = amdgpu_lookup_or_add_sparse_buffer(rcs, cs, bo);
      if (index < 0)
         return 0;

      buffer = &cs->sparse_buffers[index];
      buffer->usage |= usage;
      cs->last_added_bo_usage = buffer->usage;
   }

   cs->last_added_bo = bo;
   cs->last_added_bo_index = index;
   return index;
}

static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws,
                                 struct amdgpu_ib *ib,
                                 struct amdgpu_cs *cs)
{
   struct pb_buffer *pb;
   uint8_t *mapped;
   unsigned buffer_size;

   /* Always create a buffer that is at least as large as the maximum seen IB
    * size, aligned to a power of two (and multiplied by 4 to reduce internal
    * fragmentation if chaining is not available). Limit to 512k dwords, which
    * is the largest power of two that fits into the size field of the
    * INDIRECT_BUFFER packet.
    */
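   /* Illustrative arithmetic (hypothetical numbers, not taken from real
    * workloads): with chaining and max_ib_size = 20000 dwords, the code below
    * picks 4 * util_next_power_of_two(20000) = 4 * 32768 = 131072 bytes, and
    * then clamps the result to [MAX2(max_check_space_size, 32768), 2097152]
    * bytes, i.e. between 8K dwords and the 512K-dword INDIRECT_BUFFER limit.
    */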
   if (cs->has_chaining)
      buffer_size = 4 * util_next_power_of_two(ib->max_ib_size);
   else
      buffer_size = 4 * util_next_power_of_two(4 * ib->max_ib_size);

   const unsigned min_size = MAX2(ib->max_check_space_size, 8 * 1024 * 4);
   const unsigned max_size = 512 * 1024 * 4;

   buffer_size = MIN2(buffer_size, max_size);
   buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */

   enum radeon_bo_domain domain;
   unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING;

   if (cs->ip_type == AMD_IP_GFX ||
       cs->ip_type == AMD_IP_COMPUTE ||
       cs->ip_type == AMD_IP_SDMA) {
      domain = ws->info.smart_access_memory ? RADEON_DOMAIN_VRAM : RADEON_DOMAIN_GTT;
      flags |= RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC;
   } else {
      /* UVD/VCE */
      /* TODO: validate that UVD/VCE don't read from IBs and enable WC or even VRAM. */
      domain = RADEON_DOMAIN_GTT;
   }

   pb = amdgpu_bo_create(ws, buffer_size,
                         ws->info.gart_page_size,
                         domain, flags);
   if (!pb)
      return false;

   mapped = amdgpu_bo_map(&ws->dummy_ws.base, pb, NULL, PIPE_MAP_WRITE);
   if (!mapped) {
      radeon_bo_reference(&ws->dummy_ws.base, &pb, NULL);
      return false;
   }

   radeon_bo_reference(&ws->dummy_ws.base, &ib->big_ib_buffer, pb);
   radeon_bo_reference(&ws->dummy_ws.base, &pb, NULL);

   ib->ib_mapped = mapped;
   ib->used_ib_space = 0;

   return true;
}

static bool amdgpu_get_new_ib(struct amdgpu_winsys *ws,
                              struct radeon_cmdbuf *rcs,
                              struct amdgpu_ib *ib,
                              struct amdgpu_cs *cs)
{
   /* Small IBs are better than big IBs, because the GPU goes idle quicker
    * and there is less waiting for buffers and fences. Proof:
    *    http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
    */
   struct drm_amdgpu_cs_chunk_ib *info = &cs->csc->ib[ib->ib_type];
   /* This is the minimum size of a contiguous IB. */
   unsigned ib_size = 4 * 1024 * 4;

   /* Always allocate at least the size of the biggest cs_check_space call,
    * because precisely the last call might have requested this size.
    */
   ib_size = MAX2(ib_size, ib->max_check_space_size);

   if (!cs->has_chaining) {
      ib_size = MAX2(ib_size,
                     4 * MIN2(util_next_power_of_two(ib->max_ib_size),
                              IB_MAX_SUBMIT_DWORDS));
   }

   ib->max_ib_size = ib->max_ib_size - ib->max_ib_size / 32;

   rcs->prev_dw = 0;
   rcs->num_prev = 0;
   rcs->current.cdw = 0;
   rcs->current.buf = NULL;

   /* Allocate a new buffer for IBs if the current buffer is all used. */
   if (!ib->big_ib_buffer ||
       ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
      if (!amdgpu_ib_new_buffer(ws, ib, cs))
         return false;
   }

   info->va_start = amdgpu_winsys_bo(ib->big_ib_buffer)->va + ib->used_ib_space;
   info->ib_bytes = 0;
   /* ib_bytes is in dwords and the conversion to bytes will be done before
    * the CS ioctl. */
   ib->ptr_ib_size = &info->ib_bytes;
   ib->ptr_ib_size_inside_ib = false;

   amdgpu_cs_add_buffer(cs->main.rcs, ib->big_ib_buffer,
                        RADEON_USAGE_READ | RADEON_PRIO_IB, 0);

   rcs->current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);

   if (ib->ib_type == IB_MAIN)
      cs->csc->ib_main_addr = rcs->current.buf;

   ib_size = ib->big_ib_buffer->size - ib->used_ib_space;
   rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
   rcs->gpu_address = info->va_start;
   return true;
}

static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
{
   if (ib->ptr_ib_size_inside_ib) {
      *ib->ptr_ib_size = rcs->current.cdw |
                         S_3F2_CHAIN(1) | S_3F2_VALID(1);
   } else {
      *ib->ptr_ib_size = rcs->current.cdw;
   }
}

static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct radeon_cmdbuf *rcs,
                               struct amdgpu_ib *ib)
{
   amdgpu_set_ib_size(rcs, ib);
   ib->used_ib_space += rcs->current.cdw * 4;
   ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_alignment);
   ib->max_ib_size = MAX2(ib->max_ib_size, rcs->prev_dw + rcs->current.cdw);
}
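
/* Note (summary, no new behaviour): ib->ptr_ib_size is a level of indirection
 * over "where the final IB size must be written". It points either at the
 * ib_bytes field of the kernel IB chunk (set in amdgpu_get_new_ib()) or, once
 * the IB has been chained in amdgpu_cs_check_space(), at the size dword of
 * the INDIRECT_BUFFER packet inside the previous chunk, in which case the
 * CHAIN and VALID bits are OR'ed in above.
 */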

static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
                                   struct amdgpu_cs_context *cs,
                                   enum amd_ip_type ip_type)
{
   switch (ip_type) {
   case AMD_IP_SDMA:
      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_DMA;
      break;

   case AMD_IP_UVD:
      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD;
      break;

   case AMD_IP_UVD_ENC:
      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD_ENC;
      break;

   case AMD_IP_VCE:
      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCE;
      break;

   case AMD_IP_VCN_DEC:
      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_DEC;
      break;

   case AMD_IP_VCN_ENC:
      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_ENC;
      break;

   case AMD_IP_VCN_JPEG:
      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_JPEG;
      break;

   case AMD_IP_COMPUTE:
   case AMD_IP_GFX:
      cs->ib[IB_MAIN].ip_type = ip_type == AMD_IP_GFX ? AMDGPU_HW_IP_GFX :
                                                        AMDGPU_HW_IP_COMPUTE;

      /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache
       * invalidation is the beginning of IBs (the previous commit does that),
       * because completion of an IB doesn't care about the state of GPU caches,
       * but the beginning of an IB does. Draw calls from multiple IBs can be
       * executed in parallel, so draw calls from the current IB can finish after
       * the next IB starts drawing, and so the cache flush at the end of IB
       * is always late.
       */
      if (ws->info.drm_minor >= 26) {
         cs->ib[IB_PREAMBLE].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
         cs->ib[IB_MAIN].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
      }
      break;

   default:
      assert(0);
   }

   cs->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
   cs->ib[IB_PREAMBLE].ip_type = cs->ib[IB_MAIN].ip_type;

   cs->last_added_bo = NULL;
   return true;
}

static void cleanup_fence_list(struct amdgpu_fence_list *fences)
{
   for (unsigned i = 0; i < fences->num; i++)
      amdgpu_fence_reference(&fences->list[i], NULL);
   fences->num = 0;
}

static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
{
   unsigned i;

   for (i = 0; i < cs->num_real_buffers; i++) {
      amdgpu_winsys_bo_reference(ws, &cs->real_buffers[i].bo, NULL);
   }
   for (i = 0; i < cs->num_slab_buffers; i++) {
      amdgpu_winsys_bo_reference(ws, &cs->slab_buffers[i].bo, NULL);
   }
   for (i = 0; i < cs->num_sparse_buffers; i++) {
      amdgpu_winsys_bo_reference(ws, &cs->sparse_buffers[i].bo, NULL);
   }
   cleanup_fence_list(&cs->fence_dependencies);
   cleanup_fence_list(&cs->syncobj_dependencies);
   cleanup_fence_list(&cs->syncobj_to_signal);

   cs->num_real_buffers = 0;
   cs->num_slab_buffers = 0;
   cs->num_sparse_buffers = 0;
   amdgpu_fence_reference(&cs->fence, NULL);
   cs->last_added_bo = NULL;
}

static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
{
   amdgpu_cs_context_cleanup(ws, cs);
   FREE(cs->real_buffers);
   FREE(cs->slab_buffers);
   FREE(cs->sparse_buffers);
   FREE(cs->fence_dependencies.list);
   FREE(cs->syncobj_dependencies.list);
   FREE(cs->syncobj_to_signal.list);
}


static bool
amdgpu_cs_create(struct radeon_cmdbuf *rcs,
                 struct radeon_winsys_ctx *rwctx,
                 enum amd_ip_type ip_type,
                 void (*flush)(void *ctx, unsigned flags,
                               struct pipe_fence_handle **fence),
                 void *flush_ctx,
                 bool stop_exec_on_failure)
{
   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
   struct amdgpu_cs *cs;

   cs = CALLOC_STRUCT(amdgpu_cs);
   if (!cs) {
      return false;
   }

   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ctx->ws;
   cs->ctx = ctx;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;
   cs->ip_type = ip_type;
   cs->stop_exec_on_failure = stop_exec_on_failure;
   cs->noop = ctx->ws->noop_cs;
   cs->has_chaining = ctx->ws->info.gfx_level >= GFX7 &&
                      (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);

   struct amdgpu_cs_fence_info fence_info;
   fence_info.handle = cs->ctx->user_fence_bo;
   fence_info.offset = cs->ip_type * 4;
   amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);

   cs->main.ib_type = IB_MAIN;

   if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ip_type)) {
      FREE(cs);
      return false;
   }

   if (!amdgpu_init_cs_context(ctx->ws, &cs->csc2, ip_type)) {
      amdgpu_destroy_cs_context(ctx->ws, &cs->csc1);
      FREE(cs);
      return false;
   }

   memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));

   /* Set the first submission context as current. */
   rcs->csc = cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;

   /* Assign to both amdgpu_cs_context; only csc will use it. */
   cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
   cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;

   cs->csc1.ws = ctx->ws;
   cs->csc2.ws = ctx->ws;

   cs->main.rcs = rcs;
   rcs->priv = cs;

   if (!amdgpu_get_new_ib(ctx->ws, rcs, &cs->main, cs)) {
      amdgpu_destroy_cs_context(ctx->ws, &cs->csc2);
      amdgpu_destroy_cs_context(ctx->ws, &cs->csc1);
      FREE(cs);
      rcs->priv = NULL;
      return false;
   }

   p_atomic_inc(&ctx->ws->num_cs);
   return true;
}

static void amdgpu_cs_set_preamble(struct radeon_cmdbuf *cs, const uint32_t *preamble_ib,
                                   unsigned preamble_num_dw, bool preamble_changed)
{
   /* TODO: implement this properly */
   radeon_emit_array(cs, preamble_ib, preamble_num_dw);
}

static bool
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
                           unsigned preamble_num_dw)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys *ws = cs->ws;
   struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
   unsigned size = align(preamble_num_dw * 4, ws->info.ib_alignment);
   struct pb_buffer *preamble_bo;
   uint32_t *map;

   /* Create the preamble IB buffer. */
   preamble_bo = amdgpu_bo_create(ws, size, ws->info.ib_alignment,
                                  RADEON_DOMAIN_VRAM,
                                  RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                  RADEON_FLAG_GTT_WC |
                                  RADEON_FLAG_READ_ONLY);
   if (!preamble_bo)
      return false;

   map = (uint32_t*)amdgpu_bo_map(&ws->dummy_ws.base, preamble_bo, NULL,
                                  PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
   if (!map) {
      radeon_bo_reference(&ws->dummy_ws.base, &preamble_bo, NULL);
      return false;
   }

   /* Upload the preamble IB. */
   memcpy(map, preamble_ib, preamble_num_dw * 4);

   /* Pad the IB. */
   uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ip_type];
   while (preamble_num_dw & ib_pad_dw_mask)
      map[preamble_num_dw++] = PKT3_NOP_PAD;
   amdgpu_bo_unmap(&ws->dummy_ws.base, preamble_bo);

   for (unsigned i = 0; i < 2; i++) {
      csc[i]->ib[IB_PREAMBLE].va_start = amdgpu_winsys_bo(preamble_bo)->va;
      csc[i]->ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;

      csc[i]->ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
   }

   assert(!cs->preamble_ib_bo);
   cs->preamble_ib_bo = preamble_bo;

   amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
                        RADEON_USAGE_READ | RADEON_PRIO_IB, 0);
   return true;
}

static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
{
   return true;
}

static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_ib *ib = &cs->main;

   assert(rcs->current.cdw <= rcs->current.max_dw);

   unsigned requested_size = rcs->prev_dw + rcs->current.cdw + dw;

   if (requested_size > IB_MAX_SUBMIT_DWORDS)
      return false;

   if (rcs->current.max_dw - rcs->current.cdw >= dw)
      return true;

   unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
   unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
   /* 125% of the size for IB epilog. */
   unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
   ib->max_check_space_size = MAX2(ib->max_check_space_size,
                                   safe_byte_size);
   ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);

   if (!cs->has_chaining)
      return false;

   /* Allocate a new chunk */
   if (rcs->num_prev >= rcs->max_prev) {
      unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
      struct radeon_cmdbuf_chunk *new_prev;

      new_prev = REALLOC(rcs->prev,
                         sizeof(*new_prev) * rcs->max_prev,
                         sizeof(*new_prev) * new_max_prev);
      if (!new_prev)
         return false;

      rcs->prev = new_prev;
      rcs->max_prev = new_max_prev;
   }

   if (!amdgpu_ib_new_buffer(cs->ws, ib, cs))
      return false;

   assert(ib->used_ib_space == 0);
   uint64_t va = amdgpu_winsys_bo(ib->big_ib_buffer)->va;

   /* This space was originally reserved. */
   rcs->current.max_dw += cs_epilog_dw;

   /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
   uint32_t ib_pad_dw_mask = cs->ws->info.ib_pad_dw_mask[cs->ip_type];
   while ((rcs->current.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3)
      radeon_emit(rcs, PKT3_NOP_PAD);

   radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
   radeon_emit(rcs, va);
   radeon_emit(rcs, va >> 32);
   uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
   assert((rcs->current.cdw & ib_pad_dw_mask) == 0);

   assert((rcs->current.cdw & 7) == 0);
   assert(rcs->current.cdw <= rcs->current.max_dw);

   amdgpu_set_ib_size(rcs, ib);
   ib->ptr_ib_size = new_ptr_ib_size;
   ib->ptr_ib_size_inside_ib = true;

   /* Hook up the new chunk */
   rcs->prev[rcs->num_prev].buf = rcs->current.buf;
   rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
   rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
   rcs->num_prev++;

   rcs->prev_dw += rcs->current.cdw;
   rcs->current.cdw = 0;

   rcs->current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
   rcs->current.max_dw = ib->big_ib_buffer->size / 4 - cs_epilog_dw;
   rcs->gpu_address = va;

   amdgpu_cs_add_buffer(cs->main.rcs, ib->big_ib_buffer,
                        RADEON_USAGE_READ | RADEON_PRIO_IB, 0);

   return true;
}
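
/* Note (summary of the chaining path above): the 4 dwords reserved by
 * amdgpu_cs_epilog_dws() are exactly the INDIRECT_BUFFER packet emitted
 * there: one PKT3 header, two dwords of chained-IB address, and one size
 * dword that amdgpu_set_ib_size() patches later through ib->ptr_ib_size.
 */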

static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                          struct radeon_bo_list_item *list)
{
   struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
   int i;

   if (list) {
      for (i = 0; i < cs->num_real_buffers; i++) {
         list[i].bo_size = cs->real_buffers[i].bo->base.size;
         list[i].vm_address = cs->real_buffers[i].bo->va;
         list[i].priority_usage = cs->real_buffers[i].usage;
      }
   }
   return cs->num_real_buffers;
}

static void add_fence_to_list(struct amdgpu_fence_list *fences,
                              struct amdgpu_fence *fence)
{
   unsigned idx = fences->num++;

   if (idx >= fences->max) {
      unsigned size;
      const unsigned increment = 8;

      fences->max = idx + increment;
      size = fences->max * sizeof(fences->list[0]);
      fences->list = realloc(fences->list, size);
      /* Clear the newly-allocated elements. */
      memset(fences->list + idx, 0,
             increment * sizeof(fences->list[0]));
   }
   amdgpu_fence_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
}

static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
                                     struct amdgpu_fence *fence)
{
   struct amdgpu_cs_context *cs = acs->csc;

   /* Detect no-op dependencies only when there is only 1 ring,
    * because IBs on one ring are always executed one at a time.
    *
    * We always want no dependency between back-to-back gfx IBs, because
    * we need the parallelism between IBs for good performance.
    */
   if ((acs->ip_type == AMD_IP_GFX ||
        acs->ws->info.ip[acs->ip_type].num_queues == 1) &&
       !amdgpu_fence_is_syncobj(fence) &&
       fence->ctx == acs->ctx &&
       fence->fence.ip_type == cs->ib[IB_MAIN].ip_type)
      return true;

   return amdgpu_fence_wait((void *)fence, 0, false);
}

static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
                                           struct pipe_fence_handle *pfence,
                                           unsigned dependency_flags)
{
   struct amdgpu_cs *acs = amdgpu_cs(rws);
   struct amdgpu_cs_context *cs = acs->csc;
   struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;

   util_queue_fence_wait(&fence->submitted);

   if (is_noop_fence_dependency(acs, fence))
      return;

   if (amdgpu_fence_is_syncobj(fence))
      add_fence_to_list(&cs->syncobj_dependencies, fence);
   else
      add_fence_to_list(&cs->fence_dependencies, fence);
}

static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
                                             struct amdgpu_cs_context *cs,
                                             struct amdgpu_cs_buffer *buffer)
{
   struct amdgpu_winsys_bo *bo = buffer->bo;
   unsigned new_num_fences = 0;
   const unsigned num_fences = bo->num_fences;

   for (unsigned j = 0; j < num_fences; ++j) {
      struct amdgpu_fence *bo_fence = (void *)bo->fences[j];

      if (is_noop_fence_dependency(acs, bo_fence))
         continue;

      amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
      new_num_fences++;

      if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
         continue;

      add_fence_to_list(&cs->fence_dependencies, bo_fence);
   }

   for (unsigned j = new_num_fences; j < num_fences; ++j)
      amdgpu_fence_reference(&bo->fences[j], NULL);

   bo->num_fences = new_num_fences;
}

/* Add the given list of fences to the buffer's fence list.
 *
 * Must be called with the winsys bo_fence_lock held.
 */
void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
                       unsigned num_fences,
                       struct pipe_fence_handle **fences)
{
   if (bo->num_fences + num_fences > bo->max_fences) {
      unsigned new_max_fences = MAX2(bo->num_fences + num_fences, bo->max_fences * 2);
      struct pipe_fence_handle **new_fences =
         REALLOC(bo->fences,
                 bo->num_fences * sizeof(*new_fences),
                 new_max_fences * sizeof(*new_fences));
      if (likely(new_fences && new_max_fences < UINT16_MAX)) {
         bo->fences = new_fences;
         bo->max_fences = new_max_fences;
      } else {
         unsigned drop;

         fprintf(stderr, new_fences ? "amdgpu_add_fences: too many fences, dropping some\n"
                                    : "amdgpu_add_fences: allocation failure, dropping fence(s)\n");
         free(new_fences);

         if (!bo->num_fences)
            return;

         bo->num_fences--; /* prefer to keep the most recent fence if possible */
         amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);

         drop = bo->num_fences + num_fences - bo->max_fences;
         num_fences -= drop;
         fences += drop;
      }
   }

   unsigned bo_num_fences = bo->num_fences;

   for (unsigned i = 0; i < num_fences; ++i) {
      bo->fences[bo_num_fences] = NULL;
      amdgpu_fence_reference(&bo->fences[bo_num_fences], fences[i]);
      bo_num_fences++;
   }
   bo->num_fences = bo_num_fences;
}
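
/* Note (summary of the overflow path above): per-BO fence tracking is
 * best-effort; fences are only tracked while the count stays below
 * UINT16_MAX, and when growing the array fails or would exceed that limit,
 * existing fences are dropped with a warning so that the fences from the
 * current submission can still be attached.
 */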
"amdgpu_add_fences: too many fences, dropping some\n" 1305 : "amdgpu_add_fences: allocation failure, dropping fence(s)\n"); 1306 free(new_fences); 1307 1308 if (!bo->num_fences) 1309 return; 1310 1311 bo->num_fences--; /* prefer to keep the most recent fence if possible */ 1312 amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL); 1313 1314 drop = bo->num_fences + num_fences - bo->max_fences; 1315 num_fences -= drop; 1316 fences += drop; 1317 } 1318 } 1319 1320 unsigned bo_num_fences = bo->num_fences; 1321 1322 for (unsigned i = 0; i < num_fences; ++i) { 1323 bo->fences[bo_num_fences] = NULL; 1324 amdgpu_fence_reference(&bo->fences[bo_num_fences], fences[i]); 1325 bo_num_fences++; 1326 } 1327 bo->num_fences = bo_num_fences; 1328} 1329 1330static void amdgpu_inc_bo_num_active_ioctls(unsigned num_buffers, 1331 struct amdgpu_cs_buffer *buffers) 1332{ 1333 for (unsigned i = 0; i < num_buffers; i++) 1334 p_atomic_inc(&buffers[i].bo->num_active_ioctls); 1335} 1336 1337static void amdgpu_add_fence_dependencies_bo_list(struct amdgpu_cs *acs, 1338 struct amdgpu_cs_context *cs, 1339 struct pipe_fence_handle *fence, 1340 unsigned num_buffers, 1341 struct amdgpu_cs_buffer *buffers) 1342{ 1343 for (unsigned i = 0; i < num_buffers; i++) { 1344 struct amdgpu_cs_buffer *buffer = &buffers[i]; 1345 struct amdgpu_winsys_bo *bo = buffer->bo; 1346 1347 amdgpu_add_bo_fence_dependencies(acs, cs, buffer); 1348 amdgpu_add_fences(bo, 1, &fence); 1349 } 1350} 1351 1352/* Since the kernel driver doesn't synchronize execution between different 1353 * rings automatically, we have to add fence dependencies manually. 1354 */ 1355static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs, 1356 struct amdgpu_cs_context *cs) 1357{ 1358 amdgpu_add_fence_dependencies_bo_list(acs, cs, cs->fence, cs->num_real_buffers, cs->real_buffers); 1359 amdgpu_add_fence_dependencies_bo_list(acs, cs, cs->fence, cs->num_slab_buffers, cs->slab_buffers); 1360 amdgpu_add_fence_dependencies_bo_list(acs, cs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers); 1361} 1362 1363static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws, 1364 struct pipe_fence_handle *fence) 1365{ 1366 struct amdgpu_cs *acs = amdgpu_cs(rws); 1367 struct amdgpu_cs_context *cs = acs->csc; 1368 1369 assert(amdgpu_fence_is_syncobj((struct amdgpu_fence *)fence)); 1370 1371 add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence); 1372} 1373 1374/* Add backing of sparse buffers to the buffer list. 1375 * 1376 * This is done late, during submission, to keep the buffer list short before 1377 * submit, and to avoid managing fences for the backing buffers. 1378 */ 1379static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs) 1380{ 1381 for (unsigned i = 0; i < cs->num_sparse_buffers; ++i) { 1382 struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i]; 1383 struct amdgpu_winsys_bo *bo = buffer->bo; 1384 1385 simple_mtx_lock(&bo->lock); 1386 1387 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { 1388 /* We can directly add the buffer here, because we know that each 1389 * backing buffer occurs only once. 

static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
{
   struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
   struct amdgpu_winsys *ws = acs->ws;
   struct amdgpu_cs_context *cs = acs->cst;
   int i, r;
   uint32_t bo_list = 0;
   uint64_t seq_no = 0;
   bool has_user_fence = amdgpu_cs_has_user_fence(cs);
   bool use_bo_list_create = ws->info.drm_minor < 27;
   struct drm_amdgpu_bo_list_in bo_list_in;
   unsigned initial_num_real_buffers = cs->num_real_buffers;

   simple_mtx_lock(&ws->bo_fence_lock);
   amdgpu_add_fence_dependencies_bo_lists(acs, cs);
   simple_mtx_unlock(&ws->bo_fence_lock);

#if DEBUG
   /* Prepare the buffer list. */
   if (ws->debug_all_bos) {
      /* The buffer list contains all buffers. This is a slow path that
       * ensures that no buffer is missing in the BO list.
       */
      unsigned num_handles = 0;
      struct drm_amdgpu_bo_list_entry *list =
         alloca(ws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
      struct amdgpu_winsys_bo *bo;

      simple_mtx_lock(&ws->global_bo_list_lock);
      LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) {
         list[num_handles].bo_handle = bo->u.real.kms_handle;
         list[num_handles].bo_priority = 0;
         ++num_handles;
      }

      r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers, list, &bo_list);
      simple_mtx_unlock(&ws->global_bo_list_lock);
      if (r) {
         fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
         goto cleanup;
      }
   } else
#endif
   {
      if (!amdgpu_add_sparse_backing_buffers(cs)) {
         fprintf(stderr, "amdgpu: amdgpu_add_sparse_backing_buffers failed\n");
         r = -ENOMEM;
         goto cleanup;
      }

      struct drm_amdgpu_bo_list_entry *list =
         alloca((cs->num_real_buffers + 2) * sizeof(struct drm_amdgpu_bo_list_entry));

      unsigned num_handles = 0;
      for (i = 0; i < cs->num_real_buffers; ++i) {
         struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i];

         list[num_handles].bo_handle = buffer->bo->u.real.kms_handle;
         list[num_handles].bo_priority =
            (util_last_bit(buffer->usage & RADEON_ALL_PRIORITIES) - 1) / 2;
         ++num_handles;
      }

      if (use_bo_list_create) {
         /* Legacy path creating the buffer list handle and passing it to the CS ioctl. */
         r = amdgpu_bo_list_create_raw(ws->dev, num_handles, list, &bo_list);
         if (r) {
            fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
            goto cleanup;
         }
      } else {
         /* Standard path passing the buffer list via the CS ioctl. */
         bo_list_in.operation = ~0;
         bo_list_in.list_handle = ~0;
         bo_list_in.bo_number = num_handles;
         bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
         bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)list;
      }
   }

   if (acs->ip_type == AMD_IP_GFX)
      ws->gfx_bo_list_counter += cs->num_real_buffers;

   bool noop = false;

   if (acs->stop_exec_on_failure && acs->ctx->num_rejected_cs) {
      r = -ECANCELED;
   } else {
      struct drm_amdgpu_cs_chunk chunks[7];
      unsigned num_chunks = 0;

      /* BO list */
      if (!use_bo_list_create) {
         chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
         chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
         chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
         num_chunks++;
      }

      /* Fence dependencies. */
      unsigned num_dependencies = cs->fence_dependencies.num;
      if (num_dependencies) {
         struct drm_amdgpu_cs_chunk_dep *dep_chunk =
            alloca(num_dependencies * sizeof(*dep_chunk));

         for (unsigned i = 0; i < num_dependencies; i++) {
            struct amdgpu_fence *fence =
               (struct amdgpu_fence*)cs->fence_dependencies.list[i];

            assert(util_queue_fence_is_signalled(&fence->submitted));
            amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
         }

         chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
         chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies;
         chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
         num_chunks++;
      }

      /* Syncobj dependencies. */
      unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
      if (num_syncobj_dependencies) {
         struct drm_amdgpu_cs_chunk_sem *sem_chunk =
            alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));

         for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
            struct amdgpu_fence *fence =
               (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];

            if (!amdgpu_fence_is_syncobj(fence))
               continue;

            assert(util_queue_fence_is_signalled(&fence->submitted));
            sem_chunk[i].handle = fence->syncobj;
         }

         chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
         chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
         chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
         num_chunks++;
      }

      /* Syncobj signals. */
      unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num;
      if (num_syncobj_to_signal) {
         struct drm_amdgpu_cs_chunk_sem *sem_chunk =
            alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));

         for (unsigned i = 0; i < num_syncobj_to_signal; i++) {
            struct amdgpu_fence *fence =
               (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];

            assert(amdgpu_fence_is_syncobj(fence));
            sem_chunk[i].handle = fence->syncobj;
         }

         chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
         chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4
                                        * num_syncobj_to_signal;
         chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
         num_chunks++;
      }

      /* Fence */
      if (has_user_fence) {
         chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
         chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
         chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
         num_chunks++;
      }

      /* IB */
      if (cs->ib[IB_PREAMBLE].ib_bytes) {
         chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
         chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
         chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PREAMBLE];
         num_chunks++;
      }

      /* IB */
      cs->ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
      chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
      chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
      chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN];
      num_chunks++;

      if (cs->secure) {
         cs->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
         cs->ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
      } else {
         cs->ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
         cs->ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
      }

      /* Apply RADEON_NOOP. */
      if (acs->noop) {
         if (acs->ip_type == AMD_IP_GFX) {
            /* Reduce the IB size and fill it with NOP to make it like an empty IB. */
            unsigned noop_size = MIN2(cs->ib[IB_MAIN].ib_bytes, ws->info.ib_alignment);

            cs->ib_main_addr[0] = PKT3(PKT3_NOP, noop_size / 4 - 2, 0);
            cs->ib[IB_MAIN].ib_bytes = noop_size;
         } else {
            noop = true;
         }
      }

      assert(num_chunks <= ARRAY_SIZE(chunks));

      r = 0;

      if (!noop) {
         /* The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
          * quite often, but it eventually succeeds after enough attempts. This happens frequently
          * with dEQP using NGG streamout.
          */
         do {
            /* Wait 1 ms and try again. */
            if (r == -ENOMEM)
               os_time_sleep(1000);

            r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
                                      num_chunks, chunks, &seq_no);
         } while (r == -ENOMEM);
      }
   }

   if (r) {
      if (r == -ECANCELED)
         fprintf(stderr, "amdgpu: The CS has been cancelled because the context is lost.\n");
      else
         fprintf(stderr, "amdgpu: The CS has been rejected, "
                 "see dmesg for more information (%i).\n", r);

      acs->ctx->num_rejected_cs++;
      ws->num_total_rejected_cs++;
   } else if (!noop) {
      /* Success. */
      uint64_t *user_fence = NULL;

      /* Need to reserve 4 QWORD for user fence:
       *   QWORD[0]: completed fence
       *   QWORD[1]: preempted fence
       *   QWORD[2]: reset fence
       *   QWORD[3]: preempted then reset
       */
      if (has_user_fence)
         user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
      amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
   }

   /* Cleanup. */
   if (bo_list)
      amdgpu_bo_list_destroy_raw(ws->dev, bo_list);

cleanup:
   /* If there was an error, signal the fence, because it won't be signalled
    * by the hardware. */
   if (r || noop)
      amdgpu_fence_signalled(cs->fence);

   cs->error_code = r;

   /* Only decrement num_active_ioctls for those buffers where we incremented it. */
   for (i = 0; i < initial_num_real_buffers; i++)
      p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls);
   for (i = 0; i < cs->num_slab_buffers; i++)
      p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);
   for (i = 0; i < cs->num_sparse_buffers; i++)
      p_atomic_dec(&cs->sparse_buffers[i].bo->num_active_ioctls);

   amdgpu_cs_context_cleanup(ws, cs);
}
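
/* Note (summary of the error handling above): when the submission fails or is
 * turned into a no-op, the pipe fence is signalled manually because the
 * hardware will never write it, and the errno is stored in cs->error_code so
 * that a synchronous amdgpu_cs_flush() can report it back to the driver.
 */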

/* Make sure the previous submission is completed. */
void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   util_queue_fence_wait(&cs->flush_completed);
}

static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
                           unsigned flags,
                           struct pipe_fence_handle **fence)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys *ws = cs->ws;
   int error_code = 0;
   uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ip_type];

   rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);

   /* Pad the IB according to the mask. */
   switch (cs->ip_type) {
   case AMD_IP_SDMA:
      if (ws->info.gfx_level <= GFX6) {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, SDMA_NOP_PAD);
      }
      break;
   case AMD_IP_GFX:
   case AMD_IP_COMPUTE:
      if (ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, PKT2_NOP_PAD);
      } else {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, PKT3_NOP_PAD);
      }
      if (cs->ip_type == AMD_IP_GFX)
         ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
      break;
   case AMD_IP_UVD:
   case AMD_IP_UVD_ENC:
      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   case AMD_IP_VCN_JPEG:
      if (rcs->current.cdw % 2)
         assert(0);
      while (rcs->current.cdw & ib_pad_dw_mask) {
         radeon_emit(rcs, 0x60000000); /* nop packet */
         radeon_emit(rcs, 0x00000000);
      }
      break;
   case AMD_IP_VCN_DEC:
      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x81ff); /* nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "amdgpu: command stream overflowed\n");
   }

   /* If the CS is not empty or overflowed.... */
   if (likely(radeon_emitted(rcs, 0) &&
       rcs->current.cdw <= rcs->current.max_dw &&
       !(flags & RADEON_FLUSH_NOOP))) {
      struct amdgpu_cs_context *cur = cs->csc;

      /* Set IB sizes. */
      amdgpu_ib_finalize(ws, rcs, &cs->main);

      /* Create a fence. */
      amdgpu_fence_reference(&cur->fence, NULL);
      if (cs->next_fence) {
         /* just move the reference */
         cur->fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         cur->fence = amdgpu_fence_create(cs->ctx,
                                          cur->ib[IB_MAIN].ip_type);
      }
      if (fence)
         amdgpu_fence_reference(fence, cur->fence);

      amdgpu_inc_bo_num_active_ioctls(cur->num_real_buffers, cur->real_buffers);
      amdgpu_inc_bo_num_active_ioctls(cur->num_slab_buffers, cur->slab_buffers);
      amdgpu_inc_bo_num_active_ioctls(cur->num_sparse_buffers, cur->sparse_buffers);

      amdgpu_cs_sync_flush(rcs);

      /* Swap command streams. "cst" is going to be submitted. */
      rcs->csc = cs->csc = cs->cst;
      cs->cst = cur;

      /* Submit. */
      util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
                         amdgpu_cs_submit_ib, NULL, 0);

      if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
         cs->csc->secure = !cs->cst->secure;
      else
         cs->csc->secure = cs->cst->secure;

      if (!(flags & PIPE_FLUSH_ASYNC)) {
         amdgpu_cs_sync_flush(rcs);
         error_code = cur->error_code;
      }
   } else {
      if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
         cs->csc->secure = !cs->csc->secure;
      amdgpu_cs_context_cleanup(ws, cs->csc);
   }

   memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));

   amdgpu_get_new_ib(ws, rcs, &cs->main, cs);

   if (cs->preamble_ib_bo) {
      amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
                           RADEON_USAGE_READ | RADEON_PRIO_IB, 0);
   }

   rcs->used_gart_kb = 0;
   rcs->used_vram_kb = 0;

   if (cs->ip_type == AMD_IP_GFX)
      ws->num_gfx_IBs++;
   else if (cs->ip_type == AMD_IP_SDMA)
      ws->num_sdma_IBs++;

   return error_code;
}
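
/* Note (summary of the flush path above): each amdgpu_cs owns two
 * amdgpu_cs_context structures. amdgpu_cs_flush() hands the current one
 * ("csc") to the submission thread as "cst" and immediately starts recording
 * into the other, so the driver can build the next IB while the previous one
 * is still being submitted; amdgpu_cs_sync_flush() is where the two meet.
 */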

static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);

   if (!cs)
      return;

   amdgpu_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL);
   radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main.big_ib_buffer, NULL);
   FREE(rcs->prev);
   amdgpu_destroy_cs_context(cs->ws, &cs->csc1);
   amdgpu_destroy_cs_context(cs->ws, &cs->csc2);
   amdgpu_fence_reference(&cs->next_fence, NULL);
   FREE(cs);
}

static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    unsigned usage)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;

   return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
}

void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws)
{
   ws->base.ctx_create = amdgpu_ctx_create;
   ws->base.ctx_destroy = amdgpu_ctx_destroy;
   ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
   ws->base.cs_create = amdgpu_cs_create;
   ws->base.cs_set_preamble = amdgpu_cs_set_preamble;
   ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
   ws->base.cs_destroy = amdgpu_cs_destroy;
   ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
   ws->base.cs_validate = amdgpu_cs_validate;
   ws->base.cs_check_space = amdgpu_cs_check_space;
   ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
   ws->base.cs_flush = amdgpu_cs_flush;
   ws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
   ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
   ws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
   ws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
   ws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
   ws->base.fence_reference = amdgpu_fence_reference;
   ws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
   ws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
   ws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
   ws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
}
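
/* Usage sketch (illustrative only; rcs, ctx, buf, fence, flush_cb and
 * flush_data are hypothetical driver-side objects, the entry points are the
 * ones assigned above, reached through a struct radeon_winsys *rws):
 *
 *    rws->cs_create(rcs, ctx, AMD_IP_GFX, flush_cb, flush_data, false);
 *    rws->cs_add_buffer(rcs, buf, RADEON_USAGE_READ, 0);
 *    ... emit packets with radeon_emit(rcs, ...) ...
 *    rws->cs_flush(rcs, PIPE_FLUSH_ASYNC, &fence);
 *    rws->fence_wait(rws, fence, OS_TIMEOUT_INFINITE);
 */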