/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "drm-uapi/v3d_drm.h"

#include "broadcom/clif/clif_dump.h"
#include "util/libsync.h"
#include "util/os_time.h"
#include "vk_drm_syncobj.h"

#include <errno.h>
#include <time.h>

static void
v3dv_clif_dump(struct v3dv_device *device,
               struct v3dv_job *job,
               struct drm_v3d_submit_cl *submit)
{
   if (!(unlikely(V3D_DEBUG & (V3D_DEBUG_CL |
                               V3D_DEBUG_CL_NO_BIN |
                               V3D_DEBUG_CLIF))))
      return;

   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                           stderr,
                                           V3D_DEBUG & (V3D_DEBUG_CL |
                                                        V3D_DEBUG_CL_NO_BIN),
                                           V3D_DEBUG & V3D_DEBUG_CL_NO_BIN);

   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (void *)entry->key;
      char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                   bo->name, bo->offset);

      bool ok = v3dv_bo_map(device, bo, bo->size);
      if (!ok) {
         fprintf(stderr, "failed to map BO for clif_dump.\n");
         ralloc_free(name);
         goto free_clif;
      }
      clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);

      ralloc_free(name);
   }

   clif_dump(clif, submit);

free_clif:
   clif_dump_destroy(clif);
}

static VkResult
queue_wait_idle(struct v3dv_queue *queue,
                struct v3dv_submit_sync_info *sync_info)
{
   if (queue->device->pdevice->caps.multisync) {
      int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
                               queue->last_job_syncs.syncs, 3,
                               INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                               NULL);
      if (ret) {
         return vk_errorf(queue, VK_ERROR_DEVICE_LOST,
                          "syncobj wait failed: %m");
      }

      bool first = true;
      for (int i = 0; i < 3; i++) {
         if (!queue->last_job_syncs.first[i])
            first = false;
      }

      /* If we're not the first job, that means we're waiting on some
       * per-queue-type syncobj which transitively waited on the semaphores
       * so we can skip the semaphore wait.
       */
      if (first) {
         VkResult result = vk_sync_wait_many(&queue->device->vk,
                                             sync_info->wait_count,
                                             sync_info->waits,
                                             VK_SYNC_WAIT_COMPLETE,
                                             UINT64_MAX);
         if (result != VK_SUCCESS)
            return result;
      }
   } else {
      /* Without multisync, all the semaphores are baked into the one syncobj
       * at the start of each submit so we only need to wait on the one.
       */
      int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
                               &queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], 1,
                               INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                               NULL);
      if (ret) {
         return vk_errorf(queue, VK_ERROR_DEVICE_LOST,
                          "syncobj wait failed: %m");
      }
   }

   for (int i = 0; i < 3; i++)
      queue->last_job_syncs.first[i] = false;

   return VK_SUCCESS;
}
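
/* CPU job that resets the queries in the range [first, first + count) of a
 * query pool. Before resetting we wait for any GPU work that may still be
 * using the queries: the pool BO for occlusion queries and the per-query
 * last-job syncs for performance queries.
 */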
static VkResult
handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
                           struct v3dv_submit_sync_info *sync_info)
{
   struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
   assert(info->pool);

   /* We are about to reset query counters so we need to make sure that
    * the GPU is not using them. The exception is timestamp queries, since
    * we handle those on the CPU.
    */
   if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
      v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      struct vk_sync_wait waits[info->count];
      unsigned wait_count = 0;
      for (int i = 0; i < info->count; i++) {
         struct v3dv_query *query = &info->pool->queries[i];
         /* Only wait for a query if we've used it, otherwise we will be
          * waiting forever for the fence to become signaled.
          */
         if (query->maybe_available) {
            waits[wait_count] = (struct vk_sync_wait){
               .sync = info->pool->queries[i].perf.last_job_sync
            };
            wait_count++;
         }
      }

      VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
                                          VK_SYNC_WAIT_COMPLETE, UINT64_MAX);

      if (result != VK_SUCCESS)
         return result;
   }

   v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);

   return VK_SUCCESS;
}

static VkResult
export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
{
   int err;
   if (job->device->pdevice->caps.multisync) {
      static const enum v3dv_queue_type queues_to_sync[] = {
         V3DV_QUEUE_CL,
         V3DV_QUEUE_CSD,
      };

      for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
         enum v3dv_queue_type queue_type = queues_to_sync[i];
         int tmp_fd = -1;

         err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                        queue->last_job_syncs.syncs[queue_type],
                                        &tmp_fd);

         if (err) {
            close(*fd);
            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                             "sync file export failed: %m");
         }

         err = sync_accumulate("v3dv", fd, tmp_fd);

         if (err) {
            close(tmp_fd);
            close(*fd);
            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                             "failed to accumulate sync files: %m");
         }
      }
   } else {
      err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                     queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                     fd);

      if (err) {
         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                          "sync file export failed: %m");
      }
   }
   return VK_SUCCESS;
}
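
/* CPU job that marks a range of queries as "maybe available". For
 * performance queries we also import a sync file covering the last CL/CSD
 * jobs into each query's syncobj, so that retrieving results can wait for
 * the GPU work that produced them.
 */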
static VkResult
handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
{
   VkResult result = VK_SUCCESS;

   mtx_lock(&job->device->query_mutex);

   struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
   struct v3dv_queue *queue = &job->device->queue;

   int err = 0;
   int fd = -1;

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      result = export_perfmon_last_job_sync(queue, job, &fd);

      if (result != VK_SUCCESS)
         goto fail;

      assert(fd >= 0);
   }

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];

      if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
         uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
         err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
                                        syncobj, fd);

         if (err) {
            result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                               "sync file import failed: %m");
            goto fail;
         }
      }

      query->maybe_available = true;
   }

fail:
   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
      close(fd);

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return result;
}

static VkResult
handle_copy_query_results_cpu_job(struct v3dv_job *job)
{
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *bo = info->dst->mem->bo;

   /* Map the entire dst buffer for the CPU copy if needed */
   assert(!bo->map || bo->map_size == bo->size);
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint8_t *offset = ((uint8_t *) bo->map) +
                     info->offset + info->dst->mem_offset;
   v3dv_get_query_pool_results(job->device,
                               info->pool,
                               info->first,
                               info->count,
                               offset,
                               info->stride,
                               info->flags);

   return VK_SUCCESS;
}

static VkResult
handle_set_event_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
                         struct v3dv_submit_sync_info *sync_info)
{
   /* From the Vulkan 1.0 spec:
    *
    *    "When vkCmdSetEvent is submitted to a queue, it defines an execution
    *     dependency on commands that were submitted before it, and defines an
    *     event signal operation which sets the event to the signaled state.
    *     The first synchronization scope includes every command previously
    *     submitted to the same queue, including those in the same command
    *     buffer and batch".
    *
    * So we should wait for all prior work to be completed before signaling
    * the event; this includes all active CPU wait threads spawned for any
    * command buffer submitted *before* this.
    */

   VkResult result = queue_wait_idle(queue, sync_info);
   if (result != VK_SUCCESS)
      return result;

   struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set;
   p_atomic_set(&info->event->state, info->state);

   return VK_SUCCESS;
}
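
/* Helpers for CPU event-wait jobs: check_wait_events_complete() checks the
 * state of every event the job waits on, and handle_wait_events_cpu_job()
 * polls (with a small sleep) until all of them have been signaled.
 */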
static bool
check_wait_events_complete(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
   for (uint32_t i = 0; i < info->event_count; i++) {
      if (!p_atomic_read(&info->events[i]->state))
         return false;
   }
   return true;
}

static VkResult
handle_wait_events_cpu_job(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   /* Wait for events to be signaled */
   const useconds_t wait_interval_ms = 1;
   while (!check_wait_events_complete(job))
      usleep(wait_interval_ms * 1000);

   return VK_SUCCESS;
}

static VkResult
handle_copy_buffer_to_image_cpu_job(struct v3dv_queue *queue,
                                    struct v3dv_job *job,
                                    struct v3dv_submit_sync_info *sync_info)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE);
   struct v3dv_copy_buffer_to_image_cpu_job_info *info =
      &job->cpu.copy_buffer_to_image;

   /* Wait for all GPU work to finish first, since we may be accessing
    * the BOs involved in the operation.
    */
   VkResult result = queue_wait_idle(queue, sync_info);
   if (result != VK_SUCCESS)
      return result;

   /* Map BOs */
   struct v3dv_bo *dst_bo = info->image->mem->bo;
   assert(!dst_bo->map || dst_bo->map_size == dst_bo->size);
   if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   void *dst_ptr = dst_bo->map;

   struct v3dv_bo *src_bo = info->buffer->mem->bo;
   assert(!src_bo->map || src_bo->map_size == src_bo->size);
   if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   void *src_ptr = src_bo->map;

   const struct v3d_resource_slice *slice =
      &info->image->slices[info->mip_level];

   const struct pipe_box box = {
      info->image_offset.x, info->image_offset.y, info->base_layer,
      info->image_extent.width, info->image_extent.height, info->layer_count,
   };

   /* Copy each layer */
   for (uint32_t i = 0; i < info->layer_count; i++) {
      const uint32_t dst_offset =
         v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i);
      const uint32_t src_offset =
         info->buffer->mem_offset + info->buffer_offset +
         info->buffer_layer_stride * i;
      v3d_store_tiled_image(
         dst_ptr + dst_offset, slice->stride,
         src_ptr + src_offset, info->buffer_stride,
         slice->tiling, info->image->cpp, slice->padded_height, &box);
   }

   return VK_SUCCESS;
}
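
/* CPU job that implements a timestamp query. We wait for all previously
 * submitted work to complete, then record a CLOCK_MONOTONIC timestamp in
 * nanoseconds in the first query of the range and flag the whole range as
 * "maybe available".
 */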
static VkResult
handle_timestamp_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
                               struct v3dv_submit_sync_info *sync_info)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

   /* Wait for completion of all work queued before the timestamp query */
   VkResult result = queue_wait_idle(queue, sync_info);
   if (result != VK_SUCCESS)
      return result;

   mtx_lock(&job->device->query_mutex);

   /* Compute timestamp */
   struct timespec t;
   clock_gettime(CLOCK_MONOTONIC, &t);

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;
      if (i == 0)
         query->value = t.tv_sec * 1000000000ull + t.tv_nsec;
   }

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return VK_SUCCESS;
}

static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            struct v3dv_submit_sync_info *sync_info)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   /* Make sure the GPU is no longer using the indirect buffer */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE);

   /* Map the indirect buffer and read the dispatch parameters */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   struct v3dv_bo *bo = info->buffer->mem->bo;
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   assert(bo->map);

   const uint32_t offset = info->buffer->mem_offset + info->offset;
   const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
   if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
      return VK_SUCCESS;

   if (memcmp(group_counts, info->csd_job->csd.wg_count,
              sizeof(info->csd_job->csd.wg_count)) != 0) {
      v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
   }

   return VK_SUCCESS;
}
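
/* Processes the wait semaphores of a submission. Without multisync support
 * we merge all the wait semaphores into the single V3DV_QUEUE_ANY syncobj
 * (via sync files) so the next job submitted waits on all of them.
 */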
static VkResult
process_waits(struct v3dv_queue *queue,
              uint32_t count, struct vk_sync_wait *waits)
{
   struct v3dv_device *device = queue->device;
   VkResult result = VK_SUCCESS;
   int err = 0;

   if (count == 0)
      return VK_SUCCESS;

   /* If multisync is supported, we wait on semaphores in the first job
    * submitted to each of the individual queues. We don't need to
    * pre-populate the syncobjs.
    */
   if (queue->device->pdevice->caps.multisync)
      return VK_SUCCESS;

   int fd = -1;
   err = drmSyncobjExportSyncFile(device->pdevice->render_fd,
                                  queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                  &fd);
   if (err) {
      result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                         "sync file export failed: %m");
      goto fail;
   }

   for (uint32_t i = 0; i < count; i++) {
      uint32_t syncobj = vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
      int wait_fd = -1;

      err = drmSyncobjExportSyncFile(device->pdevice->render_fd,
                                     syncobj, &wait_fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file export failed: %m");
         goto fail;
      }

      err = sync_accumulate("v3dv", &fd, wait_fd);
      close(wait_fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file merge failed: %m");
         goto fail;
      }
   }

   err = drmSyncobjImportSyncFile(device->pdevice->render_fd,
                                  queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                  fd);
   if (err) {
      result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                         "sync file import failed: %m");
   }

fail:
   close(fd);
   return result;
}

static VkResult
process_signals(struct v3dv_queue *queue,
                uint32_t count, struct vk_sync_signal *signals)
{
   struct v3dv_device *device = queue->device;

   if (count == 0)
      return VK_SUCCESS;

   /* If multisync is supported, we are signalling semaphores in the last job
    * of the last command buffer and, therefore, we do not need to process any
    * semaphores here.
    */
   if (device->pdevice->caps.multisync)
      return VK_SUCCESS;

   int fd;
   drmSyncobjExportSyncFile(device->pdevice->render_fd,
                            queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                            &fd);
   if (fd == -1) {
      return vk_errorf(queue, VK_ERROR_UNKNOWN,
                       "sync file export failed: %m");
   }

   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < count; i++) {
      uint32_t syncobj = vk_sync_as_drm_syncobj(signals[i].sync)->syncobj;
      int err = drmSyncobjImportSyncFile(device->pdevice->render_fd,
                                         syncobj, fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file import failed: %m");
         break;
      }
   }

   assert(fd >= 0);
   close(fd);

   return result;
}

static void
multisync_free(struct v3dv_device *device,
               struct drm_v3d_multi_sync *ms)
{
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
}
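
/* Builds the list of in syncobjs for a multisync submission: the submit's
 * wait semaphores (only for the first job sent to a given GPU queue in the
 * batch) plus the last-job syncobjs of the queues this job must serialize
 * against. Returns NULL either when no input syncs are needed or on
 * allocation failure; callers can tell the two apart through *count.
 */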
static struct drm_v3d_sem *
set_in_syncs(struct v3dv_queue *queue,
             struct v3dv_job *job,
             enum v3dv_queue_type queue_sync,
             uint32_t *count,
             struct v3dv_submit_sync_info *sync_info)
{
   struct v3dv_device *device = queue->device;
   uint32_t n_syncs = 0;

   /* If this is the first job submitted to a given GPU queue in this cmd buf
    * batch, it has to wait on wait semaphores (if any) before running.
    */
   if (queue->last_job_syncs.first[queue_sync])
      n_syncs = sync_info->wait_count;

   /* If the serialize flag is set, the job needs to be serialized in the
    * corresponding queues. Notice that we may implement transfer operations
    * as either CL or TFU jobs.
    *
    * FIXME: maybe we could track more precisely if the source of a transfer
    * barrier is a CL and/or a TFU job.
    */
   bool sync_csd = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
   bool sync_tfu = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
   bool sync_cl = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
                                    V3DV_BARRIER_TRANSFER_BIT);
   *count = n_syncs;
   if (sync_cl)
      (*count)++;
   if (sync_tfu)
      (*count)++;
   if (sync_csd)
      (*count)++;

   if (!*count)
      return NULL;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   for (int i = 0; i < n_syncs; i++) {
      syncs[i].handle =
         vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
   }

   if (sync_cl)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];

   if (sync_csd)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];

   if (sync_tfu)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];

   assert(n_syncs == *count);
   return syncs;
}
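
/* Builds the list of out syncobjs for a multisync submission: the submit's
 * signal semaphores when signal_syncs is set, plus the last-job syncobj for
 * this queue type, which is always signaled so we can track the last job
 * submitted to each GPU queue.
 */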
static struct drm_v3d_sem *
set_out_syncs(struct v3dv_queue *queue,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              uint32_t *count,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;

   /* We always signal the syncobj from `queue->last_job_syncs` related to
    * this v3dv_queue_type to track the last job submitted to this queue.
    */
   (*count) = n_vk_syncs + 1;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   if (n_vk_syncs) {
      for (unsigned i = 0; i < n_vk_syncs; i++) {
         syncs[i].handle =
            vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
      }
   }

   syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];

   return syncs;
}

static void
set_ext(struct drm_v3d_extension *ext,
        struct drm_v3d_extension *next,
        uint32_t id,
        uintptr_t flags)
{
   ext->next = (uintptr_t)(void *)next;
   ext->id = id;
   ext->flags = flags;
}

/* This function sets the extension for multiple in/out syncobjs. When it is
 * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
 * Otherwise, the extension id is 0, which means an out-of-memory error.
 */
static void
set_multisync(struct drm_v3d_multi_sync *ms,
              struct v3dv_submit_sync_info *sync_info,
              struct drm_v3d_extension *next,
              struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              enum v3d_queue wait_stage,
              bool signal_syncs)
{
   struct v3dv_queue *queue = &device->queue;
   uint32_t out_sync_count = 0, in_sync_count = 0;
   struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;

   in_syncs = set_in_syncs(queue, job, queue_sync,
                           &in_sync_count, sync_info);
   if (!in_syncs && in_sync_count)
      goto fail;

   out_syncs = set_out_syncs(queue, job, queue_sync,
                             &out_sync_count, sync_info, signal_syncs);

   assert(out_sync_count > 0);

   if (!out_syncs)
      goto fail;

   set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
   ms->wait_stage = wait_stage;
   ms->out_sync_count = out_sync_count;
   ms->out_syncs = (uintptr_t)(void *)out_syncs;
   ms->in_sync_count = in_sync_count;
   ms->in_syncs = (uintptr_t)(void *)in_syncs;

   return;

fail:
   if (in_syncs)
      vk_free(&device->vk.alloc, in_syncs);
   assert(!out_syncs);

   return;
}
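
/* Submits a CL (binning + render) job to the kernel through
 * DRM_IOCTL_V3D_SUBMIT_CL, choosing between a binning or render sync point
 * and between the multisync extension and the legacy single-syncobj
 * interface depending on what the kernel supports.
 */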
static VkResult
handle_cl_job(struct v3dv_queue *queue,
              struct v3dv_job *job,
              uint32_t counter_pass_idx,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_cl submit = { 0 };

   /* Sanity check: we should only flag a bcl sync on a job that needs to be
    * serialized.
    */
   assert(job->serialize || !job->needs_bcl_sync);

   /* We expect to have just one RCL per job which should fit in just one BO.
    * Our BCL, however, could chain multiple BOs together.
    */
   assert(list_length(&job->rcl.bo_list) == 1);
   assert(list_length(&job->bcl.bo_list) >= 1);
   struct v3dv_bo *bcl_first_bo =
      list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
   submit.bcl_start = bcl_first_bo->offset;
   submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
   submit.rcl_start = job->rcl.bo->offset;
   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);

   submit.qma = job->tile_alloc->offset;
   submit.qms = job->tile_alloc->size;
   submit.qts = job->tile_state->offset;

   submit.flags = 0;
   if (job->tmu_dirty_rcl)
      submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit.bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit.bo_handle_count);
   submit.bo_handles = (uintptr_t)(void *)bo_handles;

   submit.perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
   queue->last_perfmon_id = submit.perfmon_id;

   /* We need a binning sync if we are the first CL job waiting on a semaphore
    * with a wait stage that involves the geometry pipeline, or if the job
    * comes after a pipeline barrier that involves geometry stages
    * (needs_bcl_sync), or when performance queries are in use.
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
    * don't start until the previous RCL job has finished, so we don't really
    * need to add a fence for those; however, we might need to wait on CSD or
    * TFU jobs, which are not automatically serialized with CL jobs.
    */
   bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
   if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
      for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
         needs_bcl_sync = sync_info->waits[i].stage_mask &
            (VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
             VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
             VK_PIPELINE_STAGE_ALL_COMMANDS_BIT |
             VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
             VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
             VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
             VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
             VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
             VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT);
      }
   }

   bool needs_rcl_sync = job->serialize && !needs_bcl_sync;

   /* Replace single semaphore settings whenever our kernel driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
      set_multisync(&ms, sync_info, NULL, device, job,
                    V3DV_QUEUE_CL, wait_stage, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      submit.in_sync_rcl = 0;
      submit.in_sync_bcl = 0;
      submit.out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0;
      submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0;
      submit.out_sync = last_job_sync;
   }

   v3dv_clif_dump(device, job, &submit);
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CL, &submit);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");

   return VK_SUCCESS;
}
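
/* Submits a TFU job to the kernel through DRM_IOCTL_V3D_SUBMIT_TFU, using
 * the multisync extension when available and the legacy in/out syncobj pair
 * otherwise.
 */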
static VkResult
handle_tfu_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   const bool needs_sync = sync_info->wait_count || job->serialize;

   /* Replace single semaphore settings whenever our kernel driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      set_multisync(&ms, sync_info, NULL, device, job,
                    V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
      job->tfu.extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      job->tfu.in_sync = 0;
      job->tfu.out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      job->tfu.in_sync = needs_sync ? last_job_sync : 0;
      job->tfu.out_sync = last_job_sync;
   }
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;

   if (ret != 0)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");

   return VK_SUCCESS;
}
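
/* Submits a CSD (compute shader dispatch) job to the kernel through
 * DRM_IOCTL_V3D_SUBMIT_CSD. As with CL jobs, we attach the BOs referenced
 * by the job (including buffer device address BOs) and pick multisync or
 * legacy syncobjs based on kernel capabilities.
 */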
static VkResult
handle_csd_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               uint32_t counter_pass_idx,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_csd *submit = &job->csd.submit;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit->bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit->bo_handle_count);
   submit->bo_handles = (uintptr_t)(void *)bo_handles;

   const bool needs_sync = sync_info->wait_count || job->serialize;

   /* Replace single semaphore settings whenever our kernel driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      set_multisync(&ms, sync_info, NULL, device, job,
                    V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit->extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      submit->in_sync = 0;
      submit->out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      submit->in_sync = needs_sync ? last_job_sync : 0;
      submit->out_sync = last_job_sync;
   }
   submit->perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   queue->last_perfmon_id = submit->perfmon_id;
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CSD, submit);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");

   return VK_SUCCESS;
}
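
/* Dispatches a single job to the handler matching its type: GPU jobs go
 * through the kernel submit paths above while CPU jobs are executed
 * directly on this thread.
 */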
static VkResult
queue_handle_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 uint32_t counter_pass_idx,
                 struct v3dv_submit_sync_info *sync_info,
                 bool signal_syncs)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
      return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_TFU:
      return handle_tfu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_CSD:
      return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
      return handle_reset_query_cpu_job(queue, job, sync_info);
   case V3DV_JOB_TYPE_CPU_END_QUERY:
      return handle_end_query_cpu_job(job, counter_pass_idx);
   case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
      return handle_copy_query_results_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_SET_EVENT:
      return handle_set_event_cpu_job(queue, job, sync_info);
   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
      return handle_wait_events_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
      return handle_copy_buffer_to_image_cpu_job(queue, job, sync_info);
   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
      return handle_csd_indirect_cpu_job(queue, job, sync_info);
   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
      return handle_timestamp_query_cpu_job(queue, job, sync_info);
   default:
      unreachable("Unhandled job type");
   }
}

static VkResult
queue_create_noop_job(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;
   queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!queue->noop_job)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);

   v3dv_X(device, job_emit_noop)(queue->noop_job);

   /* We use no-op jobs to signal semaphores/fences. These jobs need to be
    * serialized across all hw queues to comply with Vulkan's signal operation
    * order requirements, which basically require that signal operations occur
    * in submission order.
    */
   queue->noop_job->serialize = V3DV_BARRIER_ALL;

   return VK_SUCCESS;
}

static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
                      uint32_t counter_pass_idx,
                      struct v3dv_submit_sync_info *sync_info,
                      bool signal_syncs)
{
   if (!queue->noop_job) {
      VkResult result = queue_create_noop_job(queue);
      if (result != VK_SUCCESS)
         return result;
   }

   assert(queue->noop_job);
   return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
                           sync_info, signal_syncs);
}

VkResult
v3dv_queue_driver_submit(struct vk_queue *vk_queue,
                         struct vk_queue_submit *submit)
{
   struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
   VkResult result;

   struct v3dv_submit_sync_info sync_info = {
      .wait_count = submit->wait_count,
      .waits = submit->waits,
      .signal_count = submit->signal_count,
      .signals = submit->signals,
   };

   for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
      queue->last_job_syncs.first[i] = true;

   result = process_waits(queue, sync_info.wait_count, sync_info.waits);
   if (result != VK_SUCCESS)
      return result;

   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct v3dv_cmd_buffer *cmd_buffer =
         container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
      list_for_each_entry_safe(struct v3dv_job, job,
                               &cmd_buffer->jobs, list_link) {

         result = queue_handle_job(queue, job, submit->perf_pass_index,
                                   &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }

      /* If the command buffer ends with a barrier we need to consume it now.
       *
       * FIXME: this will drain all hw queues. Instead, we could use the pending
       * barrier state to limit the queues we serialize against.
       */
      if (cmd_buffer->state.barrier.dst_mask) {
         result = queue_submit_noop_job(queue, submit->perf_pass_index,
                                        &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   /* Finish by submitting a no-op job that synchronizes across all queues.
    * This will ensure that the signal semaphores don't get triggered until
    * all work on any queue completes. See Vulkan's signal operation order
    * requirements.
    */
   if (submit->signal_count > 0) {
      result = queue_submit_noop_job(queue, submit->perf_pass_index,
                                     &sync_info, true);
      if (result != VK_SUCCESS)
         return result;
   }

   process_signals(queue, sync_info.signal_count, sync_info.signals);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueBindSparse(VkQueue _queue,
                     uint32_t bindInfoCount,
                     const VkBindSparseInfo *pBindInfo,
                     VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
   return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
}