/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "drm-uapi/v3d_drm.h"

#include "broadcom/clif/clif_dump.h"
#include "util/libsync.h"
#include "util/os_time.h"
#include "vk_drm_syncobj.h"

#include <errno.h>
#include <time.h>

static void
v3dv_clif_dump(struct v3dv_device *device,
               struct v3dv_job *job,
               struct drm_v3d_submit_cl *submit)
{
   if (!(unlikely(V3D_DEBUG & (V3D_DEBUG_CL |
                               V3D_DEBUG_CL_NO_BIN |
                               V3D_DEBUG_CLIF))))
      return;

   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                           stderr,
                                           V3D_DEBUG & (V3D_DEBUG_CL |
                                                        V3D_DEBUG_CL_NO_BIN),
                                           V3D_DEBUG & V3D_DEBUG_CL_NO_BIN);

   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (void *)entry->key;
      char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                   bo->name, bo->offset);

      bool ok = v3dv_bo_map(device, bo, bo->size);
      if (!ok) {
         fprintf(stderr, "failed to map BO for clif_dump.\n");
         ralloc_free(name);
         goto free_clif;
      }
      clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);

      ralloc_free(name);
   }

   clif_dump(clif, submit);

 free_clif:
   clif_dump_destroy(clif);
}

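/* Waits for all previously submitted GPU work on this queue to complete.
 * With multisync we wait on the per-queue-type last-job syncobjs and, if
 * this is still the first job for every queue in the current submission,
 * also on the submit's wait semaphores. Without multisync a single syncobj
 * already accounts for everything.
 */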
static VkResult
queue_wait_idle(struct v3dv_queue *queue,
                struct v3dv_submit_sync_info *sync_info)
{
   if (queue->device->pdevice->caps.multisync) {
      int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
                               queue->last_job_syncs.syncs, 3,
                               INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                               NULL);
      if (ret) {
         return vk_errorf(queue, VK_ERROR_DEVICE_LOST,
                          "syncobj wait failed: %m");
      }

      bool first = true;
      for (int i = 0; i < 3; i++) {
         if (!queue->last_job_syncs.first[i])
            first = false;
      }

      /* If we're not the first job, that means we're waiting on some
       * per-queue-type syncobj which transitively waited on the semaphores
       * so we can skip the semaphore wait.
       */
      if (first) {
         VkResult result = vk_sync_wait_many(&queue->device->vk,
                                             sync_info->wait_count,
                                             sync_info->waits,
                                             VK_SYNC_WAIT_COMPLETE,
                                             UINT64_MAX);
         if (result != VK_SUCCESS)
            return result;
      }
   } else {
      /* Without multisync, all the semaphores are baked into the one syncobj
       * at the start of each submit so we only need to wait on the one.
       */
      int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
                               &queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], 1,
                               INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                               NULL);
      if (ret) {
         return vk_errorf(queue, VK_ERROR_DEVICE_LOST,
                          "syncobj wait failed: %m");
      }
   }

   for (int i = 0; i < 3; i++)
      queue->last_job_syncs.first[i] = false;

   return VK_SUCCESS;
}

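/* CPU job that resets a range of queries in a pool. Before clearing the
 * results we wait for any GPU or performance-query work that may still be
 * writing them; timestamp queries are handled entirely on the CPU.
 */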
static VkResult
handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
                           struct v3dv_submit_sync_info *sync_info)
{
   struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
   assert(info->pool);

   /* We are about to reset query counters so we need to make sure that
    * the GPU is not using them. The exception is timestamp queries, since
    * we handle those on the CPU.
    */
   if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
      v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      struct vk_sync_wait waits[info->count];
      unsigned wait_count = 0;
      for (int i = 0; i < info->count; i++) {
         struct v3dv_query *query = &info->pool->queries[i];
         /* Only wait for a query if we've used it, otherwise we will be
          * waiting forever for the fence to become signaled.
          */
         if (query->maybe_available) {
            waits[wait_count] = (struct vk_sync_wait){
               .sync = info->pool->queries[i].perf.last_job_sync
            };
            wait_count++;
         }
      }

      VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
                                          VK_SYNC_WAIT_COMPLETE, UINT64_MAX);

      if (result != VK_SUCCESS)
         return result;
   }

   v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);

   return VK_SUCCESS;
}

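/* Exports the syncobjs a performance query needs to wait on into a single
 * sync file: with multisync, the last-job syncobjs of the CL and CSD queues
 * are accumulated into *fd; otherwise the single V3DV_QUEUE_ANY syncobj is
 * exported directly.
 */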
static VkResult
export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
{
   int err;
   if (job->device->pdevice->caps.multisync) {
      static const enum v3dv_queue_type queues_to_sync[] = {
         V3DV_QUEUE_CL,
         V3DV_QUEUE_CSD,
      };

      for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
         enum v3dv_queue_type queue_type = queues_to_sync[i];
         int tmp_fd = -1;

         err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                        queue->last_job_syncs.syncs[queue_type],
                                        &tmp_fd);

         if (err) {
            close(*fd);
            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                             "sync file export failed: %m");
         }

         err = sync_accumulate("v3dv", fd, tmp_fd);

         if (err) {
            close(tmp_fd);
            close(*fd);
            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                             "failed to accumulate sync files: %m");
         }
      }
   } else {
      err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                     queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                     fd);

      if (err) {
         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                          "sync file export failed: %m");
      }
   }
   return VK_SUCCESS;
}

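/* CPU job run when a query ends: marks the affected queries as maybe
 * available and, for performance queries, imports the exported last-job
 * sync file into each query's syncobj so we can later wait for its results.
 */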
static VkResult
handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
{
   VkResult result = VK_SUCCESS;

   mtx_lock(&job->device->query_mutex);

   struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
   struct v3dv_queue *queue = &job->device->queue;

   int err = 0;
   int fd = -1;

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      result = export_perfmon_last_job_sync(queue, job, &fd);

      if (result != VK_SUCCESS)
         goto fail;

      assert(fd >= 0);
   }

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];

      if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
         uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
         err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
                                        syncobj, fd);

         if (err) {
            result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                               "sync file import failed: %m");
            goto fail;
         }
      }

      query->maybe_available = true;
   }

fail:
   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
      close(fd);

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return result;
}

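/* CPU job that copies query results into a destination buffer, mapping the
 * destination BO first if it isn't mapped already.
 */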
static VkResult
handle_copy_query_results_cpu_job(struct v3dv_job *job)
{
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *bo = info->dst->mem->bo;

   /* Map the entire dst buffer for the CPU copy if needed */
   assert(!bo->map || bo->map_size == bo->size);
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint8_t *offset = ((uint8_t *) bo->map) +
                     info->offset + info->dst->mem_offset;
   v3dv_get_query_pool_results(job->device,
                               info->pool,
                               info->first,
                               info->count,
                               offset,
                               info->stride,
                               info->flags);

   return VK_SUCCESS;
}

static VkResult
handle_set_event_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
                         struct v3dv_submit_sync_info *sync_info)
{
   /* From the Vulkan 1.0 spec:
    *
    *    "When vkCmdSetEvent is submitted to a queue, it defines an execution
    *     dependency on commands that were submitted before it, and defines an
    *     event signal operation which sets the event to the signaled state.
    *     The first synchronization scope includes every command previously
    *     submitted to the same queue, including those in the same command
    *     buffer and batch".
    *
    * So we should wait for all prior work to be completed before signaling
    * the event; this includes all active CPU wait threads spawned for any
    * command buffer submitted *before* this one.
    */

   VkResult result = queue_wait_idle(queue, sync_info);
   if (result != VK_SUCCESS)
      return result;

   struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set;
   p_atomic_set(&info->event->state, info->state);

   return VK_SUCCESS;
}

static bool
check_wait_events_complete(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
   for (uint32_t i = 0; i < info->event_count; i++) {
      if (!p_atomic_read(&info->events[i]->state))
         return false;
   }
   return true;
}

static VkResult
handle_wait_events_cpu_job(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   /* Wait for events to be signaled */
   const useconds_t wait_interval_ms = 1;
   while (!check_wait_events_complete(job))
      usleep(wait_interval_ms * 1000);

   return VK_SUCCESS;
}

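/* CPU fallback for a buffer-to-image copy: waits for the GPU to go idle,
 * maps the source and destination BOs and stores each layer into the tiled
 * image layout.
 */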
static VkResult
handle_copy_buffer_to_image_cpu_job(struct v3dv_queue *queue,
                                    struct v3dv_job *job,
                                    struct v3dv_submit_sync_info *sync_info)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE);
   struct v3dv_copy_buffer_to_image_cpu_job_info *info =
      &job->cpu.copy_buffer_to_image;

   /* Wait for all GPU work to finish first, since we may be accessing
    * the BOs involved in the operation.
    */
   VkResult result = queue_wait_idle(queue, sync_info);
   if (result != VK_SUCCESS)
      return result;

   /* Map BOs */
   struct v3dv_bo *dst_bo = info->image->mem->bo;
   assert(!dst_bo->map || dst_bo->map_size == dst_bo->size);
   if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   void *dst_ptr = dst_bo->map;

   struct v3dv_bo *src_bo = info->buffer->mem->bo;
   assert(!src_bo->map || src_bo->map_size == src_bo->size);
   if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   void *src_ptr = src_bo->map;

   const struct v3d_resource_slice *slice =
      &info->image->slices[info->mip_level];

   const struct pipe_box box = {
      info->image_offset.x, info->image_offset.y, info->base_layer,
      info->image_extent.width, info->image_extent.height, info->layer_count,
   };

   /* Copy each layer */
   for (uint32_t i = 0; i < info->layer_count; i++) {
      const uint32_t dst_offset =
         v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i);
      const uint32_t src_offset =
         info->buffer->mem_offset + info->buffer_offset +
         info->buffer_layer_stride * i;
      v3d_store_tiled_image(
         dst_ptr + dst_offset, slice->stride,
         src_ptr + src_offset, info->buffer_stride,
         slice->tiling, info->image->cpp, slice->padded_height, &box);
   }

   return VK_SUCCESS;
}

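/* CPU implementation of timestamp queries: waits for all previously queued
 * work to complete and then records the current CLOCK_MONOTONIC time in the
 * first query of the range, flagging all of them as maybe available.
 */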
static VkResult
handle_timestamp_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
                               struct v3dv_submit_sync_info *sync_info)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

   /* Wait for completion of all work queued before the timestamp query */
   VkResult result = queue_wait_idle(queue, sync_info);
   if (result != VK_SUCCESS)
      return result;

   mtx_lock(&job->device->query_mutex);

   /* Compute timestamp */
   struct timespec t;
   clock_gettime(CLOCK_MONOTONIC, &t);

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;
      if (i == 0)
         query->value = t.tv_sec * 1000000000ull + t.tv_nsec;
   }

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return VK_SUCCESS;
}

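/* CPU job for indirect compute dispatches: reads the workgroup counts from
 * the indirect buffer once the GPU is done with it and rewrites the
 * associated CSD job if the counts changed.
 */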
static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            struct v3dv_submit_sync_info *sync_info)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   /* Make sure the GPU is no longer using the indirect buffer */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE);

   /* Map the indirect buffer and read the dispatch parameters */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   struct v3dv_bo *bo = info->buffer->mem->bo;
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   assert(bo->map);

   const uint32_t offset = info->buffer->mem_offset + info->offset;
   const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
   if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
      return VK_SUCCESS;

   if (memcmp(group_counts, info->csd_job->csd.wg_count,
              sizeof(info->csd_job->csd.wg_count)) != 0) {
      v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
   }

   return VK_SUCCESS;
}

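/* Without multisync support, accumulates all wait semaphores into the single
 * V3DV_QUEUE_ANY last-job syncobj so that the next GPU job submitted waits
 * on them. With multisync this is a no-op, since waits are handled per job.
 */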
static VkResult
process_waits(struct v3dv_queue *queue,
              uint32_t count, struct vk_sync_wait *waits)
{
   struct v3dv_device *device = queue->device;
   VkResult result = VK_SUCCESS;
   int err = 0;

   if (count == 0)
      return VK_SUCCESS;

   /* If multisync is supported, we wait on semaphores in the first job
    * submitted to each of the individual queues.  We don't need to
    * pre-populate the syncobjs.
    */
   if (queue->device->pdevice->caps.multisync)
      return VK_SUCCESS;

   int fd = -1;
   err = drmSyncobjExportSyncFile(device->pdevice->render_fd,
                                  queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                  &fd);
   if (err) {
      result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                         "sync file export failed: %m");
      goto fail;
   }

   for (uint32_t i = 0; i < count; i++) {
      uint32_t syncobj = vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
      int wait_fd = -1;

      err = drmSyncobjExportSyncFile(device->pdevice->render_fd,
                                     syncobj, &wait_fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file export failed: %m");
         goto fail;
      }

      err = sync_accumulate("v3dv", &fd, wait_fd);
      close(wait_fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file merge failed: %m");
         goto fail;
      }
   }

   err = drmSyncobjImportSyncFile(device->pdevice->render_fd,
                                  queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                  fd);
   if (err) {
      result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                         "sync file import failed: %m");
   }

fail:
   close(fd);
   return result;
}

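/* Without multisync support, propagates the V3DV_QUEUE_ANY last-job syncobj
 * state into every signal semaphore. With multisync this is a no-op, since
 * signaling is handled by the last job of the submission.
 */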
static VkResult
process_signals(struct v3dv_queue *queue,
                uint32_t count, struct vk_sync_signal *signals)
{
   struct v3dv_device *device = queue->device;

   if (count == 0)
      return VK_SUCCESS;

   /* If multisync is supported, we are signalling semaphores in the last job
    * of the last command buffer and, therefore, we do not need to process any
    * semaphores here.
    */
   if (device->pdevice->caps.multisync)
      return VK_SUCCESS;

   int fd = -1;
   drmSyncobjExportSyncFile(device->pdevice->render_fd,
                            queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                            &fd);
   if (fd == -1) {
      return vk_errorf(queue, VK_ERROR_UNKNOWN,
                       "sync file export failed: %m");
   }

   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < count; i++) {
      uint32_t syncobj = vk_sync_as_drm_syncobj(signals[i].sync)->syncobj;
      int err = drmSyncobjImportSyncFile(device->pdevice->render_fd,
                                         syncobj, fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file import failed: %m");
         break;
      }
   }

   assert(fd >= 0);
   close(fd);

   return result;
}

static void
multisync_free(struct v3dv_device *device,
               struct drm_v3d_multi_sync *ms)
{
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
}

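/* Builds the input syncobj list for a multisync submission: the submit's
 * wait semaphores (only for the first job submitted to this queue type in
 * the batch) plus the last-job syncobjs of any queues this job needs to
 * serialize against.
 */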
static struct drm_v3d_sem *
set_in_syncs(struct v3dv_queue *queue,
             struct v3dv_job *job,
             enum v3dv_queue_type queue_sync,
             uint32_t *count,
             struct v3dv_submit_sync_info *sync_info)
{
   struct v3dv_device *device = queue->device;
   uint32_t n_syncs = 0;

   /* If this is the first job submitted to a given GPU queue in this cmd buf
    * batch, it has to wait on wait semaphores (if any) before running.
    */
   if (queue->last_job_syncs.first[queue_sync])
      n_syncs = sync_info->wait_count;

   /* If the serialize flag is set the job needs to be serialized in the
    * corresponding queues. Notice that we may implement transfer operations
    * as either CL or TFU jobs.
    *
    * FIXME: maybe we could track more precisely if the source of a transfer
    * barrier is a CL and/or a TFU job.
    */
   bool sync_csd  = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
   bool sync_tfu  = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
   bool sync_cl   = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
                                      V3DV_BARRIER_TRANSFER_BIT);
   *count = n_syncs;
   if (sync_cl)
      (*count)++;
   if (sync_tfu)
      (*count)++;
   if (sync_csd)
      (*count)++;

   if (!*count)
      return NULL;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   for (int i = 0; i < n_syncs; i++) {
      syncs[i].handle =
         vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
   }

   if (sync_cl)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];

   if (sync_csd)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];

   if (sync_tfu)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];

   assert(n_syncs == *count);
   return syncs;
}

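/* Builds the output syncobj list for a multisync submission: the signal
 * semaphores (when signal_syncs is set) plus the last-job syncobj for this
 * queue type, which we always signal to track the last job submitted.
 */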
static struct drm_v3d_sem *
set_out_syncs(struct v3dv_queue *queue,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              uint32_t *count,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;

   /* We always signal the syncobj from `queue->last_job_syncs` related to
    * this v3dv_queue_type to track the last job submitted to this queue.
    */
   (*count) = n_vk_syncs + 1;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   if (n_vk_syncs) {
      for (unsigned i = 0; i < n_vk_syncs; i++) {
         syncs[i].handle =
            vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
      }
   }

   syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];

   return syncs;
}

static void
set_ext(struct drm_v3d_extension *ext,
        struct drm_v3d_extension *next,
        uint32_t id,
        uintptr_t flags)
{
   ext->next = (uintptr_t)(void *)next;
   ext->id = id;
   ext->flags = flags;
}

/* This function sets the extension for multiple in/out syncobjs. When it is
 * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
 * Otherwise, the extension id is 0, which means an out-of-memory error.
 */
static void
set_multisync(struct drm_v3d_multi_sync *ms,
              struct v3dv_submit_sync_info *sync_info,
              struct drm_v3d_extension *next,
              struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              enum v3d_queue wait_stage,
              bool signal_syncs)
{
   struct v3dv_queue *queue = &device->queue;
   uint32_t out_sync_count = 0, in_sync_count = 0;
   struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;

   in_syncs = set_in_syncs(queue, job, queue_sync,
                           &in_sync_count, sync_info);
   if (!in_syncs && in_sync_count)
      goto fail;

   out_syncs = set_out_syncs(queue, job, queue_sync,
                             &out_sync_count, sync_info, signal_syncs);

   assert(out_sync_count > 0);

   if (!out_syncs)
      goto fail;

   set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
   ms->wait_stage = wait_stage;
   ms->out_sync_count = out_sync_count;
   ms->out_syncs = (uintptr_t)(void *)out_syncs;
   ms->in_sync_count = in_sync_count;
   ms->in_syncs = (uintptr_t)(void *)in_syncs;

   return;

fail:
   if (in_syncs)
      vk_free(&device->vk.alloc, in_syncs);
   assert(!out_syncs);

   return;
}

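/* Submits a CL (binning + render) job to the kernel: gathers the job's BO
 * handles, decides whether binning and/or render syncs are required and
 * fills in either the multisync extension or the legacy single-syncobj
 * fields before calling the SUBMIT_CL ioctl.
 */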
static VkResult
handle_cl_job(struct v3dv_queue *queue,
              struct v3dv_job *job,
              uint32_t counter_pass_idx,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_cl submit = { 0 };

   /* Sanity check: we should only flag a bcl sync on a job that needs to be
    * serialized.
    */
   assert(job->serialize || !job->needs_bcl_sync);

   /* We expect to have just one RCL per job, which should fit in just one
    * BO. Our BCL, however, could chain multiple BOs together.
    */
   assert(list_length(&job->rcl.bo_list) == 1);
   assert(list_length(&job->bcl.bo_list) >= 1);
   struct v3dv_bo *bcl_first_bo =
      list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
   submit.bcl_start = bcl_first_bo->offset;
   submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
   submit.rcl_start = job->rcl.bo->offset;
   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);

   submit.qma = job->tile_alloc->offset;
   submit.qms = job->tile_alloc->size;
   submit.qts = job->tile_state->offset;

   submit.flags = 0;
   if (job->tmu_dirty_rcl)
      submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit.bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit.bo_handle_count);
   submit.bo_handles = (uintptr_t)(void *)bo_handles;

   submit.perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
   queue->last_perfmon_id = submit.perfmon_id;

   /* We need a binning sync if we are the first CL job waiting on a semaphore
    * with a wait stage that involves the geometry pipeline, or if the job
    * comes after a pipeline barrier that involves geometry stages
    * (needs_bcl_sync), or when performance queries are in use.
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
    * don't start until the previous RCL job has finished, so we don't really
    * need to add a fence for those; however, we might need to wait on a CSD
    * or TFU job, which are not automatically serialized with CL jobs.
    */
   bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
   if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
      for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
         needs_bcl_sync = sync_info->waits[i].stage_mask &
             (VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
              VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
              VK_PIPELINE_STAGE_ALL_COMMANDS_BIT |
              VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
              VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
              VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
              VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
              VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
              VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT);
      }
   }

   bool needs_rcl_sync = job->serialize && !needs_bcl_sync;

   /* Replace single semaphore settings whenever our kernel-driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
      set_multisync(&ms, sync_info, NULL, device, job,
                    V3DV_QUEUE_CL, wait_stage, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      submit.in_sync_rcl = 0;
      submit.in_sync_bcl = 0;
      submit.out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0;
      submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0;
      submit.out_sync = last_job_sync;
   }

   v3dv_clif_dump(device, job, &submit);
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CL, &submit);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");

   return VK_SUCCESS;
}

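/* Submits a TFU job to the kernel with the appropriate in/out syncs, using
 * the multisync extension when the kernel supports it.
 */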
static VkResult
handle_tfu_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   const bool needs_sync = sync_info->wait_count || job->serialize;

   /* Replace single semaphore settings whenever our kernel-driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      set_multisync(&ms, sync_info, NULL, device, job,
                    V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
      job->tfu.extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      job->tfu.in_sync = 0;
      job->tfu.out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      job->tfu.in_sync = needs_sync ? last_job_sync : 0;
      job->tfu.out_sync = last_job_sync;
   }
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;

   if (ret != 0)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");

   return VK_SUCCESS;
}

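/* Submits a CSD (compute) job to the kernel: gathers the job's BO handles,
 * sets up the in/out syncs and the perfmon id, and calls the SUBMIT_CSD
 * ioctl.
 */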
static VkResult
handle_csd_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               uint32_t counter_pass_idx,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_csd *submit = &job->csd.submit;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit->bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit->bo_handle_count);
   submit->bo_handles = (uintptr_t)(void *)bo_handles;

   const bool needs_sync = sync_info->wait_count || job->serialize;

   /* Replace single semaphore settings whenever our kernel-driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      set_multisync(&ms, sync_info, NULL, device, job,
                    V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit->extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      submit->in_sync = 0;
      submit->out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      submit->in_sync = needs_sync ? last_job_sync : 0;
      submit->out_sync = last_job_sync;
   }
   submit->perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   queue->last_perfmon_id = submit->perfmon_id;
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CSD, submit);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");

   return VK_SUCCESS;
}

static VkResult
queue_handle_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 uint32_t counter_pass_idx,
                 struct v3dv_submit_sync_info *sync_info,
                 bool signal_syncs)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
      return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_TFU:
      return handle_tfu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_CSD:
      return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
      return handle_reset_query_cpu_job(queue, job, sync_info);
   case V3DV_JOB_TYPE_CPU_END_QUERY:
      return handle_end_query_cpu_job(job, counter_pass_idx);
   case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
      return handle_copy_query_results_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_SET_EVENT:
      return handle_set_event_cpu_job(queue, job, sync_info);
   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
      return handle_wait_events_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
      return handle_copy_buffer_to_image_cpu_job(queue, job, sync_info);
   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
      return handle_csd_indirect_cpu_job(queue, job, sync_info);
   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
      return handle_timestamp_query_cpu_job(queue, job, sync_info);
   default:
      unreachable("Unhandled job type");
   }
}

static VkResult
queue_create_noop_job(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;
   queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!queue->noop_job)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);

   v3dv_X(device, job_emit_noop)(queue->noop_job);

   /* We use no-op jobs to signal semaphores/fences. These jobs need to be
    * serialized across all hw queues to comply with Vulkan's signal operation
    * order requirements, which basically require that signal operations occur
    * in submission order.
    */
   queue->noop_job->serialize = V3DV_BARRIER_ALL;

   return VK_SUCCESS;
}

static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
                      uint32_t counter_pass_idx,
                      struct v3dv_submit_sync_info *sync_info,
                      bool signal_syncs)
{
   if (!queue->noop_job) {
      VkResult result = queue_create_noop_job(queue);
      if (result != VK_SUCCESS)
         return result;
   }

   assert(queue->noop_job);
   return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
                           sync_info, signal_syncs);
}

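/* Main queue submission entry point: processes wait semaphores, handles
 * every job in every command buffer, submits no-op jobs to consume trailing
 * barriers and to signal semaphores in submission order, and finally
 * processes the signal operations.
 */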
VkResult
v3dv_queue_driver_submit(struct vk_queue *vk_queue,
                         struct vk_queue_submit *submit)
{
   struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
   VkResult result;

   struct v3dv_submit_sync_info sync_info = {
      .wait_count = submit->wait_count,
      .waits = submit->waits,
      .signal_count = submit->signal_count,
      .signals = submit->signals,
   };

   for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
      queue->last_job_syncs.first[i] = true;

   result = process_waits(queue, sync_info.wait_count, sync_info.waits);
   if (result != VK_SUCCESS)
      return result;

   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct v3dv_cmd_buffer *cmd_buffer =
         container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
      list_for_each_entry_safe(struct v3dv_job, job,
                               &cmd_buffer->jobs, list_link) {

         result = queue_handle_job(queue, job, submit->perf_pass_index,
                                   &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }

      /* If the command buffer ends with a barrier we need to consume it now.
       *
       * FIXME: this will drain all hw queues. Instead, we could use the pending
       * barrier state to limit the queues we serialize against.
       */
      if (cmd_buffer->state.barrier.dst_mask) {
         result = queue_submit_noop_job(queue, submit->perf_pass_index,
                                        &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   /* Finish by submitting a no-op job that synchronizes across all queues.
    * This will ensure that the signal semaphores don't get triggered until
    * all work on any queue completes. See Vulkan's signal operation order
    * requirements.
    */
   if (submit->signal_count > 0) {
      result = queue_submit_noop_job(queue, submit->perf_pass_index,
                                     &sync_info, true);
      if (result != VK_SUCCESS)
         return result;
   }

   process_signals(queue, sync_info.signal_count, sync_info.signals);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueBindSparse(VkQueue _queue,
                     uint32_t bindInfoCount,
                     const VkBindSparseInfo *pBindInfo,
                     VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
   return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
}