/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <stdio.h>
#include <stdarg.h>

#include "common/intel_gem.h"
#include "perf/intel_perf.h"

#include "util/hash_table.h"
#include "util/u_process.h"

#include "intel_driver_ds.h"
#include "intel_pps_priv.h"
#include "intel_tracepoints.h"

#ifdef HAVE_PERFETTO

#include "util/u_perfetto.h"

#include "intel_tracepoints_perfetto.h"

/* Descriptions of the queue stages. */
static const struct {
   const char *name;

   /* Tells us whether a given stage is pipelined. This is used to build
    * stacks of pipelined elements so that the perfetto UI doesn't get
    * confused by elements ending out of order.
    */
   bool pipelined;

   /* The perfetto UI requires a parent-child relationship within a row of
    * elements, meaning that all child elements must end within the lifespan
    * of their parent.
    *
    * Some elements like stalls and command buffers follow that relationship,
    * but not all. This tells us in which UI row the elements should live.
    */
   enum intel_ds_queue_stage draw_stage;
} intel_queue_stage_desc[INTEL_DS_QUEUE_STAGE_N_STAGES] = {
   /* Order must match the enum! */
   {
      "cmd-buffer",
      false,
      INTEL_DS_QUEUE_STAGE_CMD_BUFFER,
   },
   {
      "stall",
      false,
      INTEL_DS_QUEUE_STAGE_STALL,
   },
   {
      "compute",
      true,
      INTEL_DS_QUEUE_STAGE_COMPUTE,
   },
   {
      "render-pass",
      true,
      INTEL_DS_QUEUE_STAGE_RENDER_PASS,
   },
   {
      "blorp",
      true,
      INTEL_DS_QUEUE_STAGE_BLORP,
   },
   {
      "draw",
      true,
      INTEL_DS_QUEUE_STAGE_DRAW,
   },
};

struct IntelRenderpassIncrementalState {
   bool was_cleared = true;
};

struct IntelRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = IntelRenderpassIncrementalState;
};

class IntelRenderpassDataSource : public perfetto::DataSource<IntelRenderpassDataSource,
                                                              IntelRenderpassTraits> {
public:
   void OnSetup(const SetupArgs &) override
   {
      // Use this callback to apply any custom configuration to your data source
      // based on the TraceConfig in SetupArgs.
   }

   void OnStart(const StartArgs &) override
   {
      // This notification can be used to initialize the GPU driver, enable
      // counters, etc. StartArgs will contain the DataSourceDescriptor,
      // which can be extended.
      u_trace_perfetto_start();
      PERFETTO_LOG("Tracing started");
   }

   void OnStop(const StopArgs &) override
   {
      PERFETTO_LOG("Tracing stopped");

      // Undo any initialization done in OnStart.
      u_trace_perfetto_stop();
      // TODO we should perhaps block until queued traces are flushed?

      Trace([](IntelRenderpassDataSource::TraceContext ctx) {
         auto packet = ctx.NewTracePacket();
         packet->Finalize();
         ctx.Flush();
      });
   }
};

PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(IntelRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(IntelRenderpassDataSource);

using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;

enum InternedGpuRenderStageSpecification_RenderStageCategory
i915_engine_class_to_category(enum drm_i915_gem_engine_class engine_class)
{
   switch (engine_class) {
   case I915_ENGINE_CLASS_RENDER:
      return InternedGpuRenderStageSpecification_RenderStageCategory::
         InternedGpuRenderStageSpecification_RenderStageCategory_GRAPHICS;
   default:
      return InternedGpuRenderStageSpecification_RenderStageCategory::
         InternedGpuRenderStageSpecification_RenderStageCategory_OTHER;
   }
}

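/* Emit a ClockSnapshot packet pairing the CPU boot clock with the GPU clock
 * so perfetto can place GPU timestamps on the trace timeline. Snapshots are
 * rate limited to at most one per second via next_clock_sync_ns.
 */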
static void
sync_timestamp(IntelRenderpassDataSource::TraceContext &ctx,
               struct intel_ds_device *device)
{
   uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
   uint64_t gpu_ts = intel_device_info_timebase_scale(&device->info,
                                                      intel_read_gpu_timestamp(device->fd));

   if (cpu_ts < device->next_clock_sync_ns)
      return;

   PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);

   device->sync_gpu_ts = gpu_ts;
   device->next_clock_sync_ns = cpu_ts + 1000000000ull;

   auto packet = ctx.NewTracePacket();

   packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
   packet->set_timestamp(cpu_ts);

   auto event = packet->set_clock_snapshot();
   {
      auto clock = event->add_clocks();

      clock->set_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      clock->set_timestamp(cpu_ts);
   }
   {
      auto clock = event->add_clocks();

      clock->set_clock_id(device->gpu_clock_id);
      clock->set_timestamp(gpu_ts);
   }
}

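/* Re-emit the interned descriptors (graphics context plus the hw_queue/stage
 * specification pair of each queue stage) after the incremental state has
 * been cleared, then force a fresh clock snapshot.
 */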
static void
send_descriptors(IntelRenderpassDataSource::TraceContext &ctx,
                 struct intel_ds_device *device)
{
   struct intel_ds_queue *queue;

   PERFETTO_LOG("Sending renderstage descriptors");

   device->event_id = 0;
   u_vector_foreach(queue, &device->queues) {
      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
         queue->stages[s].start_ns = 0;
      }
   }

   {
      auto packet = ctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

      auto interned_data = packet->set_interned_data();

      {
         auto desc = interned_data->add_graphics_contexts();
         desc->set_iid(device->iid);
         desc->set_pid(getpid());
         switch (device->api) {
         case INTEL_DS_API_OPENGL:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api_OPEN_GL);
            break;
         case INTEL_DS_API_VULKAN:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api_VULKAN);
            break;
         default:
            break;
         }
      }

      /* Emit all the IIDs picked at device/queue creation. */
      u_vector_foreach(queue, &device->queues) {
         for (unsigned s = 0; s < INTEL_DS_QUEUE_STAGE_N_STAGES; s++) {
            {
               /* We put the stage number in there so that all rows are
                * ordered by intel_ds_queue_stage.
                */
               char name[100];
               snprintf(name, sizeof(name), "%.10s-%s-%u-%s",
                        util_get_process_name(),
                        queue->name, s, intel_queue_stage_desc[s].name);

               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].queue_iid);
               desc->set_name(name);
            }
            {
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].stage_iid);
               desc->set_name(intel_queue_stage_desc[s].name);
            }
         }
      }
   }

   device->next_clock_sync_ns = 0;
   sync_timestamp(ctx, device);
}

typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);

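/* begin_event()/end_event() pair up the GPU timestamps collected by u_trace:
 * begin_event() only records the start timestamp of a stage, end_event()
 * turns the pair into a GpuRenderStageEvent packet on the GPU clock.
 */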
static void
begin_event(struct intel_ds_queue *queue, uint64_t ts_ns,
            enum intel_ds_queue_stage stage_id)
{
   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!queue->device->sync_gpu_ts) {
      queue->stages[stage_id].start_ns = 0;
      return;
   }

   queue->stages[stage_id].start_ns = ts_ns;
}

static void
end_event(struct intel_ds_queue *queue, uint64_t ts_ns,
          enum intel_ds_queue_stage stage_id,
          uint32_t submission_id, const void* payload = nullptr,
          trace_payload_as_extra_func payload_as_extra = nullptr)
{
   struct intel_ds_device *device = queue->device;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!device->sync_gpu_ts)
      return;

   struct intel_ds_stage *stage = &queue->stages[stage_id];
   uint64_t start_ns = stage->start_ns;

   if (!start_ns)
      return;

   uint64_t evt_id = device->event_id++;

   IntelRenderpassDataSource::Trace([=](IntelRenderpassDataSource::TraceContext tctx) {
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ns);
      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);

      assert(ts_ns >= start_ns);

      auto event = packet->set_gpu_render_stage_event();
      event->set_gpu_id(queue->device->gpu_id);

      event->set_hw_queue_iid(stage->queue_iid);
      event->set_stage_iid(stage->stage_iid);
      event->set_context(queue->device->iid);
      event->set_event_id(evt_id);
      event->set_duration(ts_ns - start_ns);
      event->set_submission_id(submission_id);

      if (payload && payload_as_extra) {
         payload_as_extra(event, payload);
      }
   });

   stage->start_ns = 0;
}

static void
custom_trace_payload_as_extra_end_stall(perfetto::protos::pbzero::GpuRenderStageEvent *event,
                                        const struct trace_intel_end_stall *payload)
{
   char buf[256];

   {
      auto data = event->add_extra_data();
      data->set_name("stall_reason");

      snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s : %s",
              (payload->flags & INTEL_DS_DEPTH_CACHE_FLUSH_BIT) ? "+depth_flush" : "",
              (payload->flags & INTEL_DS_DATA_CACHE_FLUSH_BIT) ? "+dc_flush" : "",
              (payload->flags & INTEL_DS_HDC_PIPELINE_FLUSH_BIT) ? "+hdc_flush" : "",
              (payload->flags & INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT) ? "+rt_flush" : "",
              (payload->flags & INTEL_DS_TILE_CACHE_FLUSH_BIT) ? "+tile_flush" : "",
              (payload->flags & INTEL_DS_STATE_CACHE_INVALIDATE_BIT) ? "+state_inv" : "",
              (payload->flags & INTEL_DS_CONST_CACHE_INVALIDATE_BIT) ? "+const_inv" : "",
              (payload->flags & INTEL_DS_VF_CACHE_INVALIDATE_BIT) ? "+vf_inv" : "",
              (payload->flags & INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT) ? "+tex_inv" : "",
              (payload->flags & INTEL_DS_INST_CACHE_INVALIDATE_BIT) ? "+inst_inv" : "",
              (payload->flags & INTEL_DS_STALL_AT_SCOREBOARD_BIT) ? "+pb_stall" : "",
              (payload->flags & INTEL_DS_DEPTH_STALL_BIT) ? "+depth_stall" : "",
              (payload->flags & INTEL_DS_CS_STALL_BIT) ? "+cs_stall" : "",
              payload->reason ? payload->reason : "unknown");

      assert(strlen(buf) > 0);

      data->set_value(buf);
   }
}

#endif /* HAVE_PERFETTO */

#ifdef __cplusplus
extern "C" {
#endif

#ifdef HAVE_PERFETTO

/*
 * Trace callbacks, called from u_trace once the timestamps from the GPU have
 * been collected.
 */

#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage)                   \
   void                                                                 \
   intel_ds_begin_##event_name(struct intel_ds_device *device,          \
                               uint64_t ts_ns,                          \
                               const void *flush_data,                  \
                               const struct trace_intel_begin_##event_name *payload) \
   {                                                                    \
      const struct intel_ds_flush_data *flush =                         \
         (const struct intel_ds_flush_data *) flush_data;               \
      begin_event(flush->queue, ts_ns, stage);                          \
   }                                                                    \
                                                                        \
   void                                                                 \
   intel_ds_end_##event_name(struct intel_ds_device *device,            \
                             uint64_t ts_ns,                            \
                             const void *flush_data,                    \
                             const struct trace_intel_end_##event_name *payload) \
   {                                                                    \
      const struct intel_ds_flush_data *flush =                         \
         (const struct intel_ds_flush_data *) flush_data;               \
      end_event(flush->queue, ts_ns, stage, flush->submission_id,       \
                payload,                                                \
                (trace_payload_as_extra_func)                           \
                &trace_payload_as_extra_intel_end_##event_name);        \
   }

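/* Each invocation below generates an intel_ds_begin_*()/intel_ds_end_*()
 * callback pair. The end callback forwards its payload to the matching
 * trace_payload_as_extra_intel_end_*() helper.
 */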
CREATE_DUAL_EVENT_CALLBACK(batch, INTEL_DS_QUEUE_STAGE_CMD_BUFFER)
CREATE_DUAL_EVENT_CALLBACK(cmd_buffer, INTEL_DS_QUEUE_STAGE_CMD_BUFFER)
CREATE_DUAL_EVENT_CALLBACK(render_pass, INTEL_DS_QUEUE_STAGE_RENDER_PASS)
CREATE_DUAL_EVENT_CALLBACK(dyn_render_pass, INTEL_DS_QUEUE_STAGE_RENDER_PASS)
CREATE_DUAL_EVENT_CALLBACK(blorp, INTEL_DS_QUEUE_STAGE_BLORP)
CREATE_DUAL_EVENT_CALLBACK(draw, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(draw_indexed, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(draw_indexed_multi, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(draw_indexed_indirect, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(draw_multi, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(draw_indirect, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(draw_indirect_count, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(draw_indirect_byte_count, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(draw_indexed_indirect_count, INTEL_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(compute, INTEL_DS_QUEUE_STAGE_COMPUTE)

void
intel_ds_begin_stall(struct intel_ds_device *device,
                     uint64_t ts_ns,
                     const void *flush_data,
                     const struct trace_intel_begin_stall *payload)
{
   const struct intel_ds_flush_data *flush =
      (const struct intel_ds_flush_data *) flush_data;
   begin_event(flush->queue, ts_ns, INTEL_DS_QUEUE_STAGE_STALL);
}

void
intel_ds_end_stall(struct intel_ds_device *device,
                   uint64_t ts_ns,
                   const void *flush_data,
                   const struct trace_intel_end_stall *payload)
{
   const struct intel_ds_flush_data *flush =
      (const struct intel_ds_flush_data *) flush_data;
   end_event(flush->queue, ts_ns, INTEL_DS_QUEUE_STAGE_STALL, flush->submission_id,
             payload,
             (trace_payload_as_extra_func)custom_trace_payload_as_extra_end_stall);
}

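/* Submission tracking: intel_ds_begin_submit() samples the CPU boot clock at
 * the start of a queue submission, intel_ds_end_submit() then emits a
 * VulkanApiEvent/VkQueueSubmit packet carrying the CPU-side duration and the
 * submission ID that ties it to the render stage events above.
 */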
uint64_t
intel_ds_begin_submit(struct intel_ds_queue *queue)
{
   return perfetto::base::GetBootTimeNs().count();
}

void
intel_ds_end_submit(struct intel_ds_queue *queue,
                    uint64_t start_ts)
{
   if (!u_trace_context_actively_tracing(&queue->device->trace_context)) {
      queue->device->sync_gpu_ts = 0;
      queue->device->next_clock_sync_ns = 0;
      return;
   }

   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
   uint32_t submission_id = queue->submission_id++;

   IntelRenderpassDataSource::Trace([=](IntelRenderpassDataSource::TraceContext tctx) {
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ts);

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      // submit->set_pid(os_get_pid());
      // submit->set_tid(os_get_tid());
      submit->set_duration_ns(end_ts - start_ts);
      submit->set_vk_queue((uintptr_t) queue);
      submit->set_submission_id(submission_id);
   });
}

#endif /* HAVE_PERFETTO */

static void
intel_driver_ds_init_once(void)
{
#ifdef HAVE_PERFETTO
   util_perfetto_init();
   perfetto::DataSourceDescriptor dsd;
   dsd.set_name("gpu.renderstages.intel");
   IntelRenderpassDataSource::Register(dsd);
#endif
}

static once_flag intel_driver_ds_once_flag = ONCE_FLAG_INIT;

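/* Interning IDs handed out for the graphics context and the per-stage
 * hw_queue/stage specifications. A monotonically increasing counter keeps
 * them unique.
 */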
static uint64_t get_iid()
{
   static uint64_t iid = 1;
   return iid++;
}

void
intel_driver_ds_init(void)
{
   call_once(&intel_driver_ds_once_flag,
             intel_driver_ds_init_once);
}

void
intel_ds_device_init(struct intel_ds_device *device,
                     struct intel_device_info *devinfo,
                     int drm_fd,
                     uint32_t gpu_id,
                     enum intel_ds_api api)
{
   memset(device, 0, sizeof(*device));

   assert(gpu_id < 128);
   device->gpu_id = gpu_id;
   device->gpu_clock_id = intel_pps_clock_id(gpu_id);
   device->fd = drm_fd;
   device->info = *devinfo;
   device->iid = get_iid();
   device->api = api;
   u_vector_init(&device->queues, 4, sizeof(struct intel_ds_queue));
}

void
intel_ds_device_fini(struct intel_ds_device *device)
{
   u_trace_context_fini(&device->trace_context);
   u_vector_finish(&device->queues);
}

struct intel_ds_queue *
intel_ds_device_add_queue(struct intel_ds_device *device,
                          const char *fmt_name,
                          ...)
{
   struct intel_ds_queue *queue =
      (struct intel_ds_queue *) u_vector_add(&device->queues);
   va_list ap;

   memset(queue, 0, sizeof(*queue));

   queue->device = device;
   queue->queue_id = u_vector_length(&device->queues) - 1;

   va_start(ap, fmt_name);
   vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
   va_end(ap);

   for (unsigned s = 0; s < INTEL_DS_QUEUE_STAGE_N_STAGES; s++) {
      queue->stages[s].queue_iid = get_iid();
      queue->stages[s].stage_iid = get_iid();
   }

   return queue;
}

void intel_ds_flush_data_init(struct intel_ds_flush_data *data,
                              struct intel_ds_queue *queue,
                              uint64_t submission_id)
{
   memset(data, 0, sizeof(*data));

   data->queue = queue;
   data->submission_id = submission_id;

   u_trace_init(&data->trace, &queue->device->trace_context);
}

void intel_ds_flush_data_fini(struct intel_ds_flush_data *data)
{
   u_trace_fini(&data->trace);
}
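
/* Rough driver-side usage sketch (variable names are illustrative, not part
 * of this API):
 *
 *    struct intel_ds_device ds_dev;
 *    intel_ds_device_init(&ds_dev, devinfo, drm_fd, gpu_id, INTEL_DS_API_VULKAN);
 *    struct intel_ds_queue *ds_queue =
 *       intel_ds_device_add_queue(&ds_dev, "queue%u", queue_index);
 *
 *    // Per submission: record u_trace timestamps into flush.trace and wrap
 *    // the actual kernel submission with begin/end_submit.
 *    struct intel_ds_flush_data flush;
 *    intel_ds_flush_data_init(&flush, ds_queue, ds_queue->submission_id);
 *    uint64_t submit_start = intel_ds_begin_submit(ds_queue);
 *    ... submit to the kernel ...
 *    intel_ds_end_submit(ds_queue, submit_start);
 *    ... once the GPU timestamps are read back, u_trace invokes the
 *        intel_ds_begin_*()/intel_ds_end_*() callbacks above ...
 *    intel_ds_flush_data_fini(&flush);
 */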

#ifdef __cplusplus
}
#endif
