1/* 2 * Copyright © 2021 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#include <stdio.h> 25#include <stdarg.h> 26 27#include "common/intel_gem.h" 28#include "perf/intel_perf.h" 29 30#include "util/hash_table.h" 31#include "util/u_process.h" 32 33#include "intel_driver_ds.h" 34#include "intel_pps_priv.h" 35#include "intel_tracepoints.h" 36 37#ifdef HAVE_PERFETTO 38 39#include "util/u_perfetto.h" 40 41#include "intel_tracepoints_perfetto.h" 42 43/* Just naming stages */ 44static const struct { 45 const char *name; 46 47 /* Tells us if a given stage is pipelined. This is used to build stacks of 48 * pipelined elements so that the perfetto UI doesn't get confused by elements 49 * ending out of order. 50 */ 51 bool pipelined; 52 53 /* The perfetto UI requires that there is a parent-child relationship 54 * within a row of elements. Which means that all children elements must 55 * end within the lifespan of their parent. 56 * 57 * Some elements like stalls and command buffers follow that relationship, 58 * but not all. This tells us in which UI row the elements should live. 59 */ 60 enum intel_ds_queue_stage draw_stage; 61} intel_queue_stage_desc[INTEL_DS_QUEUE_STAGE_N_STAGES] = { 62 /* Order must match the enum! */ 63 { 64 "cmd-buffer", 65 false, 66 INTEL_DS_QUEUE_STAGE_CMD_BUFFER, 67 }, 68 { 69 "stall", 70 false, 71 INTEL_DS_QUEUE_STAGE_STALL, 72 }, 73 { 74 "compute", 75 true, 76 INTEL_DS_QUEUE_STAGE_COMPUTE, 77 }, 78 { 79 "render-pass", 80 true, 81 INTEL_DS_QUEUE_STAGE_RENDER_PASS, 82 }, 83 { 84 "blorp", 85 true, 86 INTEL_DS_QUEUE_STAGE_BLORP, 87 }, 88 { 89 "draw", 90 true, 91 INTEL_DS_QUEUE_STAGE_DRAW, 92 }, 93}; 94 95struct IntelRenderpassIncrementalState { 96 bool was_cleared = true; 97}; 98 99struct IntelRenderpassTraits : public perfetto::DefaultDataSourceTraits { 100 using IncrementalStateType = IntelRenderpassIncrementalState; 101}; 102 103class IntelRenderpassDataSource : public perfetto::DataSource<IntelRenderpassDataSource, 104 IntelRenderpassTraits> { 105public: 106 void OnSetup(const SetupArgs &) override 107 { 108 // Use this callback to apply any custom configuration to your data source 109 // based on the TraceConfig in SetupArgs. 110 } 111 112 void OnStart(const StartArgs &) override 113 { 114 // This notification can be used to initialize the GPU driver, enable 115 // counters, etc. StartArgs will contains the DataSourceDescriptor, 116 // which can be extended. 117 u_trace_perfetto_start(); 118 PERFETTO_LOG("Tracing started"); 119 } 120 121 void OnStop(const StopArgs &) override 122 { 123 PERFETTO_LOG("Tracing stopped"); 124 125 // Undo any initialization done in OnStart. 126 u_trace_perfetto_stop(); 127 // TODO we should perhaps block until queued traces are flushed? 128 129 Trace([](IntelRenderpassDataSource::TraceContext ctx) { 130 auto packet = ctx.NewTracePacket(); 131 packet->Finalize(); 132 ctx.Flush(); 133 }); 134 } 135}; 136 137PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(IntelRenderpassDataSource); 138PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(IntelRenderpassDataSource); 139 140using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory; 141 142enum InternedGpuRenderStageSpecification_RenderStageCategory 143i915_engine_class_to_category(enum drm_i915_gem_engine_class engine_class) 144{ 145 switch (engine_class) { 146 case I915_ENGINE_CLASS_RENDER: 147 return InternedGpuRenderStageSpecification_RenderStageCategory:: 148 InternedGpuRenderStageSpecification_RenderStageCategory_GRAPHICS; 149 default: 150 return InternedGpuRenderStageSpecification_RenderStageCategory::InternedGpuRenderStageSpecification_RenderStageCategory_OTHER; 151 } 152} 153 154static void 155sync_timestamp(IntelRenderpassDataSource::TraceContext &ctx, 156 struct intel_ds_device *device) 157{ 158 uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count(); 159 uint64_t gpu_ts = intel_device_info_timebase_scale(&device->info, 160 intel_read_gpu_timestamp(device->fd)); 161 162 if (cpu_ts < device->next_clock_sync_ns) 163 return; 164 165 PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id); 166 167 device->sync_gpu_ts = gpu_ts; 168 device->next_clock_sync_ns = cpu_ts + 1000000000ull; 169 170 auto packet = ctx.NewTracePacket(); 171 172 packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME); 173 packet->set_timestamp(cpu_ts); 174 175 auto event = packet->set_clock_snapshot(); 176 { 177 auto clock = event->add_clocks(); 178 179 clock->set_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME); 180 clock->set_timestamp(cpu_ts); 181 } 182 { 183 auto clock = event->add_clocks(); 184 185 clock->set_clock_id(device->gpu_clock_id); 186 clock->set_timestamp(gpu_ts); 187 } 188} 189 190static void 191send_descriptors(IntelRenderpassDataSource::TraceContext &ctx, 192 struct intel_ds_device *device) 193{ 194 struct intel_ds_queue *queue; 195 196 PERFETTO_LOG("Sending renderstage descriptors"); 197 198 device->event_id = 0; 199 u_vector_foreach(queue, &device->queues) { 200 for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) { 201 queue->stages[s].start_ns = 0; 202 } 203 } 204 205 { 206 auto packet = ctx.NewTracePacket(); 207 208 packet->set_timestamp(perfetto::base::GetBootTimeNs().count()); 209 packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME); 210 packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED); 211 212 auto interned_data = packet->set_interned_data(); 213 214 { 215 auto desc = interned_data->add_graphics_contexts(); 216 desc->set_iid(device->iid); 217 desc->set_pid(getpid()); 218 switch (device->api) { 219 case INTEL_DS_API_OPENGL: 220 desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api_OPEN_GL); 221 break; 222 case INTEL_DS_API_VULKAN: 223 desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api_VULKAN); 224 break; 225 default: 226 break; 227 } 228 } 229 230 /* Emit all the IID picked at device/queue creation. */ 231 u_vector_foreach(queue, &device->queues) { 232 for (unsigned s = 0; s < INTEL_DS_QUEUE_STAGE_N_STAGES; s++) { 233 { 234 /* We put the stage number in there so that all rows are order 235 * by intel_ds_queue_stage. 236 */ 237 char name[100]; 238 snprintf(name, sizeof(name), "%.10s-%s-%u-%s", 239 util_get_process_name(), 240 queue->name, s, intel_queue_stage_desc[s].name); 241 242 auto desc = interned_data->add_gpu_specifications(); 243 desc->set_iid(queue->stages[s].queue_iid); 244 desc->set_name(name); 245 } 246 { 247 auto desc = interned_data->add_gpu_specifications(); 248 desc->set_iid(queue->stages[s].stage_iid); 249 desc->set_name(intel_queue_stage_desc[s].name); 250 } 251 } 252 } 253 } 254 255 device->next_clock_sync_ns = 0; 256 sync_timestamp(ctx, device); 257} 258 259typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*); 260 261static void 262begin_event(struct intel_ds_queue *queue, uint64_t ts_ns, 263 enum intel_ds_queue_stage stage_id) 264{ 265 /* If we haven't managed to calibrate the alignment between GPU and CPU 266 * timestamps yet, then skip this trace, otherwise perfetto won't know 267 * what to do with it. 268 */ 269 if (!queue->device->sync_gpu_ts) { 270 queue->stages[stage_id].start_ns = 0; 271 return; 272 } 273 274 queue->stages[stage_id].start_ns = ts_ns; 275} 276 277static void 278end_event(struct intel_ds_queue *queue, uint64_t ts_ns, 279 enum intel_ds_queue_stage stage_id, 280 uint32_t submission_id, const void* payload = nullptr, 281 trace_payload_as_extra_func payload_as_extra = nullptr) 282{ 283 struct intel_ds_device *device = queue->device; 284 285 /* If we haven't managed to calibrate the alignment between GPU and CPU 286 * timestamps yet, then skip this trace, otherwise perfetto won't know 287 * what to do with it. 288 */ 289 if (!device->sync_gpu_ts) 290 return; 291 292 struct intel_ds_stage *stage = &queue->stages[stage_id]; 293 uint64_t start_ns = stage->start_ns; 294 295 if (!start_ns) 296 return; 297 298 uint64_t evt_id = device->event_id++; 299 300 IntelRenderpassDataSource::Trace([=](IntelRenderpassDataSource::TraceContext tctx) { 301 if (auto state = tctx.GetIncrementalState(); state->was_cleared) { 302 send_descriptors(tctx, queue->device); 303 state->was_cleared = false; 304 } 305 306 sync_timestamp(tctx, queue->device); 307 308 auto packet = tctx.NewTracePacket(); 309 310 packet->set_timestamp(start_ns); 311 packet->set_timestamp_clock_id(queue->device->gpu_clock_id); 312 313 assert(ts_ns >= start_ns); 314 315 auto event = packet->set_gpu_render_stage_event(); 316 event->set_gpu_id(queue->device->gpu_id); 317 318 event->set_hw_queue_iid(stage->queue_iid); 319 event->set_stage_iid(stage->stage_iid); 320 event->set_context(queue->device->iid); 321 event->set_event_id(evt_id); 322 event->set_duration(ts_ns - start_ns); 323 event->set_submission_id(submission_id); 324 325 if (payload && payload_as_extra) { 326 payload_as_extra(event, payload); 327 } 328 }); 329 330 stage->start_ns = 0; 331} 332 333static void 334custom_trace_payload_as_extra_end_stall(perfetto::protos::pbzero::GpuRenderStageEvent *event, 335 const struct trace_intel_end_stall *payload) 336{ 337 char buf[256]; 338 339 { 340 auto data = event->add_extra_data(); 341 data->set_name("stall_reason"); 342 343 snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s : %s", 344 (payload->flags & INTEL_DS_DEPTH_CACHE_FLUSH_BIT) ? "+depth_flush" : "", 345 (payload->flags & INTEL_DS_DATA_CACHE_FLUSH_BIT) ? "+dc_flush" : "", 346 (payload->flags & INTEL_DS_HDC_PIPELINE_FLUSH_BIT) ? "+hdc_flush" : "", 347 (payload->flags & INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT) ? "+rt_flush" : "", 348 (payload->flags & INTEL_DS_TILE_CACHE_FLUSH_BIT) ? "+tile_flush" : "", 349 (payload->flags & INTEL_DS_STATE_CACHE_INVALIDATE_BIT) ? "+state_inv" : "", 350 (payload->flags & INTEL_DS_CONST_CACHE_INVALIDATE_BIT) ? "+const_inv" : "", 351 (payload->flags & INTEL_DS_VF_CACHE_INVALIDATE_BIT) ? "+vf_inv" : "", 352 (payload->flags & INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT) ? "+tex_inv" : "", 353 (payload->flags & INTEL_DS_INST_CACHE_INVALIDATE_BIT) ? "+inst_inv" : "", 354 (payload->flags & INTEL_DS_STALL_AT_SCOREBOARD_BIT) ? "+pb_stall" : "", 355 (payload->flags & INTEL_DS_DEPTH_STALL_BIT) ? "+depth_stall" : "", 356 (payload->flags & INTEL_DS_HDC_PIPELINE_FLUSH_BIT) ? "+hdc_flush" : "", 357 (payload->flags & INTEL_DS_CS_STALL_BIT) ? "+cs_stall" : "", 358 payload->reason ? payload->reason : "unknown"); 359 360 assert(strlen(buf) > 0); 361 362 data->set_value(buf); 363 } 364} 365 366#endif /* HAVE_PERFETTO */ 367 368#ifdef __cplusplus 369extern "C" { 370#endif 371 372#ifdef HAVE_PERFETTO 373 374/* 375 * Trace callbacks, called from u_trace once the timestamps from GPU have been 376 * collected. 377 */ 378 379#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage) \ 380 void \ 381 intel_ds_begin_##event_name(struct intel_ds_device *device, \ 382 uint64_t ts_ns, \ 383 const void *flush_data, \ 384 const struct trace_intel_begin_##event_name *payload) \ 385 { \ 386 const struct intel_ds_flush_data *flush = \ 387 (const struct intel_ds_flush_data *) flush_data; \ 388 begin_event(flush->queue, ts_ns, stage); \ 389 } \ 390 \ 391 void \ 392 intel_ds_end_##event_name(struct intel_ds_device *device, \ 393 uint64_t ts_ns, \ 394 const void *flush_data, \ 395 const struct trace_intel_end_##event_name *payload) \ 396 { \ 397 const struct intel_ds_flush_data *flush = \ 398 (const struct intel_ds_flush_data *) flush_data; \ 399 end_event(flush->queue, ts_ns, stage, flush->submission_id, \ 400 payload, \ 401 (trace_payload_as_extra_func) \ 402 &trace_payload_as_extra_intel_end_##event_name); \ 403 } \ 404 405 406CREATE_DUAL_EVENT_CALLBACK(batch, INTEL_DS_QUEUE_STAGE_CMD_BUFFER) 407CREATE_DUAL_EVENT_CALLBACK(cmd_buffer, INTEL_DS_QUEUE_STAGE_CMD_BUFFER) 408CREATE_DUAL_EVENT_CALLBACK(render_pass, INTEL_DS_QUEUE_STAGE_RENDER_PASS) 409CREATE_DUAL_EVENT_CALLBACK(dyn_render_pass, INTEL_DS_QUEUE_STAGE_RENDER_PASS) 410CREATE_DUAL_EVENT_CALLBACK(blorp, INTEL_DS_QUEUE_STAGE_BLORP) 411CREATE_DUAL_EVENT_CALLBACK(draw, INTEL_DS_QUEUE_STAGE_DRAW) 412CREATE_DUAL_EVENT_CALLBACK(draw_indexed, INTEL_DS_QUEUE_STAGE_DRAW) 413CREATE_DUAL_EVENT_CALLBACK(draw_indexed_multi, INTEL_DS_QUEUE_STAGE_DRAW) 414CREATE_DUAL_EVENT_CALLBACK(draw_indexed_indirect, INTEL_DS_QUEUE_STAGE_DRAW) 415CREATE_DUAL_EVENT_CALLBACK(draw_multi, INTEL_DS_QUEUE_STAGE_DRAW) 416CREATE_DUAL_EVENT_CALLBACK(draw_indirect, INTEL_DS_QUEUE_STAGE_DRAW) 417CREATE_DUAL_EVENT_CALLBACK(draw_indirect_count, INTEL_DS_QUEUE_STAGE_DRAW) 418CREATE_DUAL_EVENT_CALLBACK(draw_indirect_byte_count, INTEL_DS_QUEUE_STAGE_DRAW) 419CREATE_DUAL_EVENT_CALLBACK(draw_indexed_indirect_count, INTEL_DS_QUEUE_STAGE_DRAW) 420CREATE_DUAL_EVENT_CALLBACK(compute, INTEL_DS_QUEUE_STAGE_COMPUTE) 421 422void 423intel_ds_begin_stall(struct intel_ds_device *device, 424 uint64_t ts_ns, 425 const void *flush_data, 426 const struct trace_intel_begin_stall *payload) 427{ 428 const struct intel_ds_flush_data *flush = 429 (const struct intel_ds_flush_data *) flush_data; 430 begin_event(flush->queue, ts_ns, INTEL_DS_QUEUE_STAGE_STALL); 431} 432 433void 434intel_ds_end_stall(struct intel_ds_device *device, 435 uint64_t ts_ns, 436 const void *flush_data, 437 const struct trace_intel_end_stall *payload) 438{ 439 const struct intel_ds_flush_data *flush = 440 (const struct intel_ds_flush_data *) flush_data; 441 end_event(flush->queue, ts_ns, INTEL_DS_QUEUE_STAGE_STALL, flush->submission_id, 442 payload, 443 (trace_payload_as_extra_func)custom_trace_payload_as_extra_end_stall); 444} 445 446uint64_t 447intel_ds_begin_submit(struct intel_ds_queue *queue) 448{ 449 return perfetto::base::GetBootTimeNs().count(); 450} 451 452void 453intel_ds_end_submit(struct intel_ds_queue *queue, 454 uint64_t start_ts) 455{ 456 if (!u_trace_context_actively_tracing(&queue->device->trace_context)) { 457 queue->device->sync_gpu_ts = 0; 458 queue->device->next_clock_sync_ns = 0; 459 return; 460 } 461 462 uint64_t end_ts = perfetto::base::GetBootTimeNs().count(); 463 uint32_t submission_id = queue->submission_id++; 464 465 IntelRenderpassDataSource::Trace([=](IntelRenderpassDataSource::TraceContext tctx) { 466 if (auto state = tctx.GetIncrementalState(); state->was_cleared) { 467 send_descriptors(tctx, queue->device); 468 state->was_cleared = false; 469 } 470 471 sync_timestamp(tctx, queue->device); 472 473 auto packet = tctx.NewTracePacket(); 474 475 packet->set_timestamp(start_ts); 476 477 auto event = packet->set_vulkan_api_event(); 478 auto submit = event->set_vk_queue_submit(); 479 480 // submit->set_pid(os_get_pid()); 481 // submit->set_tid(os_get_tid()); 482 submit->set_duration_ns(end_ts - start_ts); 483 submit->set_vk_queue((uintptr_t) queue); 484 submit->set_submission_id(submission_id); 485 }); 486} 487 488#endif /* HAVE_PERFETTO */ 489 490static void 491intel_driver_ds_init_once(void) 492{ 493#ifdef HAVE_PERFETTO 494 util_perfetto_init(); 495 perfetto::DataSourceDescriptor dsd; 496 dsd.set_name("gpu.renderstages.intel"); 497 IntelRenderpassDataSource::Register(dsd); 498#endif 499} 500 501static once_flag intel_driver_ds_once_flag = ONCE_FLAG_INIT; 502 503static uint64_t get_iid() 504{ 505 static uint64_t iid = 1; 506 return iid++; 507} 508 509void 510intel_driver_ds_init(void) 511{ 512 call_once(&intel_driver_ds_once_flag, 513 intel_driver_ds_init_once); 514} 515 516void 517intel_ds_device_init(struct intel_ds_device *device, 518 struct intel_device_info *devinfo, 519 int drm_fd, 520 uint32_t gpu_id, 521 enum intel_ds_api api) 522{ 523 memset(device, 0, sizeof(*device)); 524 525 assert(gpu_id < 128); 526 device->gpu_id = gpu_id; 527 device->gpu_clock_id = intel_pps_clock_id(gpu_id); 528 device->fd = drm_fd; 529 device->info = *devinfo; 530 device->iid = get_iid(); 531 device->api = api; 532 u_vector_init(&device->queues, 4, sizeof(struct intel_ds_queue)); 533} 534 535void 536intel_ds_device_fini(struct intel_ds_device *device) 537{ 538 u_trace_context_fini(&device->trace_context); 539 u_vector_finish(&device->queues); 540} 541 542struct intel_ds_queue * 543intel_ds_device_add_queue(struct intel_ds_device *device, 544 const char *fmt_name, 545 ...) 546{ 547 struct intel_ds_queue *queue = 548 (struct intel_ds_queue *) u_vector_add(&device->queues); 549 va_list ap; 550 551 memset(queue, 0, sizeof(*queue)); 552 553 queue->device = device; 554 queue->queue_id = u_vector_length(&device->queues) - 1; 555 556 va_start(ap, fmt_name); 557 vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap); 558 va_end(ap); 559 560 for (unsigned s = 0; s < INTEL_DS_QUEUE_STAGE_N_STAGES; s++) { 561 queue->stages[s].queue_iid = get_iid(); 562 queue->stages[s].stage_iid = get_iid(); 563 } 564 565 return queue; 566} 567 568void intel_ds_flush_data_init(struct intel_ds_flush_data *data, 569 struct intel_ds_queue *queue, 570 uint64_t submission_id) 571{ 572 memset(data, 0, sizeof(*data)); 573 574 data->queue = queue; 575 data->submission_id = submission_id; 576 577 u_trace_init(&data->trace, &queue->device->trace_context); 578} 579 580void intel_ds_flush_data_fini(struct intel_ds_flush_data *data) 581{ 582 u_trace_fini(&data->trace); 583} 584 585#ifdef __cplusplus 586} 587#endif 588