1/* 2 * Copyright © 2018 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 */

#ifndef INTEL_PERF_H
#define INTEL_PERF_H

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#if defined(MAJOR_IN_SYSMACROS)
#include <sys/sysmacros.h>
#elif defined(MAJOR_IN_MKDEV)
#include <sys/mkdev.h>
#endif

#include "compiler/glsl/list.h"
#include "dev/intel_device_info.h"
#include "util/bitscan.h"
#include "util/hash_table.h"
#include "util/ralloc.h"

#include "drm-uapi/i915_drm.h"

#ifdef __cplusplus
extern "C" {
#endif

struct intel_perf_config;
struct intel_perf_query_info;

/* Sentinel value meaning "no/unknown hardware context ID". */
#define INTEL_PERF_INVALID_CTX_ID (0xffffffff)

/* Semantic classification of a performance counter. */
enum PACKED intel_perf_counter_type {
   INTEL_PERF_COUNTER_TYPE_EVENT,
   INTEL_PERF_COUNTER_TYPE_DURATION_NORM,
   INTEL_PERF_COUNTER_TYPE_DURATION_RAW,
   INTEL_PERF_COUNTER_TYPE_THROUGHPUT,
   INTEL_PERF_COUNTER_TYPE_RAW,
   INTEL_PERF_COUNTER_TYPE_TIMESTAMP,
};

/* Storage type of a counter's value (see
 * intel_perf_query_counter_get_size() for the corresponding sizes).
 */
enum PACKED intel_perf_counter_data_type {
   INTEL_PERF_COUNTER_DATA_TYPE_BOOL32,
   INTEL_PERF_COUNTER_DATA_TYPE_UINT32,
   INTEL_PERF_COUNTER_DATA_TYPE_UINT64,
   INTEL_PERF_COUNTER_DATA_TYPE_FLOAT,
   INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE,
};

/* Unit in which a counter's value is expressed. */
enum PACKED intel_perf_counter_units {
   /* size */
   INTEL_PERF_COUNTER_UNITS_BYTES,

   /* frequency */
   INTEL_PERF_COUNTER_UNITS_HZ,

   /* time */
   INTEL_PERF_COUNTER_UNITS_NS,
   INTEL_PERF_COUNTER_UNITS_US,

   /* items processed */
   INTEL_PERF_COUNTER_UNITS_PIXELS,
   INTEL_PERF_COUNTER_UNITS_TEXELS,
   INTEL_PERF_COUNTER_UNITS_THREADS,
   INTEL_PERF_COUNTER_UNITS_PERCENT,

   /* events */
   INTEL_PERF_COUNTER_UNITS_MESSAGES,
   INTEL_PERF_COUNTER_UNITS_NUMBER,
   INTEL_PERF_COUNTER_UNITS_CYCLES,
   INTEL_PERF_COUNTER_UNITS_EVENTS,
   INTEL_PERF_COUNTER_UNITS_UTILIZATION,

   /* EU <-> L3 cache traffic */
   INTEL_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE,

   INTEL_PERF_COUNTER_UNITS_MAX
};

/* Description of a pipeline statistics counter read from a register, with a
 * numerator/denominator scaling factor.
 */
struct intel_pipeline_stat {
   uint32_t reg;
   uint32_t numerator;
   uint32_t denominator;
};

/*
 * The largest OA formats we can use include:
 * For Haswell:
 *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
 * For Gfx8+
 *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
 *
 * Plus 2 PERF_CNT registers and 1 RPSTAT register.
 */
#define MAX_OA_REPORT_COUNTERS (62 + 2 + 1)

/*
 * We currently allocate only one page for pipeline statistics queries. Here
 * we derive the maximum number of counters for that amount.
 */
#define STATS_BO_SIZE 4096
#define STATS_BO_END_OFFSET_BYTES (STATS_BO_SIZE / 2)
#define MAX_STAT_COUNTERS (STATS_BO_END_OFFSET_BYTES / 8)

#define I915_PERF_OA_SAMPLE_SIZE (8 +   /* drm_i915_perf_record_header */ \
                                  256)  /* OA counter report */

/* Accumulated results of a performance query. */
struct intel_perf_query_result {
   /**
    * Storage for the final accumulated OA counters.
    */
   uint64_t accumulator[MAX_OA_REPORT_COUNTERS];

   /**
    * Hw ID used by the context on which the query was running.
    */
   uint32_t hw_id;

   /**
    * Number of reports accumulated to produce the results.
    */
   uint32_t reports_accumulated;

   /**
    * Frequency in the slices of the GT at the begin and end of the
    * query.
    */
   uint64_t slice_frequency[2];

   /**
    * Frequency in the unslice of the GT at the begin and end of the
    * query.
    */
   uint64_t unslice_frequency[2];

   /**
    * Frequency of the whole GT at the begin and end of the query.
    */
   uint64_t gt_frequency[2];

   /**
    * Timestamp at the begin of the query.
    */
   uint64_t begin_timestamp;

   /**
    * Timestamp at the end of the query.
    */
   uint64_t end_timestamp;

   /**
    * Whether the query was interrupted by another workload (aka preemption).
    */
   bool query_disjoint;
};

/* Callback signatures used to read a uint64/float counter value out of
 * accumulated query results.
 */
typedef uint64_t (*intel_counter_read_uint64_t)(struct intel_perf_config *perf,
                                                const struct intel_perf_query_info *query,
                                                const struct intel_perf_query_result *results);

typedef float (*intel_counter_read_float_t)(struct intel_perf_config *perf,
                                            const struct intel_perf_query_info *query,
                                            const struct intel_perf_query_result *results);

/* Description of a single counter within a query. */
struct intel_perf_query_counter {
   const char *name;
   const char *desc;
   const char *symbol_name;
   const char *category;
   enum intel_perf_counter_type type;
   enum intel_perf_counter_data_type data_type;
   enum intel_perf_counter_units units;
   /* Offset of this counter's value within the query's data buffer. */
   size_t offset;

   /* Maximum-value accessor; which member is valid depends on data_type. */
   union {
      intel_counter_read_uint64_t oa_counter_max_uint64;
      intel_counter_read_float_t oa_counter_max_float;
   };

   /* Value accessor: OA read callback (by data_type) for OA queries, or the
    * register description for pipeline-statistics queries.
    */
   union {
      intel_counter_read_uint64_t oa_counter_read_uint64;
      intel_counter_read_float_t oa_counter_read_float;
      struct intel_pipeline_stat pipeline_stat;
   };
};

/* A single register/value programming pair. */
struct intel_perf_query_register_prog {
   uint32_t reg;
   uint32_t val;
};

/* Register programming for a given query */
struct intel_perf_registers {
   const struct intel_perf_query_register_prog *flex_regs;
   uint32_t n_flex_regs;

   const struct intel_perf_query_register_prog *mux_regs;
   uint32_t n_mux_regs;

   const struct intel_perf_query_register_prog *b_counter_regs;
   uint32_t n_b_counter_regs;
};

/* Description of one performance query (a named set of counters). */
struct intel_perf_query_info {
   struct intel_perf_config *perf;

   enum intel_perf_query_type {
      INTEL_PERF_QUERY_TYPE_OA,       /* Mesa-defined OA metric set */
      INTEL_PERF_QUERY_TYPE_RAW,      /* kernel-advertised OA config */
      INTEL_PERF_QUERY_TYPE_PIPELINE, /* pipeline statistics registers */
   } kind;
   const char *name;
   const char *symbol_name;
   const char *guid;
   struct intel_perf_query_counter *counters;
   int n_counters;
   int max_counters;
   /* Size of the data buffer holding all counter values (see
    * intel_perf_query_counter::offset).
    */
   size_t data_size;

   /* OA specific */
   uint64_t oa_metrics_set_id;
   int oa_format;

   /* For indexing into the accumulator[] ... */
   int gpu_time_offset;
   int gpu_clock_offset;
   int a_offset;
   int b_offset;
   int c_offset;
   int perfcnt_offset;
   int rpstat_offset;

   struct intel_perf_registers config;
};

/* When not using the MI_RPC command, this structure describes the list of
 * register offsets as well as their storage location so that they can be
 * stored through a series of MI_SRM commands and accumulated with
 * intel_perf_query_result_accumulate_fields().
 */
struct intel_perf_query_field_layout {
   /* Alignment for the layout */
   uint32_t alignment;

   /* Size of the whole layout */
   uint32_t size;

   uint32_t n_fields;

   struct intel_perf_query_field {
      /* MMIO location of this register */
      uint16_t mmio_offset;

      /* Location of this register in the storage */
      uint16_t location;

      /* Type of register, for accumulation (see intel_perf_query_info:*_offset
       * fields)
       */
      enum intel_perf_query_field_type {
         INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
      } type;

      /* Index of register in the given type (for instance A31 or B2,
       * etc...)
       */
      uint8_t index;

      /* 4, 8 or 256 */
      uint16_t size;

      /* If not 0, mask to apply to the register value. */
      uint64_t mask;
   } *fields;
};

struct intel_perf_query_counter_info {
   struct intel_perf_query_counter *counter;

   /* Bitmask of the queries this counter appears in (presumably one bit per
    * query index — confirm against how it is built in intel_perf.c); queried
    * via intel_perf_query_counter_info_first_query().
    */
   uint64_t query_mask;

   /**
    * Each counter can be a part of many groups, each time at different index.
    * This struct stores one of those locations.
    */
   struct {
      int group_idx; /* query/group number */
      int counter_idx; /* index inside of query/group */
   } location;
};

/* Top-level performance-metrics state for a device. */
struct intel_perf_config {
   /* Whether i915 has DRM_I915_QUERY_PERF_CONFIG support. */
   bool i915_query_supported;

   /* Version of the i915-perf subsystem, refer to i915_drm.h. */
   int i915_perf_version;

   /* Number of bits to shift the OA timestamp values by to match the ring
    * timestamp.
    */
   int oa_timestamp_shift;

   /* Mask of bits valid from the OA report (for instance you might have the
    * lower 31 bits [30:0] of timestamp value). This is useful if you want to
    * recombine a full timestamp value captured from the CPU with OA
    * timestamps captured on the device but that only include 31bits of data.
    */
   uint64_t oa_timestamp_mask;

   /* Powergating configuration for the running the query. */
   struct drm_i915_gem_context_param_sseu sseu;

   struct intel_perf_query_info *queries;
   int n_queries;

   /* One entry per counter across all queries. */
   struct intel_perf_query_counter_info *counter_infos;
   int n_counters;

   struct intel_perf_query_field_layout query_layout;

   /* Variables referenced in the XML meta data for OA performance
    * counters, e.g in the normalization equations.
    *
    * All uint64_t for consistent operand types in generated code
    */
   struct {
      uint64_t n_eus;               /** $EuCoresTotalCount */
      uint64_t n_eu_slices;         /** $EuSlicesTotalCount */
      uint64_t n_eu_sub_slices;     /** $EuSubslicesTotalCount */
      uint64_t n_eu_slice0123;      /** $EuDualSubslicesSlice0123Count */
      uint64_t slice_mask;          /** $SliceMask */
      uint64_t subslice_mask;       /** $SubsliceMask */
      uint64_t gt_min_freq;         /** $GpuMinFrequency */
      uint64_t gt_max_freq;         /** $GpuMaxFrequency */
      bool query_mode;              /** $QueryMode */
   } sys_vars;

   struct intel_device_info devinfo;

   /* OA metric sets, indexed by GUID, as known by Mesa at build time, to
    * cross-reference with the GUIDs of configs advertised by the kernel at
    * runtime
    */
   struct hash_table *oa_metrics_table;

   /* When MDAPI hasn't configured the metric we need to use by the time the
    * query begins, this OA metric is used as a fallback.
    */
   uint64_t fallback_raw_oa_metric;

   /* Whether we have support for this platform. If true && n_queries == 0,
    * this means we will not be able to use i915-perf because it is in
    * paranoid mode.
    */
   bool platform_supported;

   /* Location of the device's sysfs entry. */
   char sysfs_dev_dir[256];

   /* Buffer-object / batchbuffer hooks, presumably filled in by the driver
    * using this code — keeps this library independent of any particular
    * buffer manager implementation.
    */
   struct {
      void *(*bo_alloc)(void *bufmgr, const char *name, uint64_t size);
      void (*bo_unreference)(void *bo);
      void *(*bo_map)(void *ctx, void *bo, unsigned flags);
      void (*bo_unmap)(void *bo);
      bool (*batch_references)(void *batch, void *bo);
      void (*bo_wait_rendering)(void *bo);
      int (*bo_busy)(void *bo);
      void (*emit_stall_at_pixel_scoreboard)(void *ctx);
      void (*emit_mi_report_perf_count)(void *ctx,
                                        void *bo,
                                        uint32_t offset_in_bytes,
                                        uint32_t report_id);
      void (*batchbuffer_flush)(void *ctx,
                                const char *file, int line);
      void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset);

   } vtbl;
};

/* Location of a counter within a multi-pass query setup: the query and pass
 * on which the counter is collected.
 */
struct intel_perf_counter_pass {
   struct intel_perf_query_info *query;
   struct intel_perf_query_counter *counter;
   uint32_t pass;
};

/** Initialize the intel_perf_config object for a given device.
 *
 *    include_pipeline_statistics : Whether to add a pipeline statistic query
 *                                  intel_perf_query_info object
 *
 *    use_register_snapshots : Whether the queries should include counters
 *                             that rely on register snapshots using command
 *                             streamer instructions (not possible when using
 *                             only the OA buffer data).
 */
void intel_perf_init_metrics(struct intel_perf_config *perf_cfg,
                             const struct intel_device_info *devinfo,
                             int drm_fd,
                             bool include_pipeline_statistics,
                             bool use_register_snapshots);

/** Query i915 for a metric id using guid.
 */
bool intel_perf_load_metric_id(struct intel_perf_config *perf_cfg,
                               const char *guid,
                               uint64_t *metric_id);

/** Load a configuration's content from i915 using a guid.
 */
struct intel_perf_registers *intel_perf_load_configuration(struct intel_perf_config *perf_cfg,
                                                           int fd, const char *guid);

/** Store a configuration into i915 using guid and return a new metric id.
 *
 * If guid is NULL, then a generated one will be provided by hashing the
 * content of the configuration.
 */
uint64_t intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd,
                                        const struct intel_perf_registers *config,
                                        const char *guid);

/** Index of the first set bit of query_mask (1-based, ffsll() semantics);
 * returns 0 when the mask is empty.
 */
static inline unsigned
intel_perf_query_counter_info_first_query(const struct intel_perf_query_counter_info *counter_info)
{
   return ffsll(counter_info->query_mask);
}

/** Read the slice/unslice frequency from 2 OA reports and store them into
 * result.
 */
void intel_perf_query_result_read_frequencies(struct intel_perf_query_result *result,
                                              const struct intel_device_info *devinfo,
                                              const uint32_t *start,
                                              const uint32_t *end);

/** Store the GT frequency as reported by the RPSTAT register.
 */
void intel_perf_query_result_read_gt_frequency(struct intel_perf_query_result *result,
                                               const struct intel_device_info *devinfo,
                                               const uint32_t start,
                                               const uint32_t end);

/** Store PERFCNT registers values.
 */
void intel_perf_query_result_read_perfcnts(struct intel_perf_query_result *result,
                                           const struct intel_perf_query_info *query,
                                           const uint64_t *start,
                                           const uint64_t *end);

/** Accumulate the delta between 2 OA reports into result for a given query.
 */
void intel_perf_query_result_accumulate(struct intel_perf_query_result *result,
                                        const struct intel_perf_query_info *query,
                                        const uint32_t *start,
                                        const uint32_t *end);

/** Read the timestamp value in a report.
 */
uint64_t intel_perf_report_timestamp(const struct intel_perf_query_info *query,
                                     const uint32_t *report);

/** Accumulate the delta between 2 snapshots of OA perf registers (layout
 * should match description specified through intel_perf_query_field_layout).
 */
void intel_perf_query_result_accumulate_fields(struct intel_perf_query_result *result,
                                               const struct intel_perf_query_info *query,
                                               const void *start,
                                               const void *end,
                                               bool no_oa_accumulate);

/** Reset a result so it can accumulate a new set of reports. */
void intel_perf_query_result_clear(struct intel_perf_query_result *result);

/** Debug helper printing out query data.
 */
void intel_perf_query_result_print_fields(const struct intel_perf_query_info *query,
                                          const void *data);

/** Size in bytes of a counter's value, derived from its data type. */
static inline size_t
intel_perf_query_counter_get_size(const struct intel_perf_query_counter *counter)
{
   switch (counter->data_type) {
   case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
      return sizeof(uint32_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
      return sizeof(uint32_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
      return sizeof(uint64_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
      return sizeof(float);
   case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
      return sizeof(double);
   default:
      unreachable("invalid counter data type");
   }
}

/** Allocate a zero-initialized intel_perf_config (ralloc'ed out of ctx). */
static inline struct intel_perf_config *
intel_perf_new(void *ctx)
{
   struct intel_perf_config *perf = rzalloc(ctx, struct intel_perf_config);
   return perf;
}

/** Whether we have the ability to hold off preemption on a batch so we don't
 * have to look at the OA buffer to subtract unrelated workloads off the
 * values captured through MI_* commands.
 */
static inline bool
intel_perf_has_hold_preemption(const struct intel_perf_config *perf)
{
   return perf->i915_perf_version >= 3;
}

/** Whether we have the ability to lock EU array power configuration for the
 * duration of the performance recording. This is useful on Gfx11 where the HW
 * architecture requires half the EU for particular workloads.
 */
static inline bool
intel_perf_has_global_sseu(const struct intel_perf_config *perf)
{
   return perf->i915_perf_version >= 4;
}

/* NOTE(review): semantics inferred from the signatures and
 * struct intel_perf_counter_pass — confirm against the implementation.
 * Presumably computes the number of passes needed to collect the given
 * counters, filling pass_queries with the query used on each pass.
 */
uint32_t intel_perf_get_n_passes(struct intel_perf_config *perf,
                                 const uint32_t *counter_indices,
                                 uint32_t counter_indices_count,
                                 struct intel_perf_query_info **pass_queries);

/* Presumably fills counter_pass with the (query, counter, pass) location of
 * each requested counter — confirm against the implementation.
 */
void intel_perf_get_counters_passes(struct intel_perf_config *perf,
                                    const uint32_t *counter_indices,
                                    uint32_t counter_indices_count,
                                    struct intel_perf_counter_pass *counter_pass);

#ifdef __cplusplus
} // extern "C"
#endif

#endif /* INTEL_PERF_H */