/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <dirent.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

#ifndef HAVE_DIRENT_D_TYPE
#include <limits.h> // PATH_MAX
#endif

#include <drm-uapi/i915_drm.h>

#include "common/intel_gem.h"

#include "dev/intel_debug.h"
#include "dev/intel_device_info.h"

#include "perf/intel_perf.h"
#include "perf/intel_perf_regs.h"
#include "perf/intel_perf_mdapi.h"
#include "perf/intel_perf_metrics.h"
#include "perf/intel_perf_private.h"

#include "util/bitscan.h"
#include "util/macros.h"
#include "util/mesa-sha1.h"
#include "util/u_math.h"

#define FILE_DEBUG_FLAG DEBUG_PERFMON

static bool
is_dir_or_link(const struct dirent *entry, const char *parent_dir)
{
#ifdef HAVE_DIRENT_D_TYPE
   return entry->d_type == DT_DIR || entry->d_type == DT_LNK;
#else
   struct stat st;
   char path[PATH_MAX + 1];
   snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name);
   lstat(path, &st);
   return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode);
#endif
}

static bool
get_sysfs_dev_dir(struct intel_perf_config *perf, int fd)
{
   struct stat sb;
   int min, maj;
   DIR *drmdir;
   struct dirent *drm_entry;
   int len;

   perf->sysfs_dev_dir[0] = '\0';

   if (INTEL_DEBUG(DEBUG_NO_OACONFIG))
      return true;

   if (fstat(fd, &sb)) {
      DBG("Failed to stat DRM fd\n");
      return false;
   }

   maj = major(sb.st_rdev);
   min = minor(sb.st_rdev);

   if (!S_ISCHR(sb.st_mode)) {
      DBG("DRM fd is not a character device as expected\n");
      return false;
   }

   len = snprintf(perf->sysfs_dev_dir,
                  sizeof(perf->sysfs_dev_dir),
                  "/sys/dev/char/%d:%d/device/drm", maj, min);
   if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) {
      DBG("Failed to concatenate sysfs path to drm device\n");
      return false;
   }

   drmdir = opendir(perf->sysfs_dev_dir);
   if (!drmdir) {
      DBG("Failed to open %s: %m\n", perf->sysfs_dev_dir);
      return false;
   }

   while ((drm_entry = readdir(drmdir))) {
      if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) &&
          strncmp(drm_entry->d_name, "card", 4) == 0)
      {
         len = snprintf(perf->sysfs_dev_dir,
                        sizeof(perf->sysfs_dev_dir),
                        "/sys/dev/char/%d:%d/device/drm/%s",
                        maj, min, drm_entry->d_name);
         closedir(drmdir);
         if (len < 0 || len >= sizeof(perf->sysfs_dev_dir))
            return false;
         else
            return true;
      }
   }

   closedir(drmdir);

   DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
       maj, min);

   return false;
}

static bool
read_file_uint64(const char *file, uint64_t *val)
{
   char buf[32];
   int fd, n;

   fd = open(file, 0);
   if (fd < 0)
      return false;
   while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
          errno == EINTR);
   close(fd);
   if (n < 0)
      return false;

   buf[n] = '\0';
   *val = strtoull(buf, NULL, 0);

   return true;
}

static bool
read_sysfs_drm_device_file_uint64(struct intel_perf_config *perf,
                                  const char *file,
                                  uint64_t *value)
{
   char buf[512];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate sys filename to read u64 from\n");
      return false;
   }

   return read_file_uint64(buf, value);
}

static void
register_oa_config(struct intel_perf_config *perf,
                   const struct intel_device_info *devinfo,
                   const struct intel_perf_query_info *query,
                   uint64_t config_id)
{
   struct intel_perf_query_info *registered_query =
      intel_perf_append_query_info(perf, 0);

   *registered_query = *query;
   registered_query->oa_metrics_set_id = config_id;
   DBG("metric set registered: id = %" PRIu64", guid = %s\n",
       registered_query->oa_metrics_set_id, query->guid);
}

static void
enumerate_sysfs_metrics(struct intel_perf_config *perf,
                        const struct intel_device_info *devinfo)
{
   DIR *metricsdir = NULL;
   struct dirent *metric_entry;
   char buf[256];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate path to sysfs metrics/ directory\n");
      return;
   }

   metricsdir = opendir(buf);
   if (!metricsdir) {
      DBG("Failed to open %s: %m\n", buf);
      return;
   }

   while ((metric_entry = readdir(metricsdir))) {
      struct hash_entry *entry;
      if (!is_dir_or_link(metric_entry, buf) ||
          metric_entry->d_name[0] == '.')
         continue;

      DBG("metric set: %s\n", metric_entry->d_name);
      entry = _mesa_hash_table_search(perf->oa_metrics_table,
                                      metric_entry->d_name);
      if (entry) {
         uint64_t id;
         if (!intel_perf_load_metric_id(perf, metric_entry->d_name, &id)) {
            DBG("Failed to read metric set id from %s: %m", buf);
            continue;
         }

         register_oa_config(perf, devinfo,
                            (const struct intel_perf_query_info *)entry->data, id);
      } else
         DBG("metric set not known by mesa (skipping)\n");
   }

   closedir(metricsdir);
}

static void
add_all_metrics(struct intel_perf_config *perf,
                const struct intel_device_info *devinfo)
{
   hash_table_foreach(perf->oa_metrics_table, entry) {
      const struct intel_perf_query_info *query = entry->data;
      register_oa_config(perf, devinfo, query, 0);
   }
}

static bool
kernel_has_dynamic_config_support(struct intel_perf_config *perf, int fd)
{
   uint64_t invalid_config_id = UINT64_MAX;

   return intel_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
                      &invalid_config_id) < 0 && errno == ENOENT;
}
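
/* Check whether the kernel supports the DRM_I915_QUERY_PERF_CONFIG query by
 * asking only for the length of the configuration list.
 */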
static bool
i915_query_perf_config_supported(struct intel_perf_config *perf, int fd)
{
   int32_t length = 0;
   return !intel_i915_query_flags(fd, DRM_I915_QUERY_PERF_CONFIG,
                                  DRM_I915_QUERY_PERF_CONFIG_LIST,
                                  NULL, &length);
}

static bool
i915_query_perf_config_data(struct intel_perf_config *perf,
                            int fd, const char *guid,
                            struct drm_i915_perf_oa_config *config)
{
   char data[sizeof(struct drm_i915_query_perf_config) +
             sizeof(struct drm_i915_perf_oa_config)] = {};
   struct drm_i915_query_perf_config *query = (void *)data;

   memcpy(query->uuid, guid, sizeof(query->uuid));
   memcpy(query->data, config, sizeof(*config));

   int32_t item_length = sizeof(data);
   if (intel_i915_query_flags(fd, DRM_I915_QUERY_PERF_CONFIG,
                              DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID,
                              query, &item_length))
      return false;

   memcpy(config, query->data, sizeof(*config));

   return true;
}

bool
intel_perf_load_metric_id(struct intel_perf_config *perf_cfg,
                          const char *guid,
                          uint64_t *metric_id)
{
   char config_path[280];

   snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
            perf_cfg->sysfs_dev_dir, guid);

   /* Don't recreate already loaded configs. */
   return read_file_uint64(config_path, metric_id);
}
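
/* Register an OA configuration with i915. Returns the metric set id
 * allocated by the kernel on success, 0 otherwise.
 */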
static uint64_t
i915_add_config(struct intel_perf_config *perf, int fd,
                const struct intel_perf_registers *config,
                const char *guid)
{
   struct drm_i915_perf_oa_config i915_config = { 0, };

   memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid));

   i915_config.n_mux_regs = config->n_mux_regs;
   i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs);

   i915_config.n_boolean_regs = config->n_b_counter_regs;
   i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs);

   i915_config.n_flex_regs = config->n_flex_regs;
   i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs);

   int ret = intel_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config);
   return ret > 0 ? ret : 0;
}

static void
init_oa_configs(struct intel_perf_config *perf, int fd,
                const struct intel_device_info *devinfo)
{
   hash_table_foreach(perf->oa_metrics_table, entry) {
      const struct intel_perf_query_info *query = entry->data;
      uint64_t config_id;

      if (intel_perf_load_metric_id(perf, query->guid, &config_id)) {
         DBG("metric set: %s (already loaded)\n", query->guid);
         register_oa_config(perf, devinfo, query, config_id);
         continue;
      }

      int ret = i915_add_config(perf, fd, &query->config, query->guid);
      if (ret < 0) {
         DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
             query->name, query->guid, strerror(errno));
         continue;
      }

      register_oa_config(perf, devinfo, query, ret);
      DBG("metric set: %s (added)\n", query->guid);
   }
}

static void
compute_topology_builtins(struct intel_perf_config *perf)
{
   const struct intel_device_info *devinfo = &perf->devinfo;

   perf->sys_vars.slice_mask = devinfo->slice_masks;
   perf->sys_vars.n_eu_slices = devinfo->num_slices;

   perf->sys_vars.n_eu_slice0123 = 0;
   for (int s = 0; s < MIN2(4, devinfo->max_slices); s++) {
      if (!intel_device_info_slice_available(devinfo, s))
         continue;

      for (int ss = 0; ss < devinfo->max_subslices_per_slice; ss++) {
         if (!intel_device_info_subslice_available(devinfo, s, ss))
            continue;

         for (int eu = 0; eu < devinfo->max_eus_per_subslice; eu++) {
            if (intel_device_info_eu_available(devinfo, s, ss, eu))
               perf->sys_vars.n_eu_slice0123++;
         }
      }
   }

   for (int i = 0; i < sizeof(devinfo->subslice_masks[i]); i++) {
      perf->sys_vars.n_eu_sub_slices +=
         util_bitcount(devinfo->subslice_masks[i]);
   }

   for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
      perf->sys_vars.n_eus += util_bitcount(devinfo->eu_masks[i]);

   /* The subslice mask builtin contains bits for all slices. Prior to Gfx11
    * it had groups of 3 bits for each slice; on Gfx11 and above it is 8 bits
    * for each slice.
    *
    * Ideally equations would be updated to have a slice/subslice query
    * function/operator.
    */
   perf->sys_vars.subslice_mask = 0;

   int bits_per_subslice = devinfo->ver >= 11 ? 8 : 3;

   for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
      for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
         if (intel_device_info_subslice_available(devinfo, s, ss))
            perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss);
      }
   }
}

static bool
init_oa_sys_vars(struct intel_perf_config *perf,
                 bool use_register_snapshots)
{
   uint64_t min_freq_mhz = 0, max_freq_mhz = 0;

   if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
      if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz))
         return false;

      if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz))
         return false;
   } else {
      min_freq_mhz = 300;
      max_freq_mhz = 1000;
   }

   memset(&perf->sys_vars, 0, sizeof(perf->sys_vars));
   perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000;
   perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
   perf->sys_vars.query_mode = use_register_snapshots;
   compute_topology_builtins(perf);

   return true;
}

typedef void (*perf_register_oa_queries_t)(struct intel_perf_config *);

static perf_register_oa_queries_t
get_register_queries_function(const struct intel_device_info *devinfo)
{
   switch (devinfo->platform) {
   case INTEL_PLATFORM_HSW:
      return intel_oa_register_queries_hsw;
   case INTEL_PLATFORM_CHV:
      return intel_oa_register_queries_chv;
   case INTEL_PLATFORM_BDW:
      return intel_oa_register_queries_bdw;
   case INTEL_PLATFORM_BXT:
      return intel_oa_register_queries_bxt;
   case INTEL_PLATFORM_SKL:
      if (devinfo->gt == 2)
         return intel_oa_register_queries_sklgt2;
      if (devinfo->gt == 3)
         return intel_oa_register_queries_sklgt3;
      if (devinfo->gt == 4)
         return intel_oa_register_queries_sklgt4;
      return NULL;
   case INTEL_PLATFORM_KBL:
      if (devinfo->gt == 2)
         return intel_oa_register_queries_kblgt2;
      if (devinfo->gt == 3)
         return intel_oa_register_queries_kblgt3;
      return NULL;
   case INTEL_PLATFORM_GLK:
      return intel_oa_register_queries_glk;
   case INTEL_PLATFORM_CFL:
      if (devinfo->gt == 2)
         return intel_oa_register_queries_cflgt2;
      if (devinfo->gt == 3)
         return intel_oa_register_queries_cflgt3;
      return NULL;
   case INTEL_PLATFORM_ICL:
      return intel_oa_register_queries_icl;
   case INTEL_PLATFORM_EHL:
      return intel_oa_register_queries_ehl;
   case INTEL_PLATFORM_TGL:
      if (devinfo->gt == 1)
         return intel_oa_register_queries_tglgt1;
      if (devinfo->gt == 2)
         return intel_oa_register_queries_tglgt2;
      return NULL;
   case INTEL_PLATFORM_RKL:
      return intel_oa_register_queries_rkl;
   case INTEL_PLATFORM_DG1:
      return intel_oa_register_queries_dg1;
   case INTEL_PLATFORM_ADL:
      return intel_oa_register_queries_adl;
   default:
      return NULL;
   }
}

static int
intel_perf_compare_counter_names(const void *v1, const void *v2)
{
   const struct intel_perf_query_counter *c1 = v1;
   const struct intel_perf_query_counter *c2 = v2;

   return strcmp(c1->name, c2->name);
}

static void
sort_query(struct intel_perf_query_info *q)
{
   qsort(q->counters, q->n_counters, sizeof(q->counters[0]),
         intel_perf_compare_counter_names);
}
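
/* Build the pipeline statistics query out of the MMIO statistic registers
 * (vertex/primitive counts, shader invocation counts, depth-pass count).
 */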
static void
load_pipeline_statistic_metrics(struct intel_perf_config *perf_cfg,
                                const struct intel_device_info *devinfo)
{
   struct intel_perf_query_info *query =
      intel_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS);

   query->kind = INTEL_PERF_QUERY_TYPE_PIPELINE;
   query->name = "Pipeline Statistics Registers";

   intel_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT,
                                       "N vertices submitted");
   intel_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
                                       "N primitives submitted");
   intel_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT,
                                       "N vertex shader invocations");

   if (devinfo->ver == 6) {
      intel_perf_query_add_stat_reg(query, GFX6_SO_PRIM_STORAGE_NEEDED, 1, 1,
                                    "SO_PRIM_STORAGE_NEEDED",
                                    "N geometry shader stream-out primitives (total)");
      intel_perf_query_add_stat_reg(query, GFX6_SO_NUM_PRIMS_WRITTEN, 1, 1,
                                    "SO_NUM_PRIMS_WRITTEN",
                                    "N geometry shader stream-out primitives (written)");
   } else {
      intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
                                    "SO_PRIM_STORAGE_NEEDED (Stream 0)",
                                    "N stream-out (stream 0) primitives (total)");
      intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
                                    "SO_PRIM_STORAGE_NEEDED (Stream 1)",
                                    "N stream-out (stream 1) primitives (total)");
      intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
                                    "SO_PRIM_STORAGE_NEEDED (Stream 2)",
                                    "N stream-out (stream 2) primitives (total)");
      intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
                                    "SO_PRIM_STORAGE_NEEDED (Stream 3)",
                                    "N stream-out (stream 3) primitives (total)");
      intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
                                    "SO_NUM_PRIMS_WRITTEN (Stream 0)",
                                    "N stream-out (stream 0) primitives (written)");
      intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
                                    "SO_NUM_PRIMS_WRITTEN (Stream 1)",
                                    "N stream-out (stream 1) primitives (written)");
      intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
                                    "SO_NUM_PRIMS_WRITTEN (Stream 2)",
                                    "N stream-out (stream 2) primitives (written)");
      intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
                                    "SO_NUM_PRIMS_WRITTEN (Stream 3)",
                                    "N stream-out (stream 3) primitives (written)");
   }

   intel_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT,
                                       "N TCS shader invocations");
   intel_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT,
                                       "N TES shader invocations");

   intel_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT,
                                       "N geometry shader invocations");
   intel_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
                                       "N geometry shader primitives emitted");

   intel_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT,
                                       "N primitives entering clipping");
   intel_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
                                       "N primitives leaving clipping");

   if (devinfo->verx10 == 75 || devinfo->ver == 8) {
      intel_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
                                    "N fragment shader invocations",
                                    "N fragment shader invocations");
   } else {
      intel_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT,
                                          "N fragment shader invocations");
   }

   intel_perf_query_add_basic_stat_reg(query, PS_DEPTH_COUNT,
                                       "N z-pass fragments");

   if (devinfo->ver >= 7) {
      intel_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                                          "N compute shader invocations");
   }

   query->data_size = sizeof(uint64_t) * query->n_counters;

   sort_query(query);
}

static int
i915_perf_version(int drm_fd)
{
   int tmp;
   drm_i915_getparam_t gp = {
      .param = I915_PARAM_PERF_REVISION,
      .value = &tmp,
   };

   int ret = intel_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);

   /* Return 0 if this getparam is not supported; the first version supported
    * is 1.
    */
   return ret < 0 ? 0 : tmp;
}

static void
i915_get_sseu(int drm_fd, struct drm_i915_gem_context_param_sseu *sseu)
{
   struct drm_i915_gem_context_param arg = {
      .param = I915_CONTEXT_PARAM_SSEU,
      .size = sizeof(*sseu),
      .value = to_user_pointer(sseu)
   };

   intel_ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg);
}

static inline int
compare_str_or_null(const char *s1, const char *s2)
{
   if (s1 == NULL && s2 == NULL)
      return 0;
   if (s1 == NULL)
      return -1;
   if (s2 == NULL)
      return 1;

   return strcmp(s1, s2);
}

static int
compare_counter_categories_and_names(const void *_c1, const void *_c2)
{
   const struct intel_perf_query_counter_info *c1 = (const struct intel_perf_query_counter_info *)_c1;
   const struct intel_perf_query_counter_info *c2 = (const struct intel_perf_query_counter_info *)_c2;

   /* pipeline counters don't have an assigned category */
   int r = compare_str_or_null(c1->counter->category, c2->counter->category);
   if (r)
      return r;

   return strcmp(c1->counter->name, c2->counter->name);
}

static void
build_unique_counter_list(struct intel_perf_config *perf)
{
   assert(perf->n_queries < 64);

   size_t max_counters = 0;

   for (int q = 0; q < perf->n_queries; q++)
      max_counters += perf->queries[q].n_counters;

   /*
    * Allocate a big enough array to hold the maximum possible number of
    * counters. We can't alloc it small and realloc when needed because the
    * hash table below contains pointers to this array.
    */
   struct intel_perf_query_counter_info *counter_infos =
      ralloc_array_size(perf, sizeof(counter_infos[0]), max_counters);

   perf->n_counters = 0;

   struct hash_table *counters_table =
      _mesa_hash_table_create(perf,
                              _mesa_hash_string,
                              _mesa_key_string_equal);
   struct hash_entry *entry;
   for (int q = 0; q < perf->n_queries; q++) {
      struct intel_perf_query_info *query = &perf->queries[q];

      for (int c = 0; c < query->n_counters; c++) {
         struct intel_perf_query_counter *counter;
         struct intel_perf_query_counter_info *counter_info;

         counter = &query->counters[c];
         entry = _mesa_hash_table_search(counters_table, counter->symbol_name);

         if (entry) {
            counter_info = entry->data;
            counter_info->query_mask |= BITFIELD64_BIT(q);
            continue;
         }
         assert(perf->n_counters < max_counters);

         counter_info = &counter_infos[perf->n_counters++];
         counter_info->counter = counter;
         counter_info->query_mask = BITFIELD64_BIT(q);

         counter_info->location.group_idx = q;
         counter_info->location.counter_idx = c;

         _mesa_hash_table_insert(counters_table, counter->symbol_name, counter_info);
      }
   }

   _mesa_hash_table_destroy(counters_table, NULL);

   /* Now we can realloc the counter_infos array because the hash table
    * holding pointers into it is gone.
    */
   perf->counter_infos = reralloc_array_size(perf, counter_infos,
                                             sizeof(counter_infos[0]), perf->n_counters);

   qsort(perf->counter_infos, perf->n_counters, sizeof(perf->counter_infos[0]),
         compare_counter_categories_and_names);
}
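
/* Check whether OA metrics can be used at all: the i915 perf interface must
 * be present (the perf_stream_paranoid sysctl exists), Mesa must have metric
 * definitions for the platform, and access must be allowed (root, or
 * perf_stream_paranoid == 0 on Gfx8+).
 */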
static bool
oa_metrics_available(struct intel_perf_config *perf, int fd,
                     const struct intel_device_info *devinfo,
                     bool use_register_snapshots)
{
   perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
   bool i915_perf_oa_available = false;
   struct stat sb;

   perf->devinfo = *devinfo;
   perf->i915_query_supported = i915_query_perf_config_supported(perf, fd);
   perf->i915_perf_version = i915_perf_version(fd);

   /* TODO: We should query this from i915 */
   if (intel_device_info_is_dg2(devinfo))
      perf->oa_timestamp_shift = 1;

   perf->oa_timestamp_mask =
      0xffffffffffffffffull >> (32 + perf->oa_timestamp_shift);

   /* Record the default SSEU configuration. */
   i915_get_sseu(fd, &perf->sseu);

   /* The existence of this sysctl parameter implies the kernel supports
    * the i915 perf interface.
    */
   if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {

      /* If _paranoid == 1 then on Gfx8+ we won't be able to access OA
       * metrics unless running as root.
       */
      if (devinfo->platform == INTEL_PLATFORM_HSW)
         i915_perf_oa_available = true;
      else {
         uint64_t paranoid = 1;

         read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);

         if (paranoid == 0 || geteuid() == 0)
            i915_perf_oa_available = true;
      }

      perf->platform_supported = oa_register != NULL;
   }

   return i915_perf_oa_available &&
          oa_register &&
          get_sysfs_dev_dir(perf, fd) &&
          init_oa_sys_vars(perf, use_register_snapshots);
}

static void
load_oa_metrics(struct intel_perf_config *perf, int fd,
                const struct intel_device_info *devinfo)
{
   int existing_queries = perf->n_queries;

   perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);

   perf->oa_metrics_table =
      _mesa_hash_table_create(perf, _mesa_hash_string,
                              _mesa_key_string_equal);

   /* Index all the metric sets mesa knows about before looking to see what
    * the kernel is advertising.
    */
   oa_register(perf);

   if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
      if (kernel_has_dynamic_config_support(perf, fd))
         init_oa_configs(perf, fd, devinfo);
      else
         enumerate_sysfs_metrics(perf, devinfo);
   } else {
      add_all_metrics(perf, devinfo);
   }

   /* sort counters in each individual group created by this function by name */
   for (int i = existing_queries; i < perf->n_queries; ++i)
      sort_query(&perf->queries[i]);

   /* Select a fallback OA metric. Look for the TestOa metric or use the last
    * one if not present (on HSW).
    */
   for (int i = existing_queries; i < perf->n_queries; i++) {
      if (perf->queries[i].symbol_name &&
          strcmp(perf->queries[i].symbol_name, "TestOa") == 0) {
         perf->fallback_raw_oa_metric = perf->queries[i].oa_metrics_set_id;
         break;
      }
   }
   if (perf->fallback_raw_oa_metric == 0 && perf->n_queries > 0)
      perf->fallback_raw_oa_metric = perf->queries[perf->n_queries - 1].oa_metrics_set_id;
}

struct intel_perf_registers *
intel_perf_load_configuration(struct intel_perf_config *perf_cfg, int fd, const char *guid)
{
   if (!perf_cfg->i915_query_supported)
      return NULL;

   struct drm_i915_perf_oa_config i915_config = { 0, };
   if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config))
      return NULL;

   struct intel_perf_registers *config = rzalloc(NULL, struct intel_perf_registers);
   config->n_flex_regs = i915_config.n_flex_regs;
   config->flex_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_flex_regs);
   config->n_mux_regs = i915_config.n_mux_regs;
   config->mux_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_mux_regs);
   config->n_b_counter_regs = i915_config.n_boolean_regs;
   config->b_counter_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_b_counter_regs);

   /*
    * struct intel_perf_query_register_prog maps exactly to the tuple of
    * (register offset, register value) returned by the i915.
    */
   i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs);
   i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs);
   i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs);
   if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) {
      ralloc_free(config);
      return NULL;
   }

   return config;
}

uint64_t
intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd,
                               const struct intel_perf_registers *config,
                               const char *guid)
{
   if (guid)
      return i915_add_config(perf_cfg, fd, config, guid);

   struct mesa_sha1 sha1_ctx;
   _mesa_sha1_init(&sha1_ctx);

   if (config->flex_regs) {
      _mesa_sha1_update(&sha1_ctx, config->flex_regs,
                        sizeof(config->flex_regs[0]) *
                        config->n_flex_regs);
   }
   if (config->mux_regs) {
      _mesa_sha1_update(&sha1_ctx, config->mux_regs,
                        sizeof(config->mux_regs[0]) *
                        config->n_mux_regs);
   }
   if (config->b_counter_regs) {
      _mesa_sha1_update(&sha1_ctx, config->b_counter_regs,
                        sizeof(config->b_counter_regs[0]) *
                        config->n_b_counter_regs);
   }

   uint8_t hash[20];
   _mesa_sha1_final(&sha1_ctx, hash);

   char formatted_hash[41];
   _mesa_sha1_format(formatted_hash, hash);

   char generated_guid[37];
   snprintf(generated_guid, sizeof(generated_guid),
            "%.8s-%.4s-%.4s-%.4s-%.12s",
            &formatted_hash[0], &formatted_hash[8],
            &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4],
            &formatted_hash[8 + 4 + 4 + 4]);

   /* Check if already present. */
   uint64_t id;
   if (intel_perf_load_metric_id(perf_cfg, generated_guid, &id))
      return id;

   return i915_add_config(perf_cfg, fd, config, generated_guid);
}

static uint64_t
get_passes_mask(struct intel_perf_config *perf,
                const uint32_t *counter_indices,
                uint32_t counter_indices_count)
{
   uint64_t queries_mask = 0;

   assert(perf->n_queries < 64);

   /* Compute the number of passes by going through all counters N times (with
    * N the number of queries) to make sure we select the most constraining
    * counters first and look at the more flexible ones (that could be
    * obtained from multiple queries) later. That way we minimize the number
    * of passes required.
    */
   for (uint32_t q = 0; q < perf->n_queries; q++) {
      for (uint32_t i = 0; i < counter_indices_count; i++) {
         assert(counter_indices[i] < perf->n_counters);

         uint32_t idx = counter_indices[i];
         if (util_bitcount64(perf->counter_infos[idx].query_mask) != (q + 1))
            continue;

         if (queries_mask & perf->counter_infos[idx].query_mask)
            continue;

         queries_mask |= BITFIELD64_BIT(ffsll(perf->counter_infos[idx].query_mask) - 1);
      }
   }

   return queries_mask;
}

uint32_t
intel_perf_get_n_passes(struct intel_perf_config *perf,
                        const uint32_t *counter_indices,
                        uint32_t counter_indices_count,
                        struct intel_perf_query_info **pass_queries)
{
   uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count);

   if (pass_queries) {
      uint32_t pass = 0;
      for (uint32_t q = 0; q < perf->n_queries; q++) {
         if ((1ULL << q) & queries_mask)
            pass_queries[pass++] = &perf->queries[q];
      }
   }

   return util_bitcount64(queries_mask);
}

void
intel_perf_get_counters_passes(struct intel_perf_config *perf,
                               const uint32_t *counter_indices,
                               uint32_t counter_indices_count,
                               struct intel_perf_counter_pass *counter_pass)
{
   uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count);
   ASSERTED uint32_t n_passes = util_bitcount64(queries_mask);

   for (uint32_t i = 0; i < counter_indices_count; i++) {
      assert(counter_indices[i] < perf->n_counters);

      uint32_t idx = counter_indices[i];
      counter_pass[i].counter = perf->counter_infos[idx].counter;

      uint32_t query_idx = ffsll(perf->counter_infos[idx].query_mask & queries_mask) - 1;
      counter_pass[i].query = &perf->queries[query_idx];

      uint32_t clear_bits = 63 - query_idx;
      counter_pass[i].pass = util_bitcount64((queries_mask << clear_bits) >> clear_bits) - 1;
      assert(counter_pass[i].pass < n_passes);
   }
}

/* Accumulate 32bits OA counters */
static inline void
accumulate_uint32(const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   *accumulator += (uint32_t)(*report1 - *report0);
}

/* Accumulate 40bits OA counters */
static inline void
accumulate_uint40(int a_index,
                  const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
   uint64_t value0 = report0[a_index + 4] | high0;
   uint64_t value1 = report1[a_index + 4] | high1;
   uint64_t delta;

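   /* The 40-bit counters can wrap between the two reports. When the end
    * value is below the start value, add back one full 2^40 period. With
    * hypothetical values value0 = 0xFFFFFFFFF0 and value1 = 0x10, this gives
    * delta = (1ULL << 40) + 0x10 - 0xFFFFFFFFF0 = 0x20.
    */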
   if (value0 > value1)
      delta = (1ULL << 40) + value1 - value0;
   else
      delta = value1 - value0;

   *accumulator += delta;
}

static void
gfx8_read_report_clock_ratios(const uint32_t *report,
                              uint64_t *slice_freq_hz,
                              uint64_t *unslice_freq_hz)
{
   /* The lower 16bits of the RPT_ID field of the OA reports contains a
    * snapshot of the bits coming from the RP_FREQ_NORMAL register and is
    * divided this way:
    *
    * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
    * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
    * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
    *
    * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    *
    * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    */

   uint32_t unslice_freq = report[0] & 0x1ff;
   uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
   uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
   uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);

   *slice_freq_hz = slice_freq * 16666667ULL;
   *unslice_freq_hz = unslice_freq * 16666667ULL;
}

void
intel_perf_query_result_read_frequencies(struct intel_perf_query_result *result,
                                         const struct intel_device_info *devinfo,
                                         const uint32_t *start,
                                         const uint32_t *end)
{
   /* Slice/Unslice frequency is only available in the OA reports when the
    * "Disable OA reports due to clock ratio change" field in
    * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
    * global register (see drivers/gpu/drm/i915/i915_perf.c)
    *
    * Documentation says this should be available on Gfx9+ but experimentation
    * shows that Gfx8 reports similar values, so we enable it there too.
    */
   if (devinfo->ver < 8)
      return;

   gfx8_read_report_clock_ratios(start,
                                 &result->slice_frequency[0],
                                 &result->unslice_frequency[0]);
   gfx8_read_report_clock_ratios(end,
                                 &result->slice_frequency[1],
                                 &result->unslice_frequency[1]);
}

static inline bool
can_use_mi_rpc_bc_counters(const struct intel_device_info *devinfo)
{
   return devinfo->ver <= 11;
}

uint64_t
intel_perf_report_timestamp(const struct intel_perf_query_info *query,
                            const uint32_t *report)
{
   return report[1] >> query->perf->oa_timestamp_shift;
}

void
intel_perf_query_result_accumulate(struct intel_perf_query_result *result,
                                   const struct intel_perf_query_info *query,
                                   const uint32_t *start,
                                   const uint32_t *end)
{
   int i;

   if (result->hw_id == INTEL_PERF_INVALID_CTX_ID &&
       start[2] != INTEL_PERF_INVALID_CTX_ID)
      result->hw_id = start[2];
   if (result->reports_accumulated == 0)
      result->begin_timestamp = intel_perf_report_timestamp(query, start);
   result->end_timestamp = intel_perf_report_timestamp(query, end);
   result->reports_accumulated++;

   switch (query->oa_format) {
   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
      result->accumulator[query->gpu_time_offset] =
         intel_perf_report_timestamp(query, end) -
         intel_perf_report_timestamp(query, start);

      accumulate_uint32(start + 3, end + 3,
                        result->accumulator + query->gpu_clock_offset); /* clock */

      /* 32x 40bit A counters... */
      for (i = 0; i < 32; i++) {
         accumulate_uint40(i, start, end,
                           result->accumulator + query->a_offset + i);
      }

      /* 4x 32bit A counters... */
      for (i = 0; i < 4; i++) {
         accumulate_uint32(start + 36 + i, end + 36 + i,
                           result->accumulator + query->a_offset + 32 + i);
      }

      if (can_use_mi_rpc_bc_counters(&query->perf->devinfo)) {
         /* 8x 32bit B counters */
         for (i = 0; i < 8; i++) {
            accumulate_uint32(start + 48 + i, end + 48 + i,
                              result->accumulator + query->b_offset + i);
         }

         /* 8x 32bit C counters... */
         for (i = 0; i < 8; i++) {
            accumulate_uint32(start + 56 + i, end + 56 + i,
                              result->accumulator + query->c_offset + i);
         }
      }
      break;

   case I915_OA_FORMAT_A45_B8_C8:
      result->accumulator[query->gpu_time_offset] =
         intel_perf_report_timestamp(query, end) -
         intel_perf_report_timestamp(query, start);

      for (i = 0; i < 61; i++) {
         accumulate_uint32(start + 3 + i, end + 3 + i,
                           result->accumulator + query->a_offset + i);
      }
      break;

   default:
      unreachable("Can't accumulate OA counters in unknown format");
   }

}

#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT)

void
intel_perf_query_result_read_gt_frequency(struct intel_perf_query_result *result,
                                          const struct intel_device_info *devinfo,
                                          const uint32_t start,
                                          const uint32_t end)
{
   switch (devinfo->ver) {
   case 7:
   case 8:
      result->gt_frequency[0] = GET_FIELD(start, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
      result->gt_frequency[1] = GET_FIELD(end, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
      break;
   case 9:
   case 11:
   case 12:
      result->gt_frequency[0] = GET_FIELD(start, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
      result->gt_frequency[1] = GET_FIELD(end, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
      break;
   default:
      unreachable("unexpected gen");
   }

   /* Put the numbers into Hz. */
   result->gt_frequency[0] *= 1000000ULL;
   result->gt_frequency[1] *= 1000000ULL;
}
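
/* Accumulate the deltas of the two PERF_CNT register snapshots, accounting
 * for wraparound of the masked counter values.
 */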
void
intel_perf_query_result_read_perfcnts(struct intel_perf_query_result *result,
                                      const struct intel_perf_query_info *query,
                                      const uint64_t *start,
                                      const uint64_t *end)
{
   for (uint32_t i = 0; i < 2; i++) {
      uint64_t v0 = start[i] & PERF_CNT_VALUE_MASK;
      uint64_t v1 = end[i] & PERF_CNT_VALUE_MASK;

      result->accumulator[query->perfcnt_offset + i] = v0 > v1 ?
         (PERF_CNT_VALUE_MASK + 1 + v1 - v0) :
         (v1 - v0);
   }
}

static uint32_t
query_accumulator_offset(const struct intel_perf_query_info *query,
                         enum intel_perf_query_field_type type,
                         uint8_t index)
{
   switch (type) {
   case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
      return query->perfcnt_offset + index;
   case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
      return query->a_offset + index;
   case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
      return query->b_offset + index;
   case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
      return query->c_offset + index;
   default:
      unreachable("Invalid register type");
      return 0;
   }
}

void
intel_perf_query_result_accumulate_fields(struct intel_perf_query_result *result,
                                          const struct intel_perf_query_info *query,
                                          const void *start,
                                          const void *end,
                                          bool no_oa_accumulate)
{
   const struct intel_perf_query_field_layout *layout = &query->perf->query_layout;
   const struct intel_device_info *devinfo = &query->perf->devinfo;

   for (uint32_t r = 0; r < layout->n_fields; r++) {
      const struct intel_perf_query_field *field = &layout->fields[r];

      if (field->type == INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC) {
         intel_perf_query_result_read_frequencies(result, devinfo,
                                                  start + field->location,
                                                  end + field->location);
         /* no_oa_accumulate=true is used when doing GL perf queries; we
          * manually parse the OA reports from the OA buffer and subtract
          * unrelated deltas, so don't accumulate the begin/end reports here.
          */
         if (!no_oa_accumulate) {
            intel_perf_query_result_accumulate(result, query,
                                               start + field->location,
                                               end + field->location);
         }
      } else {
         uint64_t v0, v1;

         if (field->size == 4) {
            v0 = *(const uint32_t *)(start + field->location);
            v1 = *(const uint32_t *)(end + field->location);
         } else {
            assert(field->size == 8);
            v0 = *(const uint64_t *)(start + field->location);
            v1 = *(const uint64_t *)(end + field->location);
         }

         if (field->mask) {
            v0 = field->mask & v0;
            v1 = field->mask & v1;
         }

         /* RPSTAT is a bit of a special case because its begin/end values
          * represent frequencies. We store it in a separate location.
          */
         if (field->type == INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT)
            intel_perf_query_result_read_gt_frequency(result, devinfo, v0, v1);
         else
            result->accumulator[query_accumulator_offset(query, field->type, field->index)] = v1 - v0;
      }
   }
}

void
intel_perf_query_result_clear(struct intel_perf_query_result *result)
{
   memset(result, 0, sizeof(*result));
   result->hw_id = INTEL_PERF_INVALID_CTX_ID;
}

void
intel_perf_query_result_print_fields(const struct intel_perf_query_info *query,
                                     const void *data)
{
   const struct intel_perf_query_field_layout *layout = &query->perf->query_layout;

   for (uint32_t r = 0; r < layout->n_fields; r++) {
      const struct intel_perf_query_field *field = &layout->fields[r];
      const uint32_t *value32 = data + field->location;

      switch (field->type) {
      case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
         fprintf(stderr, "MI_RPC:\n");
         fprintf(stderr, "  TS: 0x%08x\n", *(value32 + 1));
         fprintf(stderr, "  CLK: 0x%08x\n", *(value32 + 3));
         break;
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
         fprintf(stderr, "A%u: 0x%08x\n", field->index, *value32);
         break;
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         fprintf(stderr, "B%u: 0x%08x\n", field->index, *value32);
         break;
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
         fprintf(stderr, "C%u: 0x%08x\n", field->index, *value32);
         break;
      default:
         break;
      }
   }
}

static int
intel_perf_compare_query_names(const void *v1, const void *v2)
{
   const struct intel_perf_query_info *q1 = v1;
   const struct intel_perf_query_info *q2 = v2;

   return strcmp(q1->name, q2->name);
}

static inline struct intel_perf_query_field *
add_query_register(struct intel_perf_query_field_layout *layout,
                   enum intel_perf_query_field_type type,
                   uint16_t offset,
                   uint16_t size,
                   uint8_t index)
{
   /* Align MI_RPC to 64bytes (HW requirement) & 64bit registers to 8bytes
    * (shows up nicely in the debugger).
    */
   if (type == INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC)
      layout->size = align(layout->size, 64);
   else if (size % 8 == 0)
      layout->size = align(layout->size, 8);

   layout->fields[layout->n_fields++] = (struct intel_perf_query_field) {
      .mmio_offset = offset,
      .location = layout->size,
      .type = type,
      .index = index,
      .size = size,
   };
   layout->size += size;

   return &layout->fields[layout->n_fields - 1];
}
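
/* Build the layout of the data captured around a query: an MI_RPC OA report
 * followed, depending on the generation, by snapshots of the PERF_CNT,
 * RPSTAT and OA B/C counter registers.
 */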
static void
intel_perf_init_query_fields(struct intel_perf_config *perf_cfg,
                             const struct intel_device_info *devinfo,
                             bool use_register_snapshots)
{
   struct intel_perf_query_field_layout *layout = &perf_cfg->query_layout;

   layout->n_fields = 0;

   /* MI_RPC requires a 64byte alignment. */
   layout->alignment = 64;

   layout->fields = rzalloc_array(perf_cfg, struct intel_perf_query_field, 5 + 16);

   add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC,
                      0, 256, 0);

   if (use_register_snapshots) {
      if (devinfo->ver <= 11) {
         struct intel_perf_query_field *field =
            add_query_register(layout,
                               INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
                               PERF_CNT_1_DW0, 8, 0);
         field->mask = PERF_CNT_VALUE_MASK;

         field = add_query_register(layout,
                                    INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
                                    PERF_CNT_2_DW0, 8, 1);
         field->mask = PERF_CNT_VALUE_MASK;
      }

      if (devinfo->ver == 8 && devinfo->platform != INTEL_PLATFORM_CHV) {
         add_query_register(layout,
                            INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,
                            GFX7_RPSTAT1, 4, 0);
      }

      if (devinfo->ver >= 9) {
         add_query_register(layout,
                            INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,
                            GFX9_RPSTAT0, 4, 0);
      }

      if (!can_use_mi_rpc_bc_counters(devinfo)) {
         if (devinfo->ver >= 8 && devinfo->ver <= 11) {
            for (uint32_t i = 0; i < GFX8_N_OA_PERF_B32; i++) {
               add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
                                  GFX8_OA_PERF_B32(i), 4, i);
            }
            for (uint32_t i = 0; i < GFX8_N_OA_PERF_C32; i++) {
               add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
                                  GFX8_OA_PERF_C32(i), 4, i);
            }
         } else if (devinfo->verx10 == 120) {
            for (uint32_t i = 0; i < GFX12_N_OAG_PERF_B32; i++) {
               add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
                                  GFX12_OAG_PERF_B32(i), 4, i);
            }
            for (uint32_t i = 0; i < GFX12_N_OAG_PERF_C32; i++) {
               add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
                                  GFX12_OAG_PERF_C32(i), 4, i);
            }
         } else if (devinfo->verx10 == 125) {
            add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A,
                               GFX125_OAG_PERF_A36, 4, 36);
            add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A,
                               GFX125_OAG_PERF_A37, 4, 37);
            for (uint32_t i = 0; i < GFX12_N_OAG_PERF_B32; i++) {
               add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
                                  GFX12_OAG_PERF_B32(i), 4, i);
            }
            for (uint32_t i = 0; i < GFX12_N_OAG_PERF_C32; i++) {
               add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
                                  GFX12_OAG_PERF_C32(i), 4, i);
            }
         }
      }
   }

   /* Align the whole package to 64bytes so that 2 snapshots can be put
    * together without extra alignment for the user.
    */
   layout->size = align(layout->size, 64);
}

void
intel_perf_init_metrics(struct intel_perf_config *perf_cfg,
                        const struct intel_device_info *devinfo,
                        int drm_fd,
                        bool include_pipeline_statistics,
                        bool use_register_snapshots)
{
   intel_perf_init_query_fields(perf_cfg, devinfo, use_register_snapshots);

   if (include_pipeline_statistics) {
      load_pipeline_statistic_metrics(perf_cfg, devinfo);
      intel_perf_register_mdapi_statistic_query(perf_cfg, devinfo);
   }

   bool oa_metrics = oa_metrics_available(perf_cfg, drm_fd, devinfo,
                                          use_register_snapshots);
   if (oa_metrics)
      load_oa_metrics(perf_cfg, drm_fd, devinfo);

   /* sort query groups by name */
   qsort(perf_cfg->queries, perf_cfg->n_queries,
         sizeof(perf_cfg->queries[0]), intel_perf_compare_query_names);

   build_unique_counter_list(perf_cfg);

   if (oa_metrics)
      intel_perf_register_mdapi_oa_query(perf_cfg, devinfo);
}