/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <unistd.h>
#include <poll.h>

#include "common/intel_gem.h"

#include "dev/intel_debug.h"
#include "dev/intel_device_info.h"

#include "perf/intel_perf.h"
#include "perf/intel_perf_mdapi.h"
#include "perf/intel_perf_private.h"
#include "perf/intel_perf_query.h"
#include "perf/intel_perf_regs.h"

#include "drm-uapi/i915_drm.h"

#include "util/compiler.h"
#include "util/u_math.h"

#define FILE_DEBUG_FLAG DEBUG_PERFMON

#define MI_RPC_BO_SIZE                (4096)
#define MI_FREQ_OFFSET_BYTES          (256)
#define MI_PERF_COUNTERS_OFFSET_BYTES (260)

#define ALIGN(x, y) (((x) + (y)-1) & ~((y)-1))

#define MAP_READ  (1 << 0)
#define MAP_WRITE (1 << 1)

/**
 * Periodic OA samples are read() into these buffer structures via the
 * i915 perf kernel interface and appended to the
 * perf_ctx->sample_buffers linked list. When we process the
 * results of an OA metrics query we need to consider all the periodic
 * samples between the Begin and End MI_REPORT_PERF_COUNT command
 * markers.
 *
 * 'Periodic' is a simplification as there are other automatic reports
 * written by the hardware that are also buffered here.
 *
 * Considering three queries, A, B and C:
 *
 *  Time ---->
 *                ________________A_________________
 *                |                                 |
 *                | ________B_________ _____C___________
 *                | |                 | |           |   |
 *
 * And an illustration of sample buffers read over this time frame:
 * [HEAD ][     ][     ][     ][     ][     ][     ][     ][TAIL ]
 *
 * These nodes may hold samples for query A:
 * [     ][     ][  A  ][  A  ][  A  ][  A  ][  A  ][     ][     ]
 *
 * These nodes may hold samples for query B:
 * [     ][     ][  B  ][  B  ][  B  ][     ][     ][     ][     ]
 *
 * These nodes may hold samples for query C:
 * [     ][     ][     ][     ][     ][  C  ][  C  ][  C  ][     ]
 *
 * The illustration assumes we have an even distribution of periodic
 * samples so all nodes have the same size plotted against time.
 *
 * Note, to simplify code, the list is never empty.
 *
 * With overlapping queries we can see that periodic OA reports may
 * relate to multiple queries and care needs to be taken to keep
 * track of sample buffers until there are no queries that might
 * depend on their contents.
 *
 * We use a node ref counting system where a reference ensures that a
 * node and all following nodes can't be freed/recycled until the
 * reference drops to zero.
 *
 * E.g. with a ref of one here:
 * [  0  ][  0  ][  1  ][  0  ][  0  ][  0  ][  0  ][  0  ][  0  ]
 *
 * These nodes could be freed or recycled ("reaped"):
 * [  0  ][  0  ]
 *
 * These must be preserved until the leading ref drops to zero:
 *               [  1  ][  0  ][  0  ][  0  ][  0  ][  0  ][  0  ]
 *
 * When a query starts we take a reference on the current tail of
 * the list, knowing that no already-buffered samples can possibly
 * relate to the newly-started query. A pointer to this node is
 * also saved in the query object's ->oa.samples_head.
 *
 * E.g. starting query A while there are two nodes in .sample_buffers:
 *                ________________A________
 *                |
 *
 * [  0  ][  1  ]
 *           ^_______ Add a reference and store pointer to node in
 *                    A->oa.samples_head
 *
 * Moving forward to when the B query starts with no new buffer nodes:
 * (for reference, i915 perf reads() are only done when queries finish)
 *                ________________A_______
 *                | ________B___
 *                | |
 *
 * [  0  ][  2  ]
 *           ^_______ Add a reference and store pointer to
 *                    node in B->oa.samples_head
 *
 * Once a query is finished, after an OA query has become 'Ready',
 * once the End OA report has landed and after we have processed
 * all the intermediate periodic samples then we drop the
 * ->oa.samples_head reference we took at the start.
 *
 * So when the B query has finished we have:
 *                ________________A________
 *                | ______B___________
 *                | |                 |
 * [  0  ][  1  ][  0  ][  0  ][  0  ]
 *           ^_______ Drop B->oa.samples_head reference
 *
 * We still can't free these due to the A->oa.samples_head ref:
 *        [  1  ][  0  ][  0  ][  0  ]
 *
 * When the A query finishes: (note there's a new ref for C's samples_head)
 *                ________________A_________________
 *                |                                 |
 *                |                    _____C_________
 *                |                    |             |
 * [  0  ][  0  ][  0  ][  0  ][  1  ][  0  ][  0  ]
 *                                 ^_______ Drop A->oa.samples_head reference
 *
 * And we can now reap these nodes up to the C->oa.samples_head:
 * [  X  ][  X  ][  X  ][  X  ]
 *                  keeping -> [  1  ][  0  ][  0  ]
 *
 * We reap old sample buffers each time we finish processing an OA
 * query by iterating the sample_buffers list from the head until we
 * find a referenced node and stop.
 *
 * Reaped buffers move to a perf_ctx->free_sample_buffers list and
 * when we come to read() we first look to recycle a buffer from the
 * free_sample_buffers list before allocating a new buffer.
 */
struct oa_sample_buf {
   struct exec_node link;
   int refcount;
   int len;
   uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
   uint32_t last_timestamp;
};

/**
 * gen representation of a performance query object.
 *
 * NB: We want to keep this structure relatively lean considering that
 * applications may expect to allocate enough objects to be able to
 * query around all draw calls in a frame.
 */
struct intel_perf_query_object
{
   const struct intel_perf_query_info *queryinfo;

   /* See query->kind to know which state below is in use... */
   union {
      struct {

         /**
          * BO containing OA counter snapshots at query Begin/End time.
          */
         void *bo;

         /**
          * Address of the mapped @bo.
          */
         void *map;

         /**
          * The MI_REPORT_PERF_COUNT command lets us specify a unique
          * ID that will be reflected in the resulting OA report
          * that's written by the GPU. This is the ID we're expecting
          * in the begin report, and the end report should be
          * @begin_report_id + 1.
          */
         int begin_report_id;

         /**
          * Reference the head of the perf_ctx->sample_buffers
          * list at the time that the query started (so we only need
          * to look at nodes after this point when looking for samples
          * related to this query)
          *
          * (See struct oa_sample_buf description for more details)
          */
         struct exec_node *samples_head;

         /**
          * false while in the unaccumulated_elements list, and set to
          * true when the final, end MI_RPC snapshot has been
          * accumulated.
          */
         bool results_accumulated;

         /**
          * Accumulated OA results between begin and end of the query.
          */
         struct intel_perf_query_result result;
      } oa;

      struct {
         /**
          * BO containing starting and ending snapshots for the
          * statistics counters.
          */
         void *bo;
      } pipeline_stats;
   };
};

struct intel_perf_context {
   struct intel_perf_config *perf;

   void * mem_ctx; /* ralloc context */
   void * ctx;  /* driver context (eg, brw_context) */
   void * bufmgr;
   const struct intel_device_info *devinfo;

   uint32_t hw_ctx;
   int drm_fd;

   /* The i915 perf stream we open to setup + enable the OA counters */
   int oa_stream_fd;

   /* An i915 perf stream fd gives exclusive access to the OA unit that will
    * report counter snapshots for a specific counter set/profile in a
    * specific layout/format so we can only start OA queries that are
    * compatible with the currently open fd...
    */
   int current_oa_metrics_set_id;
   int current_oa_format;

   /* List of buffers containing OA reports */
   struct exec_list sample_buffers;

   /* Cached list of empty sample buffers */
   struct exec_list free_sample_buffers;

   int n_active_oa_queries;
   int n_active_pipeline_stats_queries;

   /* The number of queries depending on running OA counters which
    * extends beyond intel_perf_end_query() since we need to wait until
    * the last MI_RPC command has been parsed by the GPU.
    *
    * Accurate accounting is important here as emitting an
    * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
    * effectively hang the GPU.
    */
   int n_oa_users;

   /* To help catch a spurious problem with the hardware or perf
    * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
    * with a unique ID that we can explicitly check for...
    */
   int next_query_start_report_id;

   /**
    * An array of queries whose results haven't yet been assembled
    * based on the data in buffer objects.
    *
    * These may be active, or have already ended.  However, the
    * results have not been requested.
    */
   struct intel_perf_query_object **unaccumulated;
   int unaccumulated_elements;
   int unaccumulated_array_size;

   /* The total number of query objects so we can relinquish
    * our exclusive access to perf if the application deletes
    * all of its objects. (NB: We only disable perf while
    * there are no active queries)
    */
   int n_query_instances;

   int period_exponent;
};

static bool
inc_n_users(struct intel_perf_context *perf_ctx)
{
   if (perf_ctx->n_oa_users == 0 &&
       intel_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0)
   {
      return false;
   }
   ++perf_ctx->n_oa_users;

   return true;
}

static void
dec_n_users(struct intel_perf_context *perf_ctx)
{
   /* Disabling the i915 perf stream will effectively disable the OA
    * counters.  Note it's important to be sure there are no outstanding
    * MI_RPC commands at this point since they could stall the CS
    * indefinitely once OACONTROL is disabled.
    */
   --perf_ctx->n_oa_users;
   if (perf_ctx->n_oa_users == 0 &&
       intel_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
   {
      DBG("WARNING: Error disabling gen perf stream: %m\n");
   }
}

void
intel_perf_close(struct intel_perf_context *perfquery,
                 const struct intel_perf_query_info *query)
{
   if (perfquery->oa_stream_fd != -1) {
      close(perfquery->oa_stream_fd);
      perfquery->oa_stream_fd = -1;
   }
   if (query && query->kind == INTEL_PERF_QUERY_TYPE_RAW) {
      struct intel_perf_query_info *raw_query =
         (struct intel_perf_query_info *) query;
      raw_query->oa_metrics_set_id = 0;
   }
}

bool
intel_perf_open(struct intel_perf_context *perf_ctx,
                int metrics_set_id,
                int report_format,
                int period_exponent,
                int drm_fd,
                uint32_t ctx_id,
                bool enable)
{
   uint64_t properties[DRM_I915_PERF_PROP_MAX * 2];
   uint32_t p = 0;

   /* Single context sampling if valid context id. */
   if (ctx_id != INTEL_PERF_INVALID_CTX_ID) {
      properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE;
      properties[p++] = ctx_id;
   }

   /* Include OA reports in samples */
   properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA;
   properties[p++] = true;

   /* OA unit configuration */
   properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET;
   properties[p++] = metrics_set_id;

   properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT;
   properties[p++] = report_format;

   properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT;
   properties[p++] = period_exponent;

   /* If global SSEU is available, pin it to the default. This will ensure on
    * Gfx11 for instance we use the full EU array. Initially when perf was
    * enabled we would use only half on Gfx11 because of functional
    * requirements.
    *
    * Temporarily disable this option on Gfx12.5+; the kernel doesn't appear
    * to support it.
    */
   if (intel_perf_has_global_sseu(perf_ctx->perf) &&
       perf_ctx->devinfo->verx10 < 125) {
      properties[p++] = DRM_I915_PERF_PROP_GLOBAL_SSEU;
      properties[p++] = to_user_pointer(&perf_ctx->perf->sseu);
   }

   assert(p <= ARRAY_SIZE(properties));

   struct drm_i915_perf_open_param param = {
      .flags = I915_PERF_FLAG_FD_CLOEXEC |
               I915_PERF_FLAG_FD_NONBLOCK |
               (enable ? 0 : I915_PERF_FLAG_DISABLED),
      .num_properties = p / 2,
      .properties_ptr = (uintptr_t) properties,
   };
   int fd = intel_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
   if (fd == -1) {
      DBG("Error opening gen perf OA stream: %m\n");
      return false;
   }

   perf_ctx->oa_stream_fd = fd;

   perf_ctx->current_oa_metrics_set_id = metrics_set_id;
   perf_ctx->current_oa_format = report_format;

   if (enable)
      ++perf_ctx->n_oa_users;

   return true;
}

static uint64_t
get_metric_id(struct intel_perf_config *perf,
              const struct intel_perf_query_info *query)
{
   /* These queries are known not to ever change; their config ID has been
    * loaded upon the first query creation. No need to look them up again.
    */
   if (query->kind == INTEL_PERF_QUERY_TYPE_OA)
      return query->oa_metrics_set_id;

   assert(query->kind == INTEL_PERF_QUERY_TYPE_RAW);

   /* Raw queries can be reprogrammed by an external application/library.
    * When a raw query is used for the first time its ID is set to a value !=
    * 0. When it stops being used the ID returns to 0. No need to reload the
    * ID when it's already loaded.
    */
   if (query->oa_metrics_set_id != 0) {
      DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n",
          query->name, query->guid, query->oa_metrics_set_id);
      return query->oa_metrics_set_id;
   }

   struct intel_perf_query_info *raw_query = (struct intel_perf_query_info *)query;
   if (!intel_perf_load_metric_id(perf, query->guid,
                                  &raw_query->oa_metrics_set_id)) {
      DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
      raw_query->oa_metrics_set_id = perf->fallback_raw_oa_metric;
   } else {
      DBG("Raw query '%s' guid=%s loaded ID: %"PRIu64"\n",
          query->name, query->guid, query->oa_metrics_set_id);
   }
   return query->oa_metrics_set_id;
}

static struct oa_sample_buf *
get_free_sample_buf(struct intel_perf_context *perf_ctx)
{
   struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers);
   struct oa_sample_buf *buf;

   if (node)
      buf = exec_node_data(struct oa_sample_buf, node, link);
   else {
      buf = ralloc_size(perf_ctx->perf, sizeof(*buf));

      exec_node_init(&buf->link);
      buf->refcount = 0;
   }
   buf->len = 0;

   return buf;
}

static void
reap_old_sample_buffers(struct intel_perf_context *perf_ctx)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&perf_ctx->sample_buffers);
   struct oa_sample_buf *tail_buf =
      exec_node_data(struct oa_sample_buf, tail_node, link);

   /* Remove all old, unreferenced sample buffers walking forward from
    * the head of the list, except always leave at least one node in
    * the list so we always have a node to reference when we Begin
    * a new query.
    */
   foreach_list_typed_safe(struct oa_sample_buf, buf, link,
                           &perf_ctx->sample_buffers)
   {
      if (buf->refcount == 0 && buf != tail_buf) {
         exec_node_remove(&buf->link);
         exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link);
      } else
         return;
   }
}

static void
free_sample_bufs(struct intel_perf_context *perf_ctx)
{
   foreach_list_typed_safe(struct oa_sample_buf, buf, link,
                           &perf_ctx->free_sample_buffers)
      ralloc_free(buf);

   exec_list_make_empty(&perf_ctx->free_sample_buffers);
}


struct intel_perf_query_object *
intel_perf_new_query(struct intel_perf_context *perf_ctx, unsigned query_index)
{
   const struct intel_perf_query_info *query =
      &perf_ctx->perf->queries[query_index];

   switch (query->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW:
      if (perf_ctx->period_exponent == 0)
         return NULL;
      break;
   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      break;
   }

   struct intel_perf_query_object *obj =
      calloc(1, sizeof(struct intel_perf_query_object));

   if (!obj)
      return NULL;

   obj->queryinfo = query;

   perf_ctx->n_query_instances++;
   return obj;
}

int
intel_perf_active_queries(struct intel_perf_context *perf_ctx,
                          const struct intel_perf_query_info *query)
{
   assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0);

   switch (query->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW:
      return perf_ctx->n_active_oa_queries;
      break;

   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      return perf_ctx->n_active_pipeline_stats_queries;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }
}

const struct intel_perf_query_info*
intel_perf_query_info(const struct intel_perf_query_object *query)
{
   return query->queryinfo;
}

struct intel_perf_context *
intel_perf_new_context(void *parent)
{
   struct intel_perf_context *ctx = rzalloc(parent, struct intel_perf_context);
   if (!ctx)
      fprintf(stderr, "%s: failed to alloc context\n", __func__);
   return ctx;
}

struct intel_perf_config *
intel_perf_config(struct intel_perf_context *ctx)
{
   return ctx->perf;
}

void
intel_perf_init_context(struct intel_perf_context *perf_ctx,
                        struct intel_perf_config *perf_cfg,
                        void * mem_ctx, /* ralloc context */
                        void * ctx,  /* driver context (eg, brw_context) */
                        void * bufmgr,  /* eg brw_bufmgr */
                        const struct intel_device_info *devinfo,
                        uint32_t hw_ctx,
                        int drm_fd)
{
   perf_ctx->perf = perf_cfg;
   perf_ctx->mem_ctx = mem_ctx;
   perf_ctx->ctx = ctx;
   perf_ctx->bufmgr = bufmgr;
   perf_ctx->drm_fd = drm_fd;
   perf_ctx->hw_ctx = hw_ctx;
   perf_ctx->devinfo = devinfo;

   perf_ctx->unaccumulated =
      ralloc_array(mem_ctx, struct intel_perf_query_object *, 2);
   perf_ctx->unaccumulated_elements = 0;
   perf_ctx->unaccumulated_array_size = 2;

   exec_list_make_empty(&perf_ctx->sample_buffers);
   exec_list_make_empty(&perf_ctx->free_sample_buffers);

   /* It's convenient to guarantee that this linked list of sample
    * buffers is never empty, so we add an empty head; then when we
    * Begin an OA query we can always take a reference on a buffer
    * in this list.
    */
   struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
   exec_list_push_head(&perf_ctx->sample_buffers, &buf->link);

   perf_ctx->oa_stream_fd = -1;
   perf_ctx->next_query_start_report_id = 1000;

   /* The period_exponent gives a sampling period as follows:
    *   sample_period = timestamp_period * 2^(period_exponent + 1)
    *
    * The timestamp increments every 80ns (HSW), ~52ns (GFX9LP) or
    * ~83ns (GFX8/9).
    *
    * The counter overflow period is derived from the EuActive counter
    * which reads a counter that increments by the number of clock
    * cycles multiplied by the number of EUs. It can be calculated as:
    *
    * 2^(number of bits in A counter) / (n_eus * max_intel_freq * 2)
    *
    * (E.g. 40 EUs @ 1GHz = ~53ms)
    *
    * We select a sampling period shorter than that overflow period to
    * ensure we cannot see more than 1 counter overflow, otherwise we
    * could lose information.
    */

   int a_counter_in_bits = 32;
   if (devinfo->ver >= 8)
      a_counter_in_bits = 40;

   uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus *
                                                           /* drop 1GHz freq to have units in nanoseconds */
                                                           2);

   DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
       overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus);

   int period_exponent = 0;
   uint64_t prev_sample_period, next_sample_period;
   for (int e = 0; e < 30; e++) {
      prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
      next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;

      /* Take the previous sampling period, lower than the overflow
       * period.
       */
      if (prev_sample_period < overflow_period &&
          next_sample_period > overflow_period)
         period_exponent = e + 1;
   }

   perf_ctx->period_exponent = period_exponent;

   if (period_exponent == 0) {
      DBG("WARNING: unable to find a sampling exponent\n");
   } else {
      DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
          prev_sample_period / 1000000ul);
   }
}
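
/* Illustrative only (not driver code): a worked instance of the overflow
 * computation above, assuming a hypothetical device with a 40-bit A
 * counter, 64 EUs and a 12.5MHz (80ns) timestamp. The EU count and
 * timestamp frequency here are made-up example values.
 *
 *    overflow_period   = 2^40 / (64 * 2)   = ~8.6e9 ns   (~8.6 s)
 *    timestamp_period  = 1e9 / 12500000    = 80 ns
 *    sample_period(E)  = 80ns * 2^(E + 1)
 *       E = 25  ->  80ns * 2^26 = ~5.4 s   (below the overflow period)
 *       E = 26  ->  80ns * 2^27 = ~10.7 s  (above it)
 *
 * so sampling exponents up to 25 keep the period under the A-counter
 * overflow bound that the loop above compares against.
 */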

/**
 * Add a query to the global list of "unaccumulated queries."
 *
 * Queries are tracked here until all the associated OA reports have
 * been accumulated via accumulate_oa_reports() after the end
 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
 */
static void
add_to_unaccumulated_query_list(struct intel_perf_context *perf_ctx,
                                struct intel_perf_query_object *obj)
{
   if (perf_ctx->unaccumulated_elements >=
       perf_ctx->unaccumulated_array_size)
   {
      perf_ctx->unaccumulated_array_size *= 1.5;
      perf_ctx->unaccumulated =
         reralloc(perf_ctx->mem_ctx, perf_ctx->unaccumulated,
                  struct intel_perf_query_object *,
                  perf_ctx->unaccumulated_array_size);
   }

   perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj;
}

/**
 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
 * pipeline statistics for the performance query object.
 */
static void
snapshot_statistics_registers(struct intel_perf_context *ctx,
                              struct intel_perf_query_object *obj,
                              uint32_t offset_in_bytes)
{
   struct intel_perf_config *perf = ctx->perf;
   const struct intel_perf_query_info *query = obj->queryinfo;
   const int n_counters = query->n_counters;

   for (int i = 0; i < n_counters; i++) {
      const struct intel_perf_query_counter *counter = &query->counters[i];

      assert(counter->data_type == INTEL_PERF_COUNTER_DATA_TYPE_UINT64);

      perf->vtbl.store_register_mem(ctx->ctx, obj->pipeline_stats.bo,
                                    counter->pipeline_stat.reg, 8,
                                    offset_in_bytes + counter->offset);
   }
}

static void
snapshot_query_layout(struct intel_perf_context *perf_ctx,
                      struct intel_perf_query_object *query,
                      bool end_snapshot)
{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;
   const struct intel_perf_query_field_layout *layout = &perf_cfg->query_layout;
   uint32_t offset = end_snapshot ? align(layout->size, layout->alignment) : 0;

   for (uint32_t f = 0; f < layout->n_fields; f++) {
      const struct intel_perf_query_field *field =
         &layout->fields[end_snapshot ? f : (layout->n_fields - 1 - f)];

      switch (field->type) {
      case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
         perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo,
                                                  offset + field->location,
                                                  query->oa.begin_report_id +
                                                  (end_snapshot ? 1 : 0));
         break;
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
      case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
         perf_cfg->vtbl.store_register_mem(perf_ctx->ctx, query->oa.bo,
                                           field->mmio_offset, field->size,
                                           offset + field->location);
         break;
      default:
         unreachable("Invalid field type");
      }
   }
}

bool
intel_perf_begin_query(struct intel_perf_context *perf_ctx,
                       struct intel_perf_query_object *query)
{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;
   const struct intel_perf_query_info *queryinfo = query->queryinfo;

   /* XXX: We have to consider that the command parser unit that parses batch
    * buffer commands and is used to capture begin/end counter snapshots isn't
    * implicitly synchronized with what's currently running across other GPU
    * units (such as the EUs running shaders) that the performance counters are
    * associated with.
    *
    * The intention of performance queries is to measure the work associated
    * with commands between the begin/end delimiters and so for that to be the
    * case we need to explicitly synchronize the parsing of commands to capture
    * Begin/End counter snapshots with what's running across other parts of the
    * GPU.
    *
    * When the command parser reaches a Begin marker it effectively needs to
    * drain everything currently running on the GPU until the hardware is idle
    * before capturing the first snapshot of counters - otherwise the results
    * would also be measuring the effects of earlier commands.
    *
    * When the command parser reaches an End marker it needs to stall until
    * everything currently running on the GPU has finished before capturing the
    * end snapshot - otherwise the results won't be a complete representation
    * of the work.
    *
    * To achieve this, we stall the pipeline at pixel scoreboard (prevent any
    * additional work from being processed by the pipeline until all pixels of
    * the previous draw have completed).
    *
    * N.B. The final results are based on deltas of counters between (inside)
    * Begin/End markers so even though the total wall clock time of the
    * workload is stretched by larger pipeline bubbles the bubbles themselves
    * are generally invisible to the query results. Whether that's a good or a
    * bad thing depends on the use case. For a lower real-time impact while
    * capturing metrics, periodic sampling may be a better choice than
    * INTEL_performance_query.
    *
    *
    * This is our Begin synchronization point to drain current work on the
    * GPU before we capture our first counter snapshot...
    */
   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);

   switch (queryinfo->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW: {

      /* Opening an i915 perf stream implies exclusive access to the OA unit
       * which will generate counter reports for a specific counter set with a
       * specific layout/format so we can't begin any OA based queries that
       * require a different counter set or format unless we get an opportunity
       * to close the stream and open a new one...
       */
      uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo);

      if (perf_ctx->oa_stream_fd != -1 &&
          perf_ctx->current_oa_metrics_set_id != metric_id) {

         if (perf_ctx->n_oa_users != 0) {
            DBG("WARNING: Begin failed already using perf config=%i/%"PRIu64"\n",
                perf_ctx->current_oa_metrics_set_id, metric_id);
            return false;
         } else
            intel_perf_close(perf_ctx, queryinfo);
      }

      /* If the OA counters aren't already on, enable them. */
      if (perf_ctx->oa_stream_fd == -1) {
         assert(perf_ctx->period_exponent != 0);

         if (!intel_perf_open(perf_ctx, metric_id, queryinfo->oa_format,
                              perf_ctx->period_exponent, perf_ctx->drm_fd,
                              perf_ctx->hw_ctx, false))
            return false;
      } else {
         assert(perf_ctx->current_oa_metrics_set_id == metric_id &&
                perf_ctx->current_oa_format == queryinfo->oa_format);
      }

      if (!inc_n_users(perf_ctx)) {
         DBG("WARNING: Error enabling i915 perf stream: %m\n");
         return false;
      }

      if (query->oa.bo) {
         perf_cfg->vtbl.bo_unreference(query->oa.bo);
         query->oa.bo = NULL;
      }

      query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
                                             "perf. query OA MI_RPC bo",
                                             MI_RPC_BO_SIZE);
#ifdef DEBUG
      /* Pre-filling the BO helps debug whether writes landed. */
      void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE);
      memset(map, 0x80, MI_RPC_BO_SIZE);
      perf_cfg->vtbl.bo_unmap(query->oa.bo);
#endif

      query->oa.begin_report_id = perf_ctx->next_query_start_report_id;
      perf_ctx->next_query_start_report_id += 2;

      snapshot_query_layout(perf_ctx, query, false /* end_snapshot */);

      ++perf_ctx->n_active_oa_queries;

      /* No already-buffered samples can possibly be associated with this query
       * so create a marker within the list of sample buffers enabling us to
       * easily ignore earlier samples when processing this query after
       * completion.
       */
      assert(!exec_list_is_empty(&perf_ctx->sample_buffers));
      query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers);

      struct oa_sample_buf *buf =
         exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);

      /* This reference will ensure that future/following sample
       * buffers (that may relate to this query) can't be freed until
       * this drops to zero.
       */
      buf->refcount++;

      intel_perf_query_result_clear(&query->oa.result);
      query->oa.results_accumulated = false;

      add_to_unaccumulated_query_list(perf_ctx, query);
      break;
   }

   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      if (query->pipeline_stats.bo) {
         perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
         query->pipeline_stats.bo = NULL;
      }

      query->pipeline_stats.bo =
         perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
                                 "perf. query pipeline stats bo",
                                 STATS_BO_SIZE);

      /* Take starting snapshots. */
      snapshot_statistics_registers(perf_ctx, query, 0);

      ++perf_ctx->n_active_pipeline_stats_queries;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   return true;
}

void
intel_perf_end_query(struct intel_perf_context *perf_ctx,
                     struct intel_perf_query_object *query)
{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;

   /* Ensure that the work associated with the queried commands will have
    * finished before taking our query end counter readings.
    *
    * For more details see the comment in intel_perf_begin_query for the
    * corresponding flush.
    */
   perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);

   switch (query->queryinfo->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW:

      /* NB: It's possible that the query will have already been marked
       * as 'accumulated' if an error was seen while reading samples
       * from perf.
       * In this case we mustn't try to emit a closing MI_RPC command
       * since the OA unit may already have been disabled.
       */
      if (!query->oa.results_accumulated)
         snapshot_query_layout(perf_ctx, query, true /* end_snapshot */);

      --perf_ctx->n_active_oa_queries;

      /* NB: even though the query has now ended, it can't be accumulated
       * until the end MI_REPORT_PERF_COUNT snapshot has been written
       * to query->oa.bo
       */
      break;

   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      snapshot_statistics_registers(perf_ctx, query,
                                    STATS_BO_END_OFFSET_BYTES);
      --perf_ctx->n_active_pipeline_stats_queries;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }
}

bool intel_perf_oa_stream_ready(struct intel_perf_context *perf_ctx)
{
   struct pollfd pfd;

   pfd.fd = perf_ctx->oa_stream_fd;
   pfd.events = POLLIN;
   pfd.revents = 0;

   if (poll(&pfd, 1, 0) < 0) {
      DBG("Error polling OA stream\n");
      return false;
   }

   if (!(pfd.revents & POLLIN))
      return false;

   return true;
}

ssize_t
intel_perf_read_oa_stream(struct intel_perf_context *perf_ctx,
                          void* buf,
                          size_t nbytes)
{
   return read(perf_ctx->oa_stream_fd, buf, nbytes);
}

enum OaReadStatus {
   OA_READ_STATUS_ERROR,
   OA_READ_STATUS_UNFINISHED,
   OA_READ_STATUS_FINISHED,
};

static enum OaReadStatus
read_oa_samples_until(struct intel_perf_context *perf_ctx,
                      uint32_t start_timestamp,
                      uint32_t end_timestamp)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&perf_ctx->sample_buffers);
   struct oa_sample_buf *tail_buf =
      exec_node_data(struct oa_sample_buf, tail_node, link);
   uint32_t last_timestamp =
      tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp;

   while (1) {
      struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
      uint32_t offset;
      int len;

      while ((len = read(perf_ctx->oa_stream_fd, buf->buf,
                         sizeof(buf->buf))) < 0 && errno == EINTR)
         ;

      if (len <= 0) {
         exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);

         if (len == 0) {
            DBG("Spurious EOF reading i915 perf samples\n");
            return OA_READ_STATUS_ERROR;
         }

         if (errno != EAGAIN) {
            DBG("Error reading i915 perf samples: %m\n");
            return OA_READ_STATUS_ERROR;
         }

         if ((last_timestamp - start_timestamp) >= INT32_MAX)
            return OA_READ_STATUS_UNFINISHED;

         if ((last_timestamp - start_timestamp) <
             (end_timestamp - start_timestamp))
            return OA_READ_STATUS_UNFINISHED;

         return OA_READ_STATUS_FINISHED;
      }

      buf->len = len;
      exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link);

      /* Go through the reports and update the last timestamp. */
      offset = 0;
      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *) &buf->buf[offset];
         uint32_t *report = (uint32_t *) (header + 1);

         if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
            last_timestamp = report[1];

         offset += header->size;
      }

      buf->last_timestamp = last_timestamp;
   }

   unreachable("not reached");
   return OA_READ_STATUS_ERROR;
}
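
/* For reference (matching the parsing loop above): each read() from the
 * i915 perf fd returns a sequence of variable-size records, each led by a
 * struct drm_i915_perf_record_header giving its type and total size.
 * SAMPLE records carry a raw OA report as payload, and DWORD 1 of that
 * report is the 32-bit OA timestamp tracked here (for the report formats
 * this code requests). Illustrative walk over a buffer already filled by
 * read(), not driver code:
 *
 *    for (uint32_t off = 0; off < (uint32_t)len; off += hdr->size) {
 *       hdr = (const struct drm_i915_perf_record_header *)(data + off);
 *       if (hdr->type == DRM_I915_PERF_RECORD_SAMPLE)
 *          process((const uint32_t *)(hdr + 1));   // raw OA report
 *    }
 */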

/**
 * Try to read all the reports until either the delimiting timestamp
 * or an error arises.
 */
static bool
read_oa_samples_for_query(struct intel_perf_context *perf_ctx,
                          struct intel_perf_query_object *query,
                          void *current_batch)
{
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;
   struct intel_perf_config *perf_cfg = perf_ctx->perf;

   /* We need the MI_REPORT_PERF_COUNT to land before we can start
    * accumulating. */
   assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
          !perf_cfg->vtbl.bo_busy(query->oa.bo));

   /* Map the BO once here and let accumulate_oa_reports() unmap
    * it. */
   if (query->oa.map == NULL)
      query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ);

   start = last = query->oa.map;
   end = query->oa.map + perf_ctx->perf->query_layout.size;

   if (start[0] != query->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      return true;
   }
   if (end[0] != (query->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      return true;
   }

   /* Read the reports until the end timestamp. */
   switch (read_oa_samples_until(perf_ctx, start[1], end[1])) {
   case OA_READ_STATUS_ERROR:
      FALLTHROUGH; /* Let accumulate_oa_reports() deal with the error. */
   case OA_READ_STATUS_FINISHED:
      return true;
   case OA_READ_STATUS_UNFINISHED:
      return false;
   }

   unreachable("invalid read status");
   return false;
}

void
intel_perf_wait_query(struct intel_perf_context *perf_ctx,
                      struct intel_perf_query_object *query,
                      void *current_batch)
{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;
   struct brw_bo *bo = NULL;

   switch (query->queryinfo->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW:
      bo = query->oa.bo;
      break;

   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      bo = query->pipeline_stats.bo;
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   if (bo == NULL)
      return;

   /* If the current batch references our results bo then we need to
    * flush first...
    */
   if (perf_cfg->vtbl.batch_references(current_batch, bo))
      perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);

   perf_cfg->vtbl.bo_wait_rendering(bo);
}

bool
intel_perf_is_query_ready(struct intel_perf_context *perf_ctx,
                          struct intel_perf_query_object *query,
                          void *current_batch)
{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;

   switch (query->queryinfo->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW:
      return (query->oa.results_accumulated ||
              (query->oa.bo &&
               !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
               !perf_cfg->vtbl.bo_busy(query->oa.bo)));

   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      return (query->pipeline_stats.bo &&
              !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) &&
              !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo));

   default:
      unreachable("Unknown query type");
      break;
   }

   return false;
}

/**
 * Remove a query from the global list of unaccumulated queries once
 * the OA reports associated with the query have been successfully
 * accumulated in accumulate_oa_reports() or when discarding unwanted
 * query results.
 */
static void
drop_from_unaccumulated_query_list(struct intel_perf_context *perf_ctx,
                                   struct intel_perf_query_object *query)
{
   for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) {
      if (perf_ctx->unaccumulated[i] == query) {
         int last_elt = --perf_ctx->unaccumulated_elements;

         if (i == last_elt)
            perf_ctx->unaccumulated[i] = NULL;
         else {
            perf_ctx->unaccumulated[i] =
               perf_ctx->unaccumulated[last_elt];
         }

         break;
      }
   }

   /* Drop our samples_head reference so that associated periodic
    * sample data buffers can potentially be reaped if they aren't
    * referenced by any other queries...
    */

   struct oa_sample_buf *buf =
      exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);

   assert(buf->refcount > 0);
   buf->refcount--;

   query->oa.samples_head = NULL;

   reap_old_sample_buffers(perf_ctx);
}

/* In general if we see anything spurious while accumulating results,
 * we don't try to continue accumulating the current query hoping for
 * the best; we scrap anything outstanding and then hope for the best
 * with new queries.
 */
static void
discard_all_queries(struct intel_perf_context *perf_ctx)
{
   while (perf_ctx->unaccumulated_elements) {
      struct intel_perf_query_object *query = perf_ctx->unaccumulated[0];

      query->oa.results_accumulated = true;
      drop_from_unaccumulated_query_list(perf_ctx, query);

      dec_n_users(perf_ctx);
   }
}

/* Checks the validity bit of the context ID of an OA report (the context
 * ID itself is in dword 2).
 */
static bool
oa_report_ctx_id_valid(const struct intel_device_info *devinfo,
                       const uint32_t *report)
{
   assert(devinfo->ver >= 8);
   if (devinfo->ver == 8)
      return (report[0] & (1 << 25)) != 0;
   return (report[0] & (1 << 16)) != 0;
}
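
/* A note on the timestamp comparisons in accumulate_oa_reports() below
 * (illustrative; the 5000000000ns threshold comes from that function): the
 * OA timestamps are 32-bit and free-running, so ordering is decided with
 * unsigned subtraction rather than direct comparison. For example, with a
 * start timestamp of 0xffffff00 and a later report at 0x00000100, the
 * delta report[1] - start[1] wraps to 0x200 (512 ticks), which correctly
 * reads as "shortly after start", while a report truly before start yields
 * a huge delta that fails the scaled-to-nanoseconds threshold check.
 */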

/**
 * Accumulate raw OA counter values based on deltas between pairs of
 * OA reports.
 *
 * Accumulation starts from the first report captured via
 * MI_REPORT_PERF_COUNT (MI_RPC) by intel_perf_begin_query() until the
 * last MI_RPC report requested by intel_perf_end_query(). Between these
 * two reports there may also be some number of periodically sampled OA
 * reports collected via the i915 perf interface - depending on the
 * duration of the query.
 *
 * These periodic snapshots help to ensure we handle counter overflow
 * correctly by being frequent enough to ensure we don't miss multiple
 * overflows of a counter between snapshots. For Gfx8+ the i915 perf
 * snapshots provide the extra context-switch reports that let us
 * subtract out the progress of counters associated with other
 * contexts running on the system.
 */
static void
accumulate_oa_reports(struct intel_perf_context *perf_ctx,
                      struct intel_perf_query_object *query)
{
   const struct intel_device_info *devinfo = perf_ctx->devinfo;
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;
   struct exec_node *first_samples_node;
   bool last_report_ctx_match = true;
   int out_duration = 0;

   assert(query->oa.map != NULL);

   start = last = query->oa.map;
   end = query->oa.map + perf_ctx->perf->query_layout.size;

   if (start[0] != query->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      goto error;
   }
   if (end[0] != (query->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      goto error;
   }

   /* On Gfx12+ OA reports are sourced from per-context counters, so we don't
    * ever have to look at the global OA buffer. Yey \o/
    */
   if (perf_ctx->devinfo->ver >= 12) {
      last = start;
      goto end;
   }

   /* See if we have any periodic reports to accumulate too... */

   /* N.B. The oa.samples_head was set when the query began and
    * pointed to the tail of the perf_ctx->sample_buffers list at
    * the time the query started. Since the buffer existed before the
    * first MI_REPORT_PERF_COUNT command was emitted we therefore know
    * that no data in this particular node's buffer can possibly be
    * associated with the query - so skip ahead one...
    */
   first_samples_node = query->oa.samples_head->next;

   foreach_list_typed_from(struct oa_sample_buf, buf, link,
                           &perf_ctx->sample_buffers,
                           first_samples_node)
   {
      int offset = 0;

      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *)(buf->buf + offset);

         assert(header->size != 0);
         assert(header->size <= buf->len);

         offset += header->size;

         switch (header->type) {
         case DRM_I915_PERF_RECORD_SAMPLE: {
            uint32_t *report = (uint32_t *)(header + 1);
            bool report_ctx_match = true;
            bool add = true;

            /* Ignore reports that come before the start marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (intel_device_info_timebase_scale(devinfo,
                                                 report[1] - start[1]) > 5000000000) {
               continue;
            }

            /* Ignore reports that come after the end marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (intel_device_info_timebase_scale(devinfo,
                                                 report[1] - end[1]) <= 5000000000) {
               goto end;
            }

            /* For Gfx8+ since the counters continue while other
             * contexts are running we need to discount any unrelated
             * deltas. The hardware automatically generates a report
             * on context switch which gives us a new reference point
             * to continue adding deltas from.
             *
             * For Haswell we can rely on the HW to stop the progress
             * of OA counters while any other context is active.
             */
            if (devinfo->ver >= 8) {
               /* Consider that the current report matches our context only if
                * the report says the report ID is valid.
                */
               report_ctx_match = oa_report_ctx_id_valid(devinfo, report) &&
                                  report[2] == start[2];
               if (report_ctx_match)
                  out_duration = 0;
               else
                  out_duration++;

               /* Only add the delta between <last, report> if the last report
                * was clearly identified as our context, or if we have at most
                * 1 report without a matching ID.
                *
                * The OA unit will sometimes label reports with an invalid
                * context ID when i915 rewrites the execlist submit register
                * with the same context as the one currently running. This
                * happens when i915 wants to notify the HW of a ringbuffer
                * tail register update. We have to consider this report as
                * part of our context as the 3d pipeline behind the OACS unit
                * is still processing the operations started at the previous
                * execlist submission.
                */
               add = last_report_ctx_match && out_duration < 2;
            }

            if (add) {
               intel_perf_query_result_accumulate(&query->oa.result,
                                                  query->queryinfo,
                                                  last, report);
            } else {
               /* We're not adding the delta because we've identified that
                * it's not for the context we filter for. We can consider
                * that the query was split.
                */
               query->oa.result.query_disjoint = true;
            }

            last = report;
            last_report_ctx_match = report_ctx_match;

            break;
         }

         case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
            DBG("i915 perf: OA error: all reports lost\n");
            goto error;
         case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
            DBG("i915 perf: OA report lost\n");
            break;
         }
      }
   }

end:

   intel_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
                                      last, end);

   query->oa.results_accumulated = true;
   drop_from_unaccumulated_query_list(perf_ctx, query);
   dec_n_users(perf_ctx);

   return;

error:

   discard_all_queries(perf_ctx);
}

void
intel_perf_delete_query(struct intel_perf_context *perf_ctx,
                        struct intel_perf_query_object *query)
{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;

   /* We can assume that the frontend waits for a query to complete
    * before ever calling into here, so we don't have to worry about
    * deleting an in-flight query object.
    */
   switch (query->queryinfo->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW:
      if (query->oa.bo) {
         if (!query->oa.results_accumulated) {
            drop_from_unaccumulated_query_list(perf_ctx, query);
            dec_n_users(perf_ctx);
         }

         perf_cfg->vtbl.bo_unreference(query->oa.bo);
         query->oa.bo = NULL;
      }

      query->oa.results_accumulated = false;
      break;

   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      if (query->pipeline_stats.bo) {
         perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
         query->pipeline_stats.bo = NULL;
      }
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   /* As an indication that the INTEL_performance_query extension is no
    * longer in use, it's a good time to free our cache of sample
    * buffers and close any current i915-perf stream.
    */
   if (--perf_ctx->n_query_instances == 0) {
      free_sample_bufs(perf_ctx);
      intel_perf_close(perf_ctx, query->queryinfo);
   }

   free(query);
}

static int
get_oa_counter_data(struct intel_perf_context *perf_ctx,
                    struct intel_perf_query_object *query,
                    size_t data_size,
                    uint8_t *data)
{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;
   const struct intel_perf_query_info *queryinfo = query->queryinfo;
   int n_counters = queryinfo->n_counters;
   int written = 0;

   for (int i = 0; i < n_counters; i++) {
      const struct intel_perf_query_counter *counter = &queryinfo->counters[i];
      uint64_t *out_uint64;
      float *out_float;
      size_t counter_size = intel_perf_query_counter_get_size(counter);

      if (counter_size) {
         switch (counter->data_type) {
         case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
            out_uint64 = (uint64_t *)(data + counter->offset);
            *out_uint64 =
               counter->oa_counter_read_uint64(perf_cfg, queryinfo,
                                               &query->oa.result);
            break;
         case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
            out_float = (float *)(data + counter->offset);
            *out_float =
               counter->oa_counter_read_float(perf_cfg, queryinfo,
                                              &query->oa.result);
            break;
         default:
            /* So far we aren't using uint32, double or bool32... */
            unreachable("unexpected counter data type");
         }

         if (counter->offset + counter_size > written)
            written = counter->offset + counter_size;
      }
   }

   return written;
}

static int
get_pipeline_stats_data(struct intel_perf_context *perf_ctx,
                        struct intel_perf_query_object *query,
                        size_t data_size,
                        uint8_t *data)

{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;
   const struct intel_perf_query_info *queryinfo = query->queryinfo;
   int n_counters = queryinfo->n_counters;
   uint8_t *p = data;

   uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ);
   uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));

   for (int i = 0; i < n_counters; i++) {
      const struct intel_perf_query_counter *counter = &queryinfo->counters[i];
      uint64_t value = end[i] - start[i];

      if (counter->pipeline_stat.numerator !=
          counter->pipeline_stat.denominator) {
         value *= counter->pipeline_stat.numerator;
         value /= counter->pipeline_stat.denominator;
      }

      *((uint64_t *)p) = value;
      p += 8;
   }

   perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo);

   return p - data;
}

void
intel_perf_get_query_data(struct intel_perf_context *perf_ctx,
                          struct intel_perf_query_object *query,
                          void *current_batch,
                          int data_size,
                          unsigned *data,
                          unsigned *bytes_written)
{
   struct intel_perf_config *perf_cfg = perf_ctx->perf;
   int written = 0;

   switch (query->queryinfo->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW:
      if (!query->oa.results_accumulated) {
         /* Due to the sampling frequency of the OA buffer by the i915-perf
          * driver, there can be a 5ms delay between Mesa seeing the query
          * complete and i915 making all the OA buffer reports available to
          * us. We need to wait for all the reports to come in before we can
          * do the post processing removing unrelated deltas.
          * There is an i915-perf series to address this issue, but it has
          * not been merged upstream yet.
          */
         while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
            ;

         uint32_t *begin_report = query->oa.map;
         uint32_t *end_report = query->oa.map + perf_cfg->query_layout.size;
         intel_perf_query_result_accumulate_fields(&query->oa.result,
                                                   query->queryinfo,
                                                   begin_report,
                                                   end_report,
                                                   true /* no_oa_accumulate */);
         accumulate_oa_reports(perf_ctx, query);
         assert(query->oa.results_accumulated);

         perf_cfg->vtbl.bo_unmap(query->oa.bo);
         query->oa.map = NULL;
      }
      if (query->queryinfo->kind == INTEL_PERF_QUERY_TYPE_OA) {
         written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data);
      } else {
         const struct intel_device_info *devinfo = perf_ctx->devinfo;

         written = intel_perf_query_result_write_mdapi((uint8_t *)data, data_size,
                                                       devinfo, query->queryinfo,
                                                       &query->oa.result);
      }
      break;

   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data);
      break;

   default:
      unreachable("Unknown query type");
      break;
   }

   if (bytes_written)
      *bytes_written = written;
}

void
intel_perf_dump_query_count(struct intel_perf_context *perf_ctx)
{
   DBG("Queries: (Open queries = %d, OA users = %d)\n",
       perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users);
}

void
intel_perf_dump_query(struct intel_perf_context *ctx,
                      struct intel_perf_query_object *obj,
                      void *current_batch)
{
   switch (obj->queryinfo->kind) {
   case INTEL_PERF_QUERY_TYPE_OA:
   case INTEL_PERF_QUERY_TYPE_RAW:
      DBG("BO: %-4s OA data: %-10s %-15s\n",
          obj->oa.bo ? "yes," : "no,",
          intel_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,",
          obj->oa.results_accumulated ? "accumulated" : "not accumulated");
      break;
   case INTEL_PERF_QUERY_TYPE_PIPELINE:
      DBG("BO: %-4s\n",
          obj->pipeline_stats.bo ? "yes" : "no");
      break;
   default:
      unreachable("Unknown query type");
      break;
   }
}
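
/* Illustrative only: the entry points above are expected to be driven by a
 * frontend (e.g. a GL driver implementing INTEL_performance_query) roughly
 * in this order; 'batch' stands for whatever batch handle the frontend's
 * vtbl callbacks understand, and error handling is omitted.
 *
 *    struct intel_perf_query_object *q =
 *       intel_perf_new_query(perf_ctx, query_index);
 *    intel_perf_begin_query(perf_ctx, q);
 *    ... emit the GPU work to be measured ...
 *    intel_perf_end_query(perf_ctx, q);
 *
 *    intel_perf_wait_query(perf_ctx, q, batch);   // or poll
 *                                                 // intel_perf_is_query_ready()
 *    intel_perf_get_query_data(perf_ctx, q, batch,
 *                              data_size, data, &bytes_written);
 *    intel_perf_delete_query(perf_ctx, q);
 */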