/*
 * Performance events - AMD IBS
 *
 * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
#include <linux/syscore_ops.h>
#include <linux/sched/clock.h>

#include <asm/apic.h>

#include "../perf_event.h"

static u32 ibs_caps;

#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)

#include <linux/kprobes.h>
#include <linux/hardirq.h>

#include <asm/nmi.h>

#define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT


/*
 * IBS states:
 *
 * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken
 * and any further add()s must fail.
 *
 * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
 * complicated by the fact that the IBS hardware can send late NMIs (ie. after
 * we've cleared the EN bit).
 *
 * In order to consume these late NMIs we have the STOPPED state, any NMI that
 * happens after we've cleared the EN state will clear this bit and report the
 * NMI handled (this is fundamentally racy in the face of multiple NMI sources,
 * someone else can consume our BIT and our NMI will go unhandled).
 *
 * And since we cannot set/clear this separate bit together with the EN bit,
 * there are races; if we cleared STARTED early, an NMI could land in
 * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
 * could happen if the period is small enough), and consume our STOPPED bit
 * and trigger streams of unhandled NMIs.
 *
 * If, however, we clear STARTED late, an NMI can hit between clearing the
 * EN bit and clearing STARTED, still see STARTED set and process the event.
 * If this event will have the VALID bit clear, we bail properly, but this
 * is not a given. With VALID set we can end up calling pmu::stop() again
 * (the throttle logic) and trigger the WARNs in there.
 *
 * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
 * nesting, and clear STARTED late, so that we have a well defined state over
 * the clearing of the EN bit.
 *
 * XXX: we could probably be using !atomic bitops for all this.
 */
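
/*
 * In short, the resulting ordering is (summary of perf_ibs_start() and
 * perf_ibs_stop() below):
 *
 *   start: set_bit(STARTED); clear_bit(STOPPING); set the EN bit
 *   stop:  set_bit(STOPPING); set_bit(STOPPED); clear the EN bit;
 *          clear_bit(STARTED)
 *
 * so an NMI that observes EN clear either still sees STARTED and handles the
 * event, or consumes STOPPED and reports the late NMI as handled.
 */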

enum ibs_states {
	IBS_ENABLED	= 0,
	IBS_STARTED	= 1,
	IBS_STOPPING	= 2,
	IBS_STOPPED	= 3,

	IBS_MAX_STATES,
};

struct cpu_perf_ibs {
	struct perf_event	*event;
	unsigned long		state[BITS_TO_LONGS(IBS_MAX_STATES)];
};

struct perf_ibs {
	struct pmu			pmu;
	unsigned int			msr;
	u64				config_mask;
	u64				cnt_mask;
	u64				enable_mask;
	u64				valid_mask;
	u64				max_period;
	unsigned long			offset_mask[1];
	int				offset_max;
	unsigned int			fetch_count_reset_broken : 1;
	unsigned int			fetch_ignore_if_zero_rip : 1;
	struct cpu_perf_ibs __percpu	*pcpu;

	struct attribute		**format_attrs;
	struct attribute_group		format_group;
	const struct attribute_group	*attr_groups[2];

	u64				(*get_count)(u64 config);
};

struct perf_ibs_data {
	u32	size;
	union {
		u32	data[0];	/* data buffer starts here */
		u32	caps;
	};
	u64	regs[MSR_AMD64_IBS_REG_COUNT_MAX];
};

static int
perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
{
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int overflow = 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		overflow = 1;
	}

	if (unlikely(left < (s64)min)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		overflow = 1;
	}

	/*
	 * If the hw period that triggers the sw overflow is too short
	 * we might hit the irq handler. This biases the results.
	 * Thus we shorten the next-to-last period and set the last
	 * period to the max period.
	 */
	if (left > max) {
		left -= max;
		if (left > max)
			left = max;
		else if (left < min)
			left = min;
	}

	*hw_period = (u64)left;

	return overflow;
}
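
/*
 * A worked example of the clamping above, using the fetch PMU limits purely
 * for illustration: with min = 0x10 and max = 0xFFFF0, a period_left of
 * 0x100010 programs 0x20 as the next-to-last hardware period and leaves the
 * full 0xFFFF0 for the last one.
 */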

static int
perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - width;
	u64 prev_raw_count;
	u64 delta;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
	prev_raw_count = local64_read(&hwc->prev_count);
	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count)
		return 0;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return 1;
}

static struct perf_ibs perf_ibs_fetch;
static struct perf_ibs perf_ibs_op;

static struct perf_ibs *get_ibs_pmu(int type)
{
	if (perf_ibs_fetch.pmu.type == type)
		return &perf_ibs_fetch;
	if (perf_ibs_op.pmu.type == type)
		return &perf_ibs_op;
	return NULL;
}

/*
 * core pmu config -> IBS config
 *
 *  perf record -a -e cpu-cycles:p ...	# use ibs op counting cycle count
 *  perf record -a -e r076:p ...	# same as -e cpu-cycles:p
 *  perf record -a -e r0C1:p ...	# use ibs op counting micro-ops
 *
 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
 * MSRC001_1033) is used to select either cycle or micro-ops counting
 * mode.
 */
static int core_pmu_ibs_config(struct perf_event *event, u64 *config)
{
	switch (event->attr.type) {
	case PERF_TYPE_HARDWARE:
		switch (event->attr.config) {
		case PERF_COUNT_HW_CPU_CYCLES:
			*config = 0;
			return 0;
		}
		break;
	case PERF_TYPE_RAW:
		switch (event->attr.config) {
		case 0x0076:
			*config = 0;
			return 0;
		case 0x00C1:
			*config = IBS_OP_CNT_CTL;
			return 0;
		}
		break;
	default:
		return -ENOENT;
	}

	return -EOPNOTSUPP;
}

/*
 * The rip of IBS samples has skid 0. Thus, IBS supports precise
 * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
 * rip is invalid when IBS was not able to record the rip correctly.
 * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
 */
int forward_event_to_ibs(struct perf_event *event)
{
	u64 config = 0;

	if (!event->attr.precise_ip || event->attr.precise_ip > 2)
		return -EOPNOTSUPP;

	if (!core_pmu_ibs_config(event, &config)) {
		event->attr.type = perf_ibs_op.pmu.type;
		event->attr.config = config;
	}
	return -ENOENT;
}

static int perf_ibs_init(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	struct perf_ibs *perf_ibs;
	u64 max_cnt, config;

	perf_ibs = get_ibs_pmu(event->attr.type);
	if (!perf_ibs)
		return -ENOENT;

	config = event->attr.config;

	if (event->pmu != &perf_ibs->pmu)
		return -ENOENT;

	if (config & ~perf_ibs->config_mask)
		return -EINVAL;

	if (hwc->sample_period) {
		if (config & perf_ibs->cnt_mask)
			/* raw max_cnt may not be set */
			return -EINVAL;
		if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
			/*
			 * lower 4 bits can not be set in ibs max cnt,
			 * but allowing it in case we adjust the
			 * sample period to set a frequency.
			 */
			return -EINVAL;
		hwc->sample_period &= ~0x0FULL;
		if (!hwc->sample_period)
			hwc->sample_period = 0x10;
	} else {
		max_cnt = config & perf_ibs->cnt_mask;
		config &= ~perf_ibs->cnt_mask;
		event->attr.sample_period = max_cnt << 4;
		hwc->sample_period = event->attr.sample_period;
	}

	if (!hwc->sample_period)
		return -EINVAL;

	/*
	 * If we modify hwc->sample_period, we also need to update
	 * hwc->last_period and hwc->period_left.
	 */
	hwc->last_period = hwc->sample_period;
	local64_set(&hwc->period_left, hwc->sample_period);

	hwc->config_base = perf_ibs->msr;
	hwc->config = config;

	/*
	 * rip recorded by IbsOpRip will not be consistent with rsp and rbp
	 * recorded as part of interrupt regs. Thus we need to use rip from
	 * interrupt regs while unwinding call stack. Setting _EARLY flag
	 * makes sure we unwind call-stack before perf sample rip is set to
	 * IbsOpRip.
	 */
	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
		event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;

	return 0;
}
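
/*
 * For illustration: the hardware counts in units of 16, so a raw MaxCnt of
 * 0x123 corresponds to a sample_period of 0x1230, and a fixed sample_period
 * must have its low 4 bits clear (e.g. 0x1234 is rejected above). When the
 * period is derived from a frequency it is instead rounded down to a multiple
 * of 0x10, with 0x10 as the minimum.
 */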

static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
			       struct hw_perf_event *hwc, u64 *period)
{
	int overflow;

	/* ignore lower 4 bits in min count: */
	overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
	local64_set(&hwc->prev_count, 0);

	return overflow;
}

static u64 get_ibs_fetch_count(u64 config)
{
	return (config & IBS_FETCH_CNT) >> 12;
}

static u64 get_ibs_op_count(u64 config)
{
	u64 count = 0;

	/*
	 * If the internal 27-bit counter rolled over, the count is MaxCnt
	 * and the lower 7 bits of CurCnt are randomized.
	 * Otherwise CurCnt has the full 27-bit current counter value.
	 */
	if (config & IBS_OP_VAL) {
		count = (config & IBS_OP_MAX_CNT) << 4;
		if (ibs_caps & IBS_CAPS_OPCNTEXT)
			count += config & IBS_OP_MAX_CNT_EXT_MASK;
	} else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
		count = (config & IBS_OP_CUR_CNT) >> 32;
	}

	return count;
}

static void
perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
		      u64 *config)
{
	u64 count = perf_ibs->get_count(*config);

	/*
	 * Set width to 64 since we do not overflow on max width but
	 * instead on max count. In perf_ibs_set_period() we clear
	 * prev count manually on overflow.
	 */
	while (!perf_event_try_update(event, count, 64)) {
		rdmsrl(event->hw.config_base, *config);
		count = perf_ibs->get_count(*config);
	}
}

static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
					 struct hw_perf_event *hwc, u64 config)
{
	u64 tmp = hwc->config | config;

	if (perf_ibs->fetch_count_reset_broken)
		wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask);

	wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask);
}

/*
 * Erratum #420 Instruction-Based Sampling Engine May Generate
 * Interrupt that Cannot Be Cleared:
 *
 * Must clear counter mask first, then clear the enable bit. See
 * Revision Guide for AMD Family 10h Processors, Publication #41322.
 */
static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
					  struct hw_perf_event *hwc, u64 config)
{
	config &= ~perf_ibs->cnt_mask;
	if (boot_cpu_data.x86 == 0x10)
		wrmsrl(hwc->config_base, config);
	config &= ~perf_ibs->enable_mask;
	wrmsrl(hwc->config_base, config);
}

/*
 * We cannot restore the ibs pmu state, so we always need to update
 * the event while stopping it and then reset the state when starting
 * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags in
 * perf_ibs_start()/perf_ibs_stop() and instead always do it.
 */
static void perf_ibs_start(struct perf_event *event, int flags)
{
	struct hw_perf_event *hwc = &event->hw;
	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
	u64 period, config = 0;

	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
		return;

	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
	hwc->state = 0;

	perf_ibs_set_period(perf_ibs, hwc, &period);
	if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
		config |= period & IBS_OP_MAX_CNT_EXT_MASK;
		period &= ~IBS_OP_MAX_CNT_EXT_MASK;
	}
	config |= period >> 4;

	/*
	 * Set STARTED before enabling the hardware, such that a subsequent NMI
	 * must observe it.
	 */
	set_bit(IBS_STARTED, pcpu->state);
	clear_bit(IBS_STOPPING, pcpu->state);
	perf_ibs_enable_event(perf_ibs, hwc, config);

	perf_event_update_userpage(event);
}

static void perf_ibs_stop(struct perf_event *event, int flags)
{
	struct hw_perf_event *hwc = &event->hw;
	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
	u64 config;
	int stopping;

	if (test_and_set_bit(IBS_STOPPING, pcpu->state))
		return;

	stopping = test_bit(IBS_STARTED, pcpu->state);

	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
		return;

	rdmsrl(hwc->config_base, config);

	if (stopping) {
		/*
		 * Set STOPPED before disabling the hardware, such that it
		 * must be visible to NMIs the moment we clear the EN bit,
		 * at which point we can generate an !VALID sample which
		 * we need to consume.
		 */
		set_bit(IBS_STOPPED, pcpu->state);
		perf_ibs_disable_event(perf_ibs, hwc, config);
		/*
		 * Clear STARTED after disabling the hardware; if it were
		 * cleared before, an NMI hitting after the clear but before
		 * clearing the EN bit might think it a spurious NMI and not
		 * handle it.
		 *
		 * Clearing it after, however, creates the problem of the NMI
		 * handler seeing STARTED but not having a valid sample.
		 */
		clear_bit(IBS_STARTED, pcpu->state);
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if (hwc->state & PERF_HES_UPTODATE)
		return;

	/*
	 * Clear valid bit to not count rollovers on update; rollovers
	 * are only updated in the irq handler.
	 */
	config &= ~perf_ibs->valid_mask;

	perf_ibs_event_update(perf_ibs, event, &config);
	hwc->state |= PERF_HES_UPTODATE;
}

static int perf_ibs_add(struct perf_event *event, int flags)
{
	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);

	if (test_and_set_bit(IBS_ENABLED, pcpu->state))
		return -ENOSPC;

	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	pcpu->event = event;

	if (flags & PERF_EF_START)
		perf_ibs_start(event, PERF_EF_RELOAD);

	return 0;
}

static void perf_ibs_del(struct perf_event *event, int flags)
{
	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);

	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
		return;

	perf_ibs_stop(event, PERF_EF_UPDATE);

	pcpu->event = NULL;

	perf_event_update_userpage(event);
}

static void perf_ibs_read(struct perf_event *event) { }

PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");

static struct attribute *ibs_fetch_format_attrs[] = {
	&format_attr_rand_en.attr,
	NULL,
};

static struct attribute *ibs_op_format_attrs[] = {
	NULL,	/* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
	NULL,
};

static struct perf_ibs perf_ibs_fetch = {
	.pmu = {
		.task_ctx_nr	= perf_invalid_context,

		.event_init	= perf_ibs_init,
		.add		= perf_ibs_add,
		.del		= perf_ibs_del,
		.start		= perf_ibs_start,
		.stop		= perf_ibs_stop,
		.read		= perf_ibs_read,
		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
	},
	.msr			= MSR_AMD64_IBSFETCHCTL,
	.config_mask		= IBS_FETCH_CONFIG_MASK,
	.cnt_mask		= IBS_FETCH_MAX_CNT,
	.enable_mask		= IBS_FETCH_ENABLE,
	.valid_mask		= IBS_FETCH_VAL,
	.max_period		= IBS_FETCH_MAX_CNT << 4,
	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,
	.format_attrs		= ibs_fetch_format_attrs,

	.get_count		= get_ibs_fetch_count,
};

static struct perf_ibs perf_ibs_op = {
	.pmu = {
		.task_ctx_nr	= perf_invalid_context,

		.event_init	= perf_ibs_init,
		.add		= perf_ibs_add,
		.del		= perf_ibs_del,
		.start		= perf_ibs_start,
		.stop		= perf_ibs_stop,
		.read		= perf_ibs_read,
		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
	},
	.msr			= MSR_AMD64_IBSOPCTL,
	.config_mask		= IBS_OP_CONFIG_MASK,
	.cnt_mask		= IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
				  IBS_OP_CUR_CNT_RAND,
	.enable_mask		= IBS_OP_ENABLE,
	.valid_mask		= IBS_OP_VAL,
	.max_period		= IBS_OP_MAX_CNT << 4,
	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,
	.format_attrs		= ibs_op_format_attrs,

	.get_count		= get_ibs_op_count,
};
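
/*
 * Rough layout of the sample buffer filled by the handler below, for
 * orientation: ibs_data.regs[0] holds the control MSR (IbsFetchCtl/IbsOpCtl)
 * and, when the valid bit is set, the remaining IBS MSRs are read at
 * increasing offsets from it, so regs[1] is IbsFetchLinAd/IbsOpRip. With
 * PERF_SAMPLE_RAW, the raw data handed to perf is the u32 ibs_caps word
 * followed by those u64 MSR values.
 */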

static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
	struct perf_event *event = pcpu->event;
	struct hw_perf_event *hwc;
	struct perf_sample_data data;
	struct perf_raw_record raw;
	struct pt_regs regs;
	struct perf_ibs_data ibs_data;
	int offset, size, check_rip, offset_max, throttle = 0;
	unsigned int msr;
	u64 *buf, *config, period, new_config = 0;

	if (!test_bit(IBS_STARTED, pcpu->state)) {
fail:
		/*
		 * Catch spurious interrupts after stopping IBS: After
		 * disabling IBS there could still be incoming NMIs
		 * with samples that even have the valid bit cleared.
		 * Mark all these NMIs as handled.
		 */
		if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
			return 1;

		return 0;
	}

	if (WARN_ON_ONCE(!event))
		goto fail;

	hwc = &event->hw;
	msr = hwc->config_base;
	buf = ibs_data.regs;
	rdmsrl(msr, *buf);
	if (!(*buf++ & perf_ibs->valid_mask))
		goto fail;

	config = &ibs_data.regs[0];
	perf_ibs_event_update(perf_ibs, event, config);
	perf_sample_data_init(&data, 0, hwc->last_period);
	if (!perf_ibs_set_period(perf_ibs, hwc, &period))
		goto out;	/* no sw counter overflow */

	ibs_data.caps = ibs_caps;
	size = 1;
	offset = 1;
	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
	if (event->attr.sample_type & PERF_SAMPLE_RAW)
		offset_max = perf_ibs->offset_max;
	else if (check_rip)
		offset_max = 3;
	else
		offset_max = 1;
	do {
		rdmsrl(msr + offset, *buf++);
		size++;
		offset = find_next_bit(perf_ibs->offset_mask,
				       perf_ibs->offset_max,
				       offset + 1);
	} while (offset < offset_max);
	/*
	 * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
	 * depending on their availability.
	 * Can't add to offset_max as they are staggered.
	 */
	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
		if (perf_ibs == &perf_ibs_op) {
			if (ibs_caps & IBS_CAPS_BRNTRGT) {
				rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
				size++;
			}
			if (ibs_caps & IBS_CAPS_OPDATA4) {
				rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
				size++;
			}
		}
		if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
			rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++);
			size++;
		}
	}
	ibs_data.size = sizeof(u64) * size;

	regs = *iregs;
	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
		regs.flags &= ~PERF_EFLAGS_EXACT;
	} else {
		/* Workaround for erratum #1197 */
		if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
			goto out;

		set_linear_ip(&regs, ibs_data.regs[1]);
		regs.flags |= PERF_EFLAGS_EXACT;
	}

	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
		raw = (struct perf_raw_record){
			.frag = {
				.size = sizeof(u32) + ibs_data.size,
				.data = ibs_data.data,
			},
		};
		data.raw = &raw;
	}

	/*
	 * rip recorded by IbsOpRip will not be consistent with rsp and rbp
	 * recorded as part of interrupt regs. Thus we need to use rip from
	 * interrupt regs while unwinding call stack.
	 */
	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
		data.callchain = perf_callchain(event, iregs);

	throttle = perf_event_overflow(event, &data, &regs);
out:
	if (throttle) {
		perf_ibs_stop(event, 0);
	} else {
		if (perf_ibs == &perf_ibs_op) {
			if (ibs_caps & IBS_CAPS_OPCNTEXT) {
				new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
				period &= ~IBS_OP_MAX_CNT_EXT_MASK;
			}
			if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL))
				new_config |= *config & IBS_OP_CUR_CNT_RAND;
		}
		new_config |= period >> 4;

		perf_ibs_enable_event(perf_ibs, hwc, new_config);
	}

	perf_event_update_userpage(event);

	return 1;
}
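
/*
 * Fetch and op sampling are delivered through the same IBS NMI, so the
 * handler below polls both PMUs on every NMI. The elapsed time is reported
 * via perf_sample_event_took() so the core can lower the maximum sample
 * rate if the handler gets too expensive.
 */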

static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	u64 stamp = sched_clock();
	int handled = 0;

	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
	handled += perf_ibs_handle_irq(&perf_ibs_op, regs);

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	perf_sample_event_took(sched_clock() - stamp);

	return handled;
}
NOKPROBE_SYMBOL(perf_ibs_nmi_handler);

static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
	struct cpu_perf_ibs __percpu *pcpu;
	int ret;

	pcpu = alloc_percpu(struct cpu_perf_ibs);
	if (!pcpu)
		return -ENOMEM;

	perf_ibs->pcpu = pcpu;

	/* register attributes */
	if (perf_ibs->format_attrs[0]) {
		memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
		perf_ibs->format_group.name	= "format";
		perf_ibs->format_group.attrs	= perf_ibs->format_attrs;

		memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
		perf_ibs->attr_groups[0]	= &perf_ibs->format_group;
		perf_ibs->pmu.attr_groups	= perf_ibs->attr_groups;
	}

	ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
	if (ret) {
		perf_ibs->pcpu = NULL;
		free_percpu(pcpu);
	}

	return ret;
}

static __init int perf_event_ibs_init(void)
{
	struct attribute **attr = ibs_op_format_attrs;
	int ret;

	/*
	 * Some chips fail to reset the fetch count when it is written; instead
	 * they need a 0-1 transition of IbsFetchEn.
	 */
	if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
		perf_ibs_fetch.fetch_count_reset_broken = 1;

	if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
		perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;

	ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
	if (ret)
		return ret;

	if (ibs_caps & IBS_CAPS_OPCNT) {
		perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
		*attr++ = &format_attr_cnt_ctl.attr;
	}

	if (ibs_caps & IBS_CAPS_OPCNTEXT) {
		perf_ibs_op.max_period  |= IBS_OP_MAX_CNT_EXT_MASK;
		perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK;
		perf_ibs_op.cnt_mask    |= IBS_OP_MAX_CNT_EXT_MASK;
	}

	ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
	if (ret)
		goto err_op;

	ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
	if (ret)
		goto err_nmi;

	pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
	return 0;

err_nmi:
	perf_pmu_unregister(&perf_ibs_op.pmu);
	free_percpu(perf_ibs_op.pcpu);
	perf_ibs_op.pcpu = NULL;
err_op:
	perf_pmu_unregister(&perf_ibs_fetch.pmu);
	free_percpu(perf_ibs_fetch.pcpu);
	perf_ibs_fetch.pcpu = NULL;

	return ret;
}

#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */

static __init int perf_event_ibs_init(void)
{
	return 0;
}

#endif

/* IBS - apic initialization, for perf and oprofile */

static __init u32 __get_ibs_caps(void)
{
	u32 caps;
	unsigned int max_level;

	if (!boot_cpu_has(X86_FEATURE_IBS))
		return 0;

	/* check IBS cpuid feature flags */
	max_level = cpuid_eax(0x80000000);
	if (max_level < IBS_CPUID_FEATURES)
		return IBS_CAPS_DEFAULT;

	caps = cpuid_eax(IBS_CPUID_FEATURES);
	if (!(caps & IBS_CAPS_AVAIL))
		/* cpuid flags not valid */
		return IBS_CAPS_DEFAULT;

	return caps;
}

u32 get_ibs_caps(void)
{
	return ibs_caps;
}

EXPORT_SYMBOL(get_ibs_caps);

static inline int get_eilvt(int offset)
{
	return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
}

static inline int put_eilvt(int offset)
{
	return !setup_APIC_eilvt(offset, 0, 0, 1);
}
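
/*
 * get_eilvt()/put_eilvt() above effectively reserve and release an extended
 * APIC LVT entry for the IBS interrupt by programming it as a masked NMI
 * entry or resetting it to a masked dummy entry; setup_APIC_eilvt() returns
 * 0 on success, hence the negation.
 */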

/*
 * Check and reserve APIC extended interrupt LVT offset for IBS if available.
 */
static inline int ibs_eilvt_valid(void)
{
	int offset;
	u64 val;
	int valid = 0;

	preempt_disable();

	rdmsrl(MSR_AMD64_IBSCTL, val);
	offset = val & IBSCTL_LVT_OFFSET_MASK;

	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
		goto out;
	}

	if (!get_eilvt(offset)) {
		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
		goto out;
	}

	valid = 1;
out:
	preempt_enable();

	return valid;
}

static int setup_ibs_ctl(int ibs_eilvt_off)
{
	struct pci_dev *cpu_cfg;
	int nodes;
	u32 value = 0;

	nodes = 0;
	cpu_cfg = NULL;
	do {
		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
					 cpu_cfg);
		if (!cpu_cfg)
			break;
		++nodes;
		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
				       | IBSCTL_LVT_OFFSET_VALID);
		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
			pci_dev_put(cpu_cfg);
			pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
				 value);
			return -EINVAL;
		}
	} while (1);

	if (!nodes) {
		pr_debug("No CPU node configured for IBS\n");
		return -ENODEV;
	}

	return 0;
}

/*
 * This runs only on the current cpu. We try to find an LVT offset and
 * setup the local APIC. For this we must disable preemption. On
 * success we initialize all nodes with this offset. This then updates
 * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
 * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
 * is using the new offset.
 */
static void force_ibs_eilvt_setup(void)
{
	int offset;
	int ret;

	preempt_disable();
	/* find the next free available EILVT entry, skip offset 0 */
	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
		if (get_eilvt(offset))
			break;
	}
	preempt_enable();

	if (offset == APIC_EILVT_NR_MAX) {
		pr_debug("No EILVT entry available\n");
		return;
	}

	ret = setup_ibs_ctl(offset);
	if (ret)
		goto out;

	if (!ibs_eilvt_valid())
		goto out;

	pr_info("LVT offset %d assigned\n", offset);

	return;
out:
	preempt_disable();
	put_eilvt(offset);
	preempt_enable();
	return;
}

static void ibs_eilvt_setup(void)
{
	/*
	 * Force LVT offset assignment for family 10h: The offsets are
	 * not assigned by the BIOS for this family, so the OS is
	 * responsible for doing it. If the OS assignment fails, fall
	 * back to the BIOS settings and try to set it up.
	 */
	if (boot_cpu_data.x86 == 0x10)
		force_ibs_eilvt_setup();
}

static inline int get_ibs_lvt_offset(void)
{
	u64 val;

	rdmsrl(MSR_AMD64_IBSCTL, val);
	if (!(val & IBSCTL_LVT_OFFSET_VALID))
		return -EINVAL;

	return val & IBSCTL_LVT_OFFSET_MASK;
}

static void setup_APIC_ibs(void)
{
	int offset;

	offset = get_ibs_lvt_offset();
	if (offset < 0)
		goto failed;

	if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
		return;
failed:
	pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
		smp_processor_id());
}

static void clear_APIC_ibs(void)
{
	int offset;

	offset = get_ibs_lvt_offset();
	if (offset >= 0)
		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}

static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
{
	setup_APIC_ibs();
	return 0;
}

#ifdef CONFIG_PM

static int perf_ibs_suspend(void)
{
	clear_APIC_ibs();
	return 0;
}

static void perf_ibs_resume(void)
{
	ibs_eilvt_setup();
	setup_APIC_ibs();
}

static struct syscore_ops perf_ibs_syscore_ops = {
	.resume		= perf_ibs_resume,
	.suspend	= perf_ibs_suspend,
};

static void perf_ibs_pm_init(void)
{
	register_syscore_ops(&perf_ibs_syscore_ops);
}

#else

static inline void perf_ibs_pm_init(void) { }

#endif

static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
{
	clear_APIC_ibs();
	return 0;
}

static __init int amd_ibs_init(void)
{
	u32 caps;

	caps = __get_ibs_caps();
	if (!caps)
		return -ENODEV;	/* ibs not supported by the cpu */

	ibs_eilvt_setup();

	if (!ibs_eilvt_valid())
		return -EINVAL;

	perf_ibs_pm_init();

	ibs_caps = caps;
	/* make ibs_caps visible to other cpus: */
	smp_mb();
	/*
	 * x86_pmu_amd_ibs_starting_cpu will be called from core on
	 * all online cpus.
	 */
	cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
			  "perf/x86/amd/ibs:starting",
			  x86_pmu_amd_ibs_starting_cpu,
			  x86_pmu_amd_ibs_dying_cpu);

	return perf_event_ibs_init();
}

/* Since we need the pci subsystem to init ibs we can't do this earlier: */
device_initcall(amd_ibs_init);