/*
 * Performance events x86 architecture code
 *
 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 * Copyright (C) 2009 Jaswinder Singh Rajput
 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 * Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 * For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <linux/static_call.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;
static struct pmu pmu;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
	.pmu = &pmu,
};

DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);

/*
 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
 * from just a typename, as opposed to an actual function.
 */
DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);

DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);

DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);

DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);

DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);

DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
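 * The hardware counter itself is left running; only the delta since the
 * last update is folded into the generic event count.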
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	u64 delta;

	if (unlikely(!hwc->event_base))
		return 0;

	if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event)
		return x86_pmu.update_topdown_event(event);

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

static bool check_hw_exists(void)
{
	u64 val, val_fail = -1, val_new = ~0;
	int i, reg, reg_fail = -1, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail. The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
		       reg_fail, val_fail);
	}

	return true;

msr_fail:
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		pr_cont("PMU not available due to virtualization, using software events only.\n");
	} else {
		pr_cont("Broken PMU hardware detected, using software events only.\n");
		pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
		       reg, val_new);
	}

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >> 0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;
	cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);

	cache_op = (config >> 8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;
	cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;
	cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
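	/* a cache event may also map to an extra reg (e.g. Intel OFFCORE_RESPONSE) */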
	return x86_pmu_extra_regs(val, event);
}

int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware()) {
				err = -EBUSY;
			} else {
				reserve_ds_buffers();
				reserve_lbr_buffers();
			}
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		release_lbr_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check if we can create an event of a certain type (that no conflicting
 * events are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	/*
	 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
	 * LBR and BTS are still mutually exclusive.
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		goto out;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

out:
	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

void x86_del_exclusive(unsigned int what)
{
	atomic_dec(&active_events);

	/*
	 * See the comment in x86_add_exclusive().
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		return;

	atomic_dec(&x86_pmu.lbr_exclusive[what]);
}

int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	hwc->config |= config;

	return 0;
}

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

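	/* the requested priv-level bits must match the event's priv levels exactly */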
	return m == b;
}

int x86_pmu_max_precise(void)
{
	int precise = 0;

	/* Support for constant skid */
	if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
		precise++;

		/* Support for IP fixup */
		if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
			precise++;

		if (x86_pmu.pebs_prec_dist)
			precise++;
	}
	return precise;
}

int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = x86_pmu_max_precise();

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;

		/* There's no sense in having PEBS for non-sampling events: */
		if (!is_sampling_event(event))
			return -EINVAL;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	/* sample_regs_user never supports XMM registers */
	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
		return -EINVAL;
	/*
	 * Besides the general purpose registers, XMM registers may
	 * be collected in PEBS on some platforms, e.g. Icelake
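	 * (and then only for PEBS events, hence the precise_ip check below)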
	 */
	if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
		if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
			return -EINVAL;

		if (!event->attr.precise_ip)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
		if (is_counter_pair(hwc))
			wrmsrl(x86_pmu_config_addr(idx + 1), 0);
	}
}

/*
 * There may be a PMI landing after enabled=0. The PMI hitting could be before
 * or after disable_all.
 *
 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
 * It will not be re-enabled in the NMI handler again, because enabled=0. After
 * handling the NMI, disable_all will be called, which will not change the
 * state either. If PMI hits after disable_all, the PMU is already disabled
 * before entering NMI handler. The NMI handler will not change the state
 * either.
 *
 * So either situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	static_call(x86_pmu_disable_all)();
}

void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

struct pmu *x86_get_pmu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);

	/*
	 * All CPUs of the hybrid type have gone offline;
	 * x86_get_pmu() should not be invoked.
	 */
	if (WARN_ON_ONCE(!cpuc->pmu))
		return &pmu;

	return cpuc->pmu;
}
/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	u64	used;
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events = num;
	sched->max_weight = wmax;
	sched->max_gp = gpmax;
	sched->constraints = constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event = idx;		/* start with min weight */
	sched->state.weight = wmin;
	sched->state.unassigned = num;
}

static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* this assignment didn't work out */
	/* XXX broken vs EVENT_PAIR */
	sched->state.used &= ~BIT_ULL(sched->state.counter);

	/* try the next one */
	sched->state.counter++;

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			u64 mask = BIT_ULL(idx);

			if (sched->state.used & mask)
				continue;

			sched->state.used |= mask;
			goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		u64 mask = BIT_ULL(idx);

		if (c->flags & PERF_X86_EVENT_PAIR)
			mask |= mask << 1;

		if (sched->state.used & mask)
			continue;

		if (sched->state.nr_gp++ >= sched->max_gp)
			return false;

		sched->state.used |= mask;
		goto done;
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
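 * (An event's weight is the number of counters it can be scheduled on,
 * so the most constrained events get placed first.)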
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct event_constraint **constraints, int n,
			int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	struct perf_event *e;
	int n0, i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;
	u64 used_mask = 0;

	/*
	 * Compute the number of events already present; see x86_pmu_add(),
	 * validate_group() and x86_pmu_commit_txn(). For the former two
	 * cpuc->n_events hasn't been updated yet, while for the latter
	 * cpuc->n_txn contains the number of events added in the current
	 * transaction.
	 */
	n0 = cpuc->n_events;
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		n0 -= cpuc->n_txn;

	static_call_cond(x86_pmu_start_scheduling)(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		c = cpuc->event_constraint[i];

		/*
		 * Previously scheduled events should have a cached constraint,
		 * while new events should not have one.
		 */
		WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));

		/*
		 * Request constraints for new events; or for those events that
		 * have a dynamic constraint -- for those the constraint can
		 * change due to external factors (sibling state, allow_tfa).
		 */
		if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
			c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
			cpuc->event_constraint[i] = c;
		}

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		u64 mask;

		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		mask = BIT_ULL(hwc->idx);
		if (is_counter_pair(hwc))
			mask |= mask << 1;

		/* not already used */
		if (used_mask & mask)
			break;

		used_mask |= mask;

		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of sibling thread by
		 * ensuring at most half the counters cannot be in exclusive
		 * mode. There are no designated counters for the limits. Any
		 * N/2 counters can be used. This helps with events with
		 * specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		/*
		 * Reduce the amount of available counters to allow fitting
		 * the extra Merge events needed by large increment events.
		 */
		if (x86_pmu.flags & PMU_FL_PAIR) {
			gpmax = x86_pmu.num_counters - cpuc->n_pair;
			WARN_ON(gpmax <= 0);
		}

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
		}
	} else {
		for (i = n0; i < n; i++) {
			e = cpuc->event_list[i];

			/*
			 * release events that failed scheduling
			 */
			static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);

			cpuc->event_constraint[i] = NULL;
		}
	}

	static_call_cond(x86_pmu_stop_scheduling)(cpuc);

	return unsched ? -EINVAL : 0;
}

static int add_nr_metric_event(struct cpu_hw_events *cpuc,
			       struct perf_event *event)
{
	if (is_metric_event(event)) {
		if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
			return -EINVAL;
		cpuc->n_metric++;
		cpuc->n_txn_metric++;
	}

	return 0;
}

static void del_nr_metric_event(struct cpu_hw_events *cpuc,
				struct perf_event *event)
{
	if (is_metric_event(event))
		cpuc->n_metric--;
}

static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
			 int max_count, int n)
{

	if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
		return -EINVAL;

	if (n >= max_count + cpuc->n_metric)
		return -EINVAL;

	cpuc->event_list[n] = event;
	if (is_counter_pair(&event->hw)) {
		cpuc->n_pair++;
		cpuc->n_txn_pair++;
	}

	return 0;
}

/*
 * dogrp: true if we must collect sibling events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;
	if (!cpuc->n_events)
		cpuc->pebs_output = 0;

	if (!cpuc->is_fake && leader->attr.precise_ip) {
		/*
		 * For PEBS->PT, if !aux_event, the group leader (PT) went
		 * away, the group was broken down and this singleton event
		 * can't schedule any more.
		 */
		if (is_pebs_pt(leader) && !leader->aux_event)
			return -EINVAL;

		/*
		 * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
		 */
		if (cpuc->pebs_output &&
		    cpuc->pebs_output != is_pebs_pt(leader) + 1)
			return -EINVAL;

		cpuc->pebs_output = is_pebs_pt(leader) + 1;
	}

	if (is_x86_event(leader)) {
		if (collect_event(cpuc, leader, max_count, n))
			return -EINVAL;
		n++;
	}

	if (!dogrp)
		return n;

	for_each_sibling_event(event, leader) {
		if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (collect_event(cpuc, event, max_count, n))
			return -EINVAL;

		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;
	int idx;

	idx = hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	switch (hwc->idx) {
	case INTEL_PMC_IDX_FIXED_BTS:
	case INTEL_PMC_IDX_FIXED_VLBR:
		hwc->config_base = 0;
		hwc->event_base = 0;
		break;

	case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
		/* All the metric events are mapped onto the fixed counter 3. */
		idx = INTEL_PMC_IDX_FIXED_SLOTS;
		/* fall through */
	case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
				(idx - INTEL_PMC_IDX_FIXED);
		hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
					INTEL_PMC_FIXED_RDPMC_BASE;
		break;

	default:
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base = x86_pmu_event_addr(hwc->idx);
		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
		break;
	}
}

/**
 * x86_perf_rdpmc_index - Return PMC counter used for event
 * @event: the perf_event to which the PMC counter was assigned
 *
 * The counter assigned to this performance event may change if interrupts
 * are enabled. This counter should thus never be used while interrupts are
 * enabled. Before this function is used to obtain the assigned counter the
 * event should be checked for validity using, for example,
 * perf_event_read_local(), within the same interrupt disabled section in
 * which this counter is planned to be used.
 *
 * Return: The index of the performance monitoring counter assigned to
 * @perf_event.
 */
int x86_perf_rdpmc_index(struct perf_event *event)
{
	lockdep_assert_irqs_disabled();

	return event->hw.event_base_rdpmc;
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);

static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming the counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		/*
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	static_call(x86_pmu_enable_all)(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (unlikely(!hwc->event_base))
		return 0;

	if (unlikely(is_topdown_count(event)) &&
	    x86_pmu.set_topdown_event_period)
		return x86_pmu.set_topdown_event_period(event);

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	if (x86_pmu.limit_period)
		left = x86_pmu.limit_period(event, left);

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Sign extend the Merge event counter's upper 16 bits since
	 * we currently declare a 48-bit counter width
	 */
	if (is_counter_pair(hwc))
		wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff);

	/*
	 * Due to an erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base,
		       (u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}

void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole.
	 *
	 * If commit fails, we'll call ->del() on all events
	 * for which ->add() was called.
	 */
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		goto done_collect;

	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	/*
	 * Commit the collect_events() state. See x86_pmu_del() and
	 * x86_pmu_*_txn().
	 */
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	/*
	 * This is before x86_pmu_enable() will call x86_pmu_start(),
	 * so we enable LBRs before an event needs them etc..
	 */
	static_call_cond(x86_pmu_add)(event);

	ret = 0;
out:
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	static_call(x86_pmu_enable)(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs, debugctl;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status: %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
		if (x86_pmu.pebs_constraints) {
			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
			pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
		}
		if (x86_pmu.lbr_nr) {
			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
			pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
		}
	}
	pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (test_bit(hwc->idx, cpuc->active_mask)) {
		static_call(x86_pmu_disable)(event);
		__clear_bit(hwc->idx, cpuc->active_mask);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int i;

	/*
	 * If we're called during a txn, we only need to undo x86_pmu.add.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 *
	 * XXX assumes any ->del() called during a TXN will only be on
	 * an event added during that same TXN.
	 */
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		goto do_del;

	/*
	 * Not a TXN, therefore cleanup properly.
	 */
	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i])
			break;
	}

	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
		return;

	/* If we have a newly added event; make sure to decrease n_added. */
	if (i >= cpuc->n_events - cpuc->n_added)
		--cpuc->n_added;

	static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);

	/* Delete the array entry. */
	while (++i < cpuc->n_events) {
		cpuc->event_list[i-1] = cpuc->event_list[i];
		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
	}
	cpuc->event_constraint[i-1] = NULL;
	--cpuc->n_events;
	if (x86_pmu.intel_cap.perf_metrics)
		del_nr_metric_event(cpuc, event);

	perf_event_update_userpage(event);

do_del:

	/*
	 * This is after x86_pmu_stop(); so we disable LBRs after any
	 * event can need them etc..
	 */
	static_call_cond(x86_pmu_del)(event);
}

int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	cpuc = this_cpu_ptr(&cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler. As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		perf_sample_data_init(&data, 0, event->hw.last_period);

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	u64 start_clock;
	u64 finish_clock;
	int ret;

	/*
	 * All PMUs/events that share this PMI handler should make sure to
	 * increment active_events for their events.
	 */
	if (!atomic_read(&active_events))
		return NMI_DONE;

	start_clock = sched_clock();
	ret = static_call(x86_pmu_handle_irq)(regs);
	finish_clock = sched_clock();

	perf_sample_event_took(finish_clock - start_clock);

	return ret;
}
NOKPROBE_SYMBOL(perf_event_nmi_handler);

struct event_constraint emptyconstraint;
struct event_constraint unconstrained;

static int x86_pmu_prepare_cpu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int i;

	for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
		cpuc->kfree_on_online[i] = NULL;
	if (x86_pmu.cpu_prepare)
		return x86_pmu.cpu_prepare(cpu);
	return 0;
}

static int x86_pmu_dead_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_dead)
		x86_pmu.cpu_dead(cpu);
	return 0;
}

static int x86_pmu_online_cpu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int i;

	for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
		kfree(cpuc->kfree_on_online[i]);
		cpuc->kfree_on_online[i] = NULL;
	}
	return 0;
}

static int x86_pmu_starting_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_starting)
		x86_pmu.cpu_starting(cpu);
	return 0;
}

static int x86_pmu_dying_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_dying)
		x86_pmu.cpu_dying(cpu);
	return 0;
}

static void __init pmu_check_apic(void)
{
	if (boot_cpu_has(X86_FEATURE_APIC))
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");

	/*
	 * If we have a PMU initialized but no APIC
	 * interrupts, we cannot sample hardware
	 * events (user-space has to fall back and
	 * sample via a hrtimer based software event):
	 */
	pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;

}

static struct attribute_group x86_pmu_format_group __ro_after_init = {
	.name = "format",
	.attrs = NULL,
};

ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);
	u64 config = 0;

	if (pmu_attr->id < x86_pmu.max_events)
		config = x86_pmu.event_map(pmu_attr->id);

	/* string trumps id */
	if (pmu_attr->event_str)
		return sprintf(page, "%s", pmu_attr->event_str);

	return x86_pmu.events_sysfs_show(page, config);
}
EXPORT_SYMBOL_GPL(events_sysfs_show);

ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
			  char *page)
{
	struct perf_pmu_events_ht_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_ht_attr, attr);

	/*
	 * Report conditional events depending on Hyper-Threading.
	 *
	 * This is overly conservative as usually the HT special
	 * handling is not needed if the other CPU thread is idle.
	 *
	 * Note this does not (and cannot) handle the case when thread
	 * siblings are invisible, for example with virtualization
	 * if they are owned by some other guest. The user tool
	 * has to re-read when a thread sibling gets onlined later.
	 */
	return sprintf(page, "%s",
			topology_max_smt_threads() > 1 ?
			pmu_attr->event_str_ht :
			pmu_attr->event_str_noht);
}

EVENT_ATTR(cpu-cycles,			CPU_CYCLES		);
EVENT_ATTR(instructions,		INSTRUCTIONS		);
EVENT_ATTR(cache-references,		CACHE_REFERENCES	);
EVENT_ATTR(cache-misses,		CACHE_MISSES		);
EVENT_ATTR(branch-instructions,		BRANCH_INSTRUCTIONS	);
EVENT_ATTR(branch-misses,		BRANCH_MISSES		);
EVENT_ATTR(bus-cycles,			BUS_CYCLES		);
EVENT_ATTR(stalled-cycles-frontend,	STALLED_CYCLES_FRONTEND	);
EVENT_ATTR(stalled-cycles-backend,	STALLED_CYCLES_BACKEND	);
EVENT_ATTR(ref-cycles,			REF_CPU_CYCLES		);

static struct attribute *empty_attrs;

static struct attribute *events_attr[] = {
	EVENT_PTR(CPU_CYCLES),
	EVENT_PTR(INSTRUCTIONS),
	EVENT_PTR(CACHE_REFERENCES),
	EVENT_PTR(CACHE_MISSES),
	EVENT_PTR(BRANCH_INSTRUCTIONS),
	EVENT_PTR(BRANCH_MISSES),
	EVENT_PTR(BUS_CYCLES),
	EVENT_PTR(STALLED_CYCLES_FRONTEND),
	EVENT_PTR(STALLED_CYCLES_BACKEND),
	EVENT_PTR(REF_CPU_CYCLES),
	NULL,
};

/*
 * Remove all undefined events (x86_pmu.event_map(id) == 0)
 * out of events_attr attributes.
 */
static umode_t
is_visible(struct kobject *kobj, struct attribute *attr, int idx)
{
	struct perf_pmu_events_attr *pmu_attr;

	if (idx >= x86_pmu.max_events)
		return 0;

	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
	/* str trumps id */
	return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
}

static struct attribute_group x86_pmu_events_group __ro_after_init = {
	.name = "events",
	.attrs = events_attr,
	.is_visible = is_visible,
};

ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
{
	u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
	u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
	bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
	bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
	bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
	bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
	ssize_t ret;

	/*
	 * We have a whole page to spend and just a little data
	 * to write, so we can safely use sprintf.
	 */
	ret = sprintf(page, "event=0x%02llx", event);

	if (umask)
		ret += sprintf(page + ret, ",umask=0x%02llx", umask);

	if (edge)
		ret += sprintf(page + ret, ",edge");

	if (pc)
		ret += sprintf(page + ret, ",pc");

	if (any)
		ret += sprintf(page + ret, ",any");

	if (inv)
		ret += sprintf(page + ret, ",inv");

	if (cmask)
		ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);

	ret += sprintf(page + ret, "\n");

	return ret;
}

static struct attribute_group x86_pmu_attr_group;
static struct attribute_group x86_pmu_caps_group;

static void x86_pmu_static_call_update(void)
{
	static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
	static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
	static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
	static_call_update(x86_pmu_enable, x86_pmu.enable);
	static_call_update(x86_pmu_disable, x86_pmu.disable);

	static_call_update(x86_pmu_add, x86_pmu.add);
	static_call_update(x86_pmu_del, x86_pmu.del);
	static_call_update(x86_pmu_read, x86_pmu.read);

	static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
	static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
	static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);

	static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
	static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
	static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);

	static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
	static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);

	static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
	static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
}

static void _x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

static int __init init_hw_perf_events(void)
{
	struct x86_pmu_quirk *quirk;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	case X86_VENDOR_HYGON:
		err = amd_pmu_init();
		x86_pmu.name = "HYGON";
		break;
	case X86_VENDOR_ZHAOXIN:
	case X86_VENDOR_CENTAUR:
		err = zhaoxin_pmu_init();
		break;
	default:
		err = -ENOTSUPP;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return 0;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists())
		return 0;

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */

	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
		quirk->func();

	if (!x86_pmu.intel_ctrl)
		x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	perf_events_lapic_init();
	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters, 0, 0);

	x86_pmu_format_group.attrs = x86_pmu.format_attrs;

	if (!x86_pmu.events_sysfs_show)
x86_pmu_events_group.attrs = &empty_attrs; 2001 2002 pmu.attr_update = x86_pmu.attr_update; 2003 2004 pr_info("... version: %d\n", x86_pmu.version); 2005 pr_info("... bit width: %d\n", x86_pmu.cntval_bits); 2006 pr_info("... generic registers: %d\n", x86_pmu.num_counters); 2007 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); 2008 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 2009 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 2010 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 2011 2012 if (!x86_pmu.read) 2013 x86_pmu.read = _x86_pmu_read; 2014 2015 x86_pmu_static_call_update(); 2016 2017 /* 2018 * Install callbacks. Core will call them for each online 2019 * cpu. 2020 */ 2021 err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", 2022 x86_pmu_prepare_cpu, x86_pmu_dead_cpu); 2023 if (err) 2024 return err; 2025 2026 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, 2027 "perf/x86:starting", x86_pmu_starting_cpu, 2028 x86_pmu_dying_cpu); 2029 if (err) 2030 goto out; 2031 2032 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", 2033 x86_pmu_online_cpu, NULL); 2034 if (err) 2035 goto out1; 2036 2037 err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); 2038 if (err) 2039 goto out2; 2040 2041 return 0; 2042 2043out2: 2044 cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); 2045out1: 2046 cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); 2047out: 2048 cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); 2049 return err; 2050} 2051early_initcall(init_hw_perf_events); 2052 2053static void x86_pmu_read(struct perf_event *event) 2054{ 2055 static_call(x86_pmu_read)(event); 2056} 2057 2058/* 2059 * Start group events scheduling transaction 2060 * Set the flag to make pmu::enable() not perform the 2061 * schedulability test, it will be performed at commit time 2062 * 2063 * We only support PERF_PMU_TXN_ADD transactions. Save the 2064 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD 2065 * transactions. 2066 */ 2067static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) 2068{ 2069 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2070 2071 WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ 2072 2073 cpuc->txn_flags = txn_flags; 2074 if (txn_flags & ~PERF_PMU_TXN_ADD) 2075 return; 2076 2077 perf_pmu_disable(pmu); 2078 __this_cpu_write(cpu_hw_events.n_txn, 0); 2079 __this_cpu_write(cpu_hw_events.n_txn_pair, 0); 2080 __this_cpu_write(cpu_hw_events.n_txn_metric, 0); 2081} 2082 2083/* 2084 * Stop group events scheduling transaction 2085 * Clear the flag and pmu::enable() will perform the 2086 * schedulability test. 2087 */ 2088static void x86_pmu_cancel_txn(struct pmu *pmu) 2089{ 2090 unsigned int txn_flags; 2091 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2092 2093 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ 2094 2095 txn_flags = cpuc->txn_flags; 2096 cpuc->txn_flags = 0; 2097 if (txn_flags & ~PERF_PMU_TXN_ADD) 2098 return; 2099 2100 /* 2101 * Truncate collected array by the number of events added in this 2102 * transaction. See x86_pmu_add() and x86_pmu_*_txn(). 
/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	unsigned int txn_flags;
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */

	txn_flags = cpuc->txn_flags;
	cpuc->txn_flags = 0;
	if (txn_flags & ~PERF_PMU_TXN_ADD)
		return;

	/*
	 * Truncate collected array by the number of events added in this
	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
	__this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 *
 * Does not cancel the transaction on failure; expects the caller to do this.
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */

	if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
		cpuc->txn_flags = 0;
		return 0;
	}

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy the new assignment now that we know it is possible;
	 * it will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->txn_flags = 0;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	intel_cpuc_finish(cpuc);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);
	cpuc->is_fake = 1;

	if (intel_cpuc_prepare(cpuc, cpu))
		goto error;

	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);

	if (!c || !c->weight)
		ret = -EINVAL;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}
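/*
 * A constraint with zero weight means no counter can host the event, which
 * is why validate_event() above treats !c->weight the same as a missing
 * constraint.
 */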
/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL, n;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = 0;
	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}

static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
		event->destroy = NULL;
	}

	if (READ_ONCE(x86_pmu.attr_rdpmc) &&
	    !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;

	return err;
}
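/*
 * The two callbacks below keep CR4.PCE in sync with the number of
 * rdpmc-capable events mapped into an mm: the first mapping of such an
 * event bumps mm->context.perf_rdpmc_allowed and turns CR4.PCE on for all
 * CPUs currently running that mm, and the last unmap turns it back off.
 */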
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	/*
	 * This function relies on not being called concurrently in two
	 * tasks in the same mm. Otherwise one task could observe
	 * perf_rdpmc_allowed > 1 and return all the way back to
	 * userspace with CR4.PCE clear while another task is still
	 * doing on_each_cpu_mask() to propagate CR4.PCE.
	 *
	 * For now, this can't happen because all callers hold mmap_lock
	 * for write. If this changes, we'll need a different solution.
	 */
	mmap_assert_write_locked(mm);

	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
}

static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
}

static int x86_pmu_event_idx(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return 0;

	if (is_metric_idx(hwc->idx))
		return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
	else
		return hwc->event_base_rdpmc + 1;
}

static ssize_t get_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      char *buf)
{
	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	unsigned long val;
	ssize_t ret;

	ret = kstrtoul(buf, 0, &val);
	if (ret)
		return ret;

	if (val > 2)
		return -EINVAL;

	if (x86_pmu.attr_rdpmc_broken)
		return -ENOTSUPP;

	if (val != x86_pmu.attr_rdpmc) {
		/*
		 * Changing into or out of never available or always available,
		 * aka perf-event-bypassing mode. This path is extremely slow,
		 * but only root can trigger it, so it's okay.
		 */
		if (val == 0)
			static_branch_inc(&rdpmc_never_available_key);
		else if (x86_pmu.attr_rdpmc == 0)
			static_branch_dec(&rdpmc_never_available_key);

		if (val == 2)
			static_branch_inc(&rdpmc_always_available_key);
		else if (x86_pmu.attr_rdpmc == 2)
			static_branch_dec(&rdpmc_always_available_key);

		on_each_cpu(cr4_update_pce, NULL, 1);
		x86_pmu.attr_rdpmc = val;
	}

	return count;
}

static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);

static struct attribute *x86_pmu_attrs[] = {
	&dev_attr_rdpmc.attr,
	NULL,
};

static struct attribute_group x86_pmu_attr_group __ro_after_init = {
	.attrs = x86_pmu_attrs,
};

static ssize_t max_precise_show(struct device *cdev,
				struct device_attribute *attr,
				char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
}

static DEVICE_ATTR_RO(max_precise);

static struct attribute *x86_pmu_caps_attrs[] = {
	&dev_attr_max_precise.attr,
	NULL
};

static struct attribute_group x86_pmu_caps_group __ro_after_init = {
	.name = "caps",
	.attrs = x86_pmu_caps_attrs,
};

static const struct attribute_group *x86_pmu_attr_groups[] = {
	&x86_pmu_attr_group,
	&x86_pmu_format_group,
	&x86_pmu_events_group,
	&x86_pmu_caps_group,
	NULL,
};

static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
}

static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
				  struct perf_event_context *next)
{
	static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
}

void perf_check_microcode(void)
{
	if (x86_pmu.check_microcode)
		x86_pmu.check_microcode();
}
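/*
 * x86_pmu_check_period() below rejects sample periods the hardware cannot
 * handle: either the vendor-specific ->check_period() refuses the value, or
 * ->limit_period() would have to raise it, meaning the requested period is
 * smaller than what the PMU supports.
 */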
static int
x86_pmu_check_period(struct perf_event *event, u64 value)
{
	if (x86_pmu.check_period && x86_pmu.check_period(event, value))
		return -EINVAL;

	if (value && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, value) > value)
			return -EINVAL;
	}

	return 0;
}

static int x86_pmu_aux_output_match(struct perf_event *event)
{
	if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
		return 0;

	if (x86_pmu.aux_output_match)
		return x86_pmu.aux_output_match(event);

	return 0;
}

static struct pmu pmu = {
	.pmu_enable		= x86_pmu_enable,
	.pmu_disable		= x86_pmu_disable,

	.attr_groups		= x86_pmu_attr_groups,

	.event_init		= x86_pmu_event_init,

	.event_mapped		= x86_pmu_event_mapped,
	.event_unmapped		= x86_pmu_event_unmapped,

	.add			= x86_pmu_add,
	.del			= x86_pmu_del,
	.start			= x86_pmu_start,
	.stop			= x86_pmu_stop,
	.read			= x86_pmu_read,

	.start_txn		= x86_pmu_start_txn,
	.cancel_txn		= x86_pmu_cancel_txn,
	.commit_txn		= x86_pmu_commit_txn,

	.event_idx		= x86_pmu_event_idx,
	.sched_task		= x86_pmu_sched_task,
	.swap_task_ctx		= x86_pmu_swap_task_ctx,
	.check_period		= x86_pmu_check_period,

	.aux_output_match	= x86_pmu_aux_output_match,
};

void arch_perf_update_userpage(struct perf_event *event,
			       struct perf_event_mmap_page *userpg, u64 now)
{
	struct cyc2ns_data data;
	u64 offset;

	userpg->cap_user_time = 0;
	userpg->cap_user_time_zero = 0;
	userpg->cap_user_rdpmc =
		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
	userpg->pmc_width = x86_pmu.cntval_bits;

	if (!using_native_sched_clock() || !sched_clock_stable())
		return;

	cyc2ns_read_begin(&data);

	offset = data.cyc2ns_offset + __sched_clock_offset;

	/*
	 * Internal timekeeping for enabled/running/stopped times
	 * is always in the local_clock domain.
	 */
	userpg->cap_user_time = 1;
	userpg->time_mult = data.cyc2ns_mul;
	userpg->time_shift = data.cyc2ns_shift;
	userpg->time_offset = offset - now;

	/*
	 * cap_user_time_zero doesn't make sense when we're using a different
	 * time base for the records.
	 */
	if (!event->attr.use_clockid) {
		userpg->cap_user_time_zero = 1;
		userpg->time_zero = offset;
	}

	cyc2ns_read_end();
}
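/*
 * Userspace combines the fields set above with RDTSC/RDPMC roughly as
 * follows (a sketch; the authoritative description lives with the
 * perf_event_mmap_page definition in the UAPI headers):
 *
 *	cyc  = rdtsc();
 *	quot = cyc >> time_shift;
 *	rem  = cyc & ((1ULL << time_shift) - 1);
 *	time = time_offset + quot * time_mult +
 *	       ((rem * time_mult) >> time_shift);
 *
 * with time_zero playing the same role as time_offset when converting an
 * absolute TSC value rather than a delta.
 */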
/*
 * Determine whether the regs were taken from an irq/exception handler rather
 * than from perf_arch_fetch_caller_regs().
 */
static bool perf_hw_regs(struct pt_regs *regs)
{
	return regs->flags & X86_EFLAGS_FIXED;
}

void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
	struct unwind_state state;
	unsigned long addr;

	if (guest_cbs && guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest OS callchains yet */
		return;
	}

	if (perf_callchain_store(entry, regs->ip))
		return;

	if (perf_hw_regs(regs))
		unwind_start(&state, current, regs, NULL);
	else
		unwind_start(&state, current, NULL, (void *)regs->sp);

	for (; !unwind_done(&state); unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr || perf_callchain_store(entry, addr))
			return;
	}
}

static inline int
valid_user_frame(const void __user *fp, unsigned long size)
{
	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
}

static unsigned long get_segment_base(unsigned int segment)
{
	struct desc_struct *desc;
	unsigned int idx = segment >> 3;

	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/* IRQs are off, so this synchronizes with smp_store_release */
		ldt = READ_ONCE(current->active_mm->context.ldt);
		if (!ldt || idx >= ldt->nr_entries)
			return 0;

		desc = &ldt->entries[idx];
#else
		return 0;
#endif
	} else {
		if (idx >= GDT_ENTRIES)
			return 0;

		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
	}

	return get_desc_base(desc);
}

#ifdef CONFIG_IA32_EMULATION

#include <linux/compat.h>

static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
{
	/* 32-bit process in 64-bit kernel. */
	unsigned long ss_base, cs_base;
	struct stack_frame_ia32 frame;
	const struct stack_frame_ia32 __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	cs_base = get_segment_base(regs->cs);
	ss_base = get_segment_base(regs->ss);

	fp = compat_ptr(ss_base + regs->bp);
	pagefault_disable();
	while (entry->nr < entry->max_stack) {
		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		if (__get_user(frame.next_frame, &fp->next_frame))
			break;
		if (__get_user(frame.return_address, &fp->return_address))
			break;

		perf_callchain_store(entry, cs_base + frame.return_address);
		fp = compat_ptr(ss_base + frame.next_frame);
	}
	pagefault_enable();
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
{
	return 0;
}
#endif
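/*
 * perf_callchain_user() below walks the user stack by following the frame
 * pointer chain starting at regs->bp, with page faults disabled; a faulting
 * or invalid frame simply terminates the walk. Compat (32-bit) tasks are
 * handled by perf_callchain_user32() above, which additionally applies the
 * CS/SS segment bases.
 */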
void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
	struct stack_frame frame;
	const struct stack_frame __user *fp;

	if (guest_cbs && guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest OS callchains yet */
		return;
	}

	/*
	 * We don't know what to do with VM86 stacks; ignore them for now.
	 */
	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
		return;

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (!nmi_uaccess_okay())
		return;

	if (perf_callchain_user32(regs, entry))
		return;

	pagefault_disable();
	while (entry->nr < entry->max_stack) {
		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		if (__get_user(frame.next_frame, &fp->next_frame))
			break;
		if (__get_user(frame.return_address, &fp->return_address))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = (void __user *)frame.next_frame;
	}
	pagefault_enable();
}

/*
 * Deal with code segment offsets for the various execution modes:
 *
 * VM86 - the good olde 16 bit days, where the linear address is
 *        20 bits and we use regs->ip + 0x10 * regs->cs.
 *
 * IA32 - Where we need to look at GDT/LDT segment descriptor tables
 *        to figure out what the 32bit base address is.
 *
 * X32  - has TIF_X32 set, but is running in x86_64
 *
 * X86_64 - CS,DS,SS,ES are all zero based.
 */
static unsigned long code_segment_base(struct pt_regs *regs)
{
	/*
	 * For IA32 we look at the GDT/LDT segment base to convert the
	 * effective IP to a linear address.
	 */

#ifdef CONFIG_X86_32
	/*
	 * If we are in VM86 mode, add the segment offset to convert to a
	 * linear address.
	 */
	if (regs->flags & X86_VM_MASK)
		return 0x10 * regs->cs;

	if (user_mode(regs) && regs->cs != __USER_CS)
		return get_segment_base(regs->cs);
#else
	if (user_mode(regs) && !user_64bit_mode(regs) &&
	    regs->cs != __USER32_CS)
		return get_segment_base(regs->cs);
#endif
	return 0;
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();

	if (guest_cbs && guest_cbs->is_in_guest())
		return guest_cbs->get_guest_ip();

	return regs->ip + code_segment_base(regs);
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
	int misc = 0;

	if (guest_cbs && guest_cbs->is_in_guest()) {
		if (guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}

void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
	cap->version		= x86_pmu.version;
	cap->num_counters_gp	= x86_pmu.num_counters;
	cap->num_counters_fixed	= x86_pmu.num_counters_fixed;
	cap->bit_width_gp	= x86_pmu.cntval_bits;
	cap->bit_width_fixed	= x86_pmu.cntval_bits;
	cap->events_mask	= (unsigned int)x86_pmu.events_maskl;
	cap->events_mask_len	= x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
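/*
 * perf_get_x86_pmu_capability() exports the basic counter geometry so that
 * other in-kernel users (KVM's vPMU, for instance) can size a guest-visible
 * PMU without poking at x86_pmu directly.
 */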