xref: /kernel/linux/linux-5.10/arch/x86/events/core.c (revision 8c2ecf20)
1/*
2 * Performance events x86 architecture code
3 *
4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 *  Copyright (C) 2009 Jaswinder Singh Rajput
7 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
9 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
11 *
12 *  For licencing details see kernel-base/COPYING
13 */
14
15#include <linux/perf_event.h>
16#include <linux/capability.h>
17#include <linux/notifier.h>
18#include <linux/hardirq.h>
19#include <linux/kprobes.h>
20#include <linux/export.h>
21#include <linux/init.h>
22#include <linux/kdebug.h>
23#include <linux/sched/mm.h>
24#include <linux/sched/clock.h>
25#include <linux/uaccess.h>
26#include <linux/slab.h>
27#include <linux/cpu.h>
28#include <linux/bitops.h>
29#include <linux/device.h>
30#include <linux/nospec.h>
31#include <linux/static_call.h>
32
33#include <asm/apic.h>
34#include <asm/stacktrace.h>
35#include <asm/nmi.h>
36#include <asm/smp.h>
37#include <asm/alternative.h>
38#include <asm/mmu_context.h>
39#include <asm/tlbflush.h>
40#include <asm/timer.h>
41#include <asm/desc.h>
42#include <asm/ldt.h>
43#include <asm/unwind.h>
44
45#include "perf_event.h"
46
47struct x86_pmu x86_pmu __read_mostly;
48static struct pmu pmu;
49
50DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
51	.enabled = 1,
52	.pmu = &pmu,
53};
54
55DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
56DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
57
58/*
59 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
60 * from just a typename, as opposed to an actual function.
61 */
62DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq,  *x86_pmu.handle_irq);
63DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
64DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all,  *x86_pmu.enable_all);
65DEFINE_STATIC_CALL_NULL(x86_pmu_enable,	     *x86_pmu.enable);
66DEFINE_STATIC_CALL_NULL(x86_pmu_disable,     *x86_pmu.disable);
67
68DEFINE_STATIC_CALL_NULL(x86_pmu_add,  *x86_pmu.add);
69DEFINE_STATIC_CALL_NULL(x86_pmu_del,  *x86_pmu.del);
70DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
71
72DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events,       *x86_pmu.schedule_events);
73DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
74DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
75
76DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling,  *x86_pmu.start_scheduling);
77DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
78DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling,   *x86_pmu.stop_scheduling);
79
80DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task,    *x86_pmu.sched_task);
81DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
82
83DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs,   *x86_pmu.drain_pebs);
84DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
85
86u64 __read_mostly hw_cache_event_ids
87				[PERF_COUNT_HW_CACHE_MAX]
88				[PERF_COUNT_HW_CACHE_OP_MAX]
89				[PERF_COUNT_HW_CACHE_RESULT_MAX];
90u64 __read_mostly hw_cache_extra_regs
91				[PERF_COUNT_HW_CACHE_MAX]
92				[PERF_COUNT_HW_CACHE_OP_MAX]
93				[PERF_COUNT_HW_CACHE_RESULT_MAX];
94
95/*
96 * Propagate event elapsed time into the generic event.
97 * Can only be executed on the CPU where the event is active.
98 * Returns the delta events processed.
99 */
100u64 x86_perf_event_update(struct perf_event *event)
101{
102	struct hw_perf_event *hwc = &event->hw;
103	int shift = 64 - x86_pmu.cntval_bits;
104	u64 prev_raw_count, new_raw_count;
105	u64 delta;
106
107	if (unlikely(!hwc->event_base))
108		return 0;
109
110	if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event)
111		return x86_pmu.update_topdown_event(event);
112
113	/*
114	 * Careful: an NMI might modify the previous event value.
115	 *
116	 * Our tactic to handle this is to first atomically read and
117	 * exchange a new raw count - then add that new-prev delta
118	 * count to the generic event atomically:
119	 */
120again:
121	prev_raw_count = local64_read(&hwc->prev_count);
122	rdpmcl(hwc->event_base_rdpmc, new_raw_count);
123
124	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
125					new_raw_count) != prev_raw_count)
126		goto again;
127
128	/*
129	 * Now we have the new raw value and have updated the prev
130	 * timestamp already. We can now calculate the elapsed delta
131	 * (event-)time and add that to the generic event.
132	 *
133	 * Careful, not all hw sign-extends above the physical width
134	 * of the count.
135	 */
136	delta = (new_raw_count << shift) - (prev_raw_count << shift);
137	delta >>= shift;
138
139	local64_add(delta, &event->count);
140	local64_sub(delta, &hwc->period_left);
141
142	return new_raw_count;
143}
144
145/*
146 * Find and validate any extra registers to set up.
147 */
148static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
149{
150	struct hw_perf_event_extra *reg;
151	struct extra_reg *er;
152
153	reg = &event->hw.extra_reg;
154
155	if (!x86_pmu.extra_regs)
156		return 0;
157
158	for (er = x86_pmu.extra_regs; er->msr; er++) {
159		if (er->event != (config & er->config_mask))
160			continue;
161		if (event->attr.config1 & ~er->valid_mask)
162			return -EINVAL;
163		/* Check if the extra msrs can be safely accessed*/
164		if (!er->extra_msr_access)
165			return -ENXIO;
166
167		reg->idx = er->idx;
168		reg->config = event->attr.config1;
169		reg->reg = er->msr;
170		break;
171	}
172	return 0;
173}
174
175static atomic_t active_events;
176static atomic_t pmc_refcount;
177static DEFINE_MUTEX(pmc_reserve_mutex);
178
179#ifdef CONFIG_X86_LOCAL_APIC
180
181static bool reserve_pmc_hardware(void)
182{
183	int i;
184
185	for (i = 0; i < x86_pmu.num_counters; i++) {
186		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
187			goto perfctr_fail;
188	}
189
190	for (i = 0; i < x86_pmu.num_counters; i++) {
191		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
192			goto eventsel_fail;
193	}
194
195	return true;
196
197eventsel_fail:
198	for (i--; i >= 0; i--)
199		release_evntsel_nmi(x86_pmu_config_addr(i));
200
201	i = x86_pmu.num_counters;
202
203perfctr_fail:
204	for (i--; i >= 0; i--)
205		release_perfctr_nmi(x86_pmu_event_addr(i));
206
207	return false;
208}
209
210static void release_pmc_hardware(void)
211{
212	int i;
213
214	for (i = 0; i < x86_pmu.num_counters; i++) {
215		release_perfctr_nmi(x86_pmu_event_addr(i));
216		release_evntsel_nmi(x86_pmu_config_addr(i));
217	}
218}
219
220#else
221
222static bool reserve_pmc_hardware(void) { return true; }
223static void release_pmc_hardware(void) {}
224
225#endif
226
227static bool check_hw_exists(void)
228{
229	u64 val, val_fail = -1, val_new= ~0;
230	int i, reg, reg_fail = -1, ret = 0;
231	int bios_fail = 0;
232	int reg_safe = -1;
233
234	/*
235	 * Check to see if the BIOS enabled any of the counters, if so
236	 * complain and bail.
237	 */
238	for (i = 0; i < x86_pmu.num_counters; i++) {
239		reg = x86_pmu_config_addr(i);
240		ret = rdmsrl_safe(reg, &val);
241		if (ret)
242			goto msr_fail;
243		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
244			bios_fail = 1;
245			val_fail = val;
246			reg_fail = reg;
247		} else {
248			reg_safe = i;
249		}
250	}
251
252	if (x86_pmu.num_counters_fixed) {
253		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
254		ret = rdmsrl_safe(reg, &val);
255		if (ret)
256			goto msr_fail;
257		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
258			if (val & (0x03 << i*4)) {
259				bios_fail = 1;
260				val_fail = val;
261				reg_fail = reg;
262			}
263		}
264	}
265
266	/*
267	 * If all the counters are enabled, the below test will always
268	 * fail.  The tools will also become useless in this scenario.
269	 * Just fail and disable the hardware counters.
270	 */
271
272	if (reg_safe == -1) {
273		reg = reg_safe;
274		goto msr_fail;
275	}
276
277	/*
278	 * Read the current value, change it and read it back to see if it
279	 * matches, this is needed to detect certain hardware emulators
280	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
281	 */
282	reg = x86_pmu_event_addr(reg_safe);
283	if (rdmsrl_safe(reg, &val))
284		goto msr_fail;
285	val ^= 0xffffUL;
286	ret = wrmsrl_safe(reg, val);
287	ret |= rdmsrl_safe(reg, &val_new);
288	if (ret || val != val_new)
289		goto msr_fail;
290
291	/*
292	 * We still allow the PMU driver to operate:
293	 */
294	if (bios_fail) {
295		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
296		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
297			      reg_fail, val_fail);
298	}
299
300	return true;
301
302msr_fail:
303	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
304		pr_cont("PMU not available due to virtualization, using software events only.\n");
305	} else {
306		pr_cont("Broken PMU hardware detected, using software events only.\n");
307		pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
308		       reg, val_new);
309	}
310
311	return false;
312}
313
314static void hw_perf_event_destroy(struct perf_event *event)
315{
316	x86_release_hardware();
317	atomic_dec(&active_events);
318}
319
320void hw_perf_lbr_event_destroy(struct perf_event *event)
321{
322	hw_perf_event_destroy(event);
323
324	/* undo the lbr/bts event accounting */
325	x86_del_exclusive(x86_lbr_exclusive_lbr);
326}
327
328static inline int x86_pmu_initialized(void)
329{
330	return x86_pmu.handle_irq != NULL;
331}
332
333static inline int
334set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
335{
336	struct perf_event_attr *attr = &event->attr;
337	unsigned int cache_type, cache_op, cache_result;
338	u64 config, val;
339
340	config = attr->config;
341
342	cache_type = (config >> 0) & 0xff;
343	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
344		return -EINVAL;
345	cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
346
347	cache_op = (config >>  8) & 0xff;
348	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
349		return -EINVAL;
350	cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
351
352	cache_result = (config >> 16) & 0xff;
353	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
354		return -EINVAL;
355	cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
356
357	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
358
359	if (val == 0)
360		return -ENOENT;
361
362	if (val == -1)
363		return -EINVAL;
364
365	hwc->config |= val;
366	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
367	return x86_pmu_extra_regs(val, event);
368}
369
370int x86_reserve_hardware(void)
371{
372	int err = 0;
373
374	if (!atomic_inc_not_zero(&pmc_refcount)) {
375		mutex_lock(&pmc_reserve_mutex);
376		if (atomic_read(&pmc_refcount) == 0) {
377			if (!reserve_pmc_hardware()) {
378				err = -EBUSY;
379			} else {
380				reserve_ds_buffers();
381				reserve_lbr_buffers();
382			}
383		}
384		if (!err)
385			atomic_inc(&pmc_refcount);
386		mutex_unlock(&pmc_reserve_mutex);
387	}
388
389	return err;
390}
391
392void x86_release_hardware(void)
393{
394	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
395		release_pmc_hardware();
396		release_ds_buffers();
397		release_lbr_buffers();
398		mutex_unlock(&pmc_reserve_mutex);
399	}
400}
401
402/*
403 * Check if we can create event of a certain type (that no conflicting events
404 * are present).
405 */
406int x86_add_exclusive(unsigned int what)
407{
408	int i;
409
410	/*
411	 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
412	 * LBR and BTS are still mutually exclusive.
413	 */
414	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
415		goto out;
416
417	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
418		mutex_lock(&pmc_reserve_mutex);
419		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
420			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
421				goto fail_unlock;
422		}
423		atomic_inc(&x86_pmu.lbr_exclusive[what]);
424		mutex_unlock(&pmc_reserve_mutex);
425	}
426
427out:
428	atomic_inc(&active_events);
429	return 0;
430
431fail_unlock:
432	mutex_unlock(&pmc_reserve_mutex);
433	return -EBUSY;
434}
435
436void x86_del_exclusive(unsigned int what)
437{
438	atomic_dec(&active_events);
439
440	/*
441	 * See the comment in x86_add_exclusive().
442	 */
443	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
444		return;
445
446	atomic_dec(&x86_pmu.lbr_exclusive[what]);
447}
448
449int x86_setup_perfctr(struct perf_event *event)
450{
451	struct perf_event_attr *attr = &event->attr;
452	struct hw_perf_event *hwc = &event->hw;
453	u64 config;
454
455	if (!is_sampling_event(event)) {
456		hwc->sample_period = x86_pmu.max_period;
457		hwc->last_period = hwc->sample_period;
458		local64_set(&hwc->period_left, hwc->sample_period);
459	}
460
461	if (attr->type == PERF_TYPE_RAW)
462		return x86_pmu_extra_regs(event->attr.config, event);
463
464	if (attr->type == PERF_TYPE_HW_CACHE)
465		return set_ext_hw_attr(hwc, event);
466
467	if (attr->config >= x86_pmu.max_events)
468		return -EINVAL;
469
470	attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
471
472	/*
473	 * The generic map:
474	 */
475	config = x86_pmu.event_map(attr->config);
476
477	if (config == 0)
478		return -ENOENT;
479
480	if (config == -1LL)
481		return -EINVAL;
482
483	hwc->config |= config;
484
485	return 0;
486}
487
488/*
489 * check that branch_sample_type is compatible with
490 * settings needed for precise_ip > 1 which implies
491 * using the LBR to capture ALL taken branches at the
492 * priv levels of the measurement
493 */
494static inline int precise_br_compat(struct perf_event *event)
495{
496	u64 m = event->attr.branch_sample_type;
497	u64 b = 0;
498
499	/* must capture all branches */
500	if (!(m & PERF_SAMPLE_BRANCH_ANY))
501		return 0;
502
503	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
504
505	if (!event->attr.exclude_user)
506		b |= PERF_SAMPLE_BRANCH_USER;
507
508	if (!event->attr.exclude_kernel)
509		b |= PERF_SAMPLE_BRANCH_KERNEL;
510
511	/*
512	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
513	 */
514
515	return m == b;
516}
517
518int x86_pmu_max_precise(void)
519{
520	int precise = 0;
521
522	/* Support for constant skid */
523	if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
524		precise++;
525
526		/* Support for IP fixup */
527		if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
528			precise++;
529
530		if (x86_pmu.pebs_prec_dist)
531			precise++;
532	}
533	return precise;
534}
535
536int x86_pmu_hw_config(struct perf_event *event)
537{
538	if (event->attr.precise_ip) {
539		int precise = x86_pmu_max_precise();
540
541		if (event->attr.precise_ip > precise)
542			return -EOPNOTSUPP;
543
544		/* There's no sense in having PEBS for non sampling events: */
545		if (!is_sampling_event(event))
546			return -EINVAL;
547	}
548	/*
549	 * check that PEBS LBR correction does not conflict with
550	 * whatever the user is asking with attr->branch_sample_type
551	 */
552	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
553		u64 *br_type = &event->attr.branch_sample_type;
554
555		if (has_branch_stack(event)) {
556			if (!precise_br_compat(event))
557				return -EOPNOTSUPP;
558
559			/* branch_sample_type is compatible */
560
561		} else {
562			/*
563			 * user did not specify  branch_sample_type
564			 *
565			 * For PEBS fixups, we capture all
566			 * the branches at the priv level of the
567			 * event.
568			 */
569			*br_type = PERF_SAMPLE_BRANCH_ANY;
570
571			if (!event->attr.exclude_user)
572				*br_type |= PERF_SAMPLE_BRANCH_USER;
573
574			if (!event->attr.exclude_kernel)
575				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
576		}
577	}
578
579	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
580		event->attach_state |= PERF_ATTACH_TASK_DATA;
581
582	/*
583	 * Generate PMC IRQs:
584	 * (keep 'enabled' bit clear for now)
585	 */
586	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
587
588	/*
589	 * Count user and OS events unless requested not to
590	 */
591	if (!event->attr.exclude_user)
592		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
593	if (!event->attr.exclude_kernel)
594		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
595
596	if (event->attr.type == PERF_TYPE_RAW)
597		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
598
599	if (event->attr.sample_period && x86_pmu.limit_period) {
600		if (x86_pmu.limit_period(event, event->attr.sample_period) >
601				event->attr.sample_period)
602			return -EINVAL;
603	}
604
605	/* sample_regs_user never support XMM registers */
606	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
607		return -EINVAL;
608	/*
609	 * Besides the general purpose registers, XMM registers may
610	 * be collected in PEBS on some platforms, e.g. Icelake
611	 */
612	if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
613		if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
614			return -EINVAL;
615
616		if (!event->attr.precise_ip)
617			return -EINVAL;
618	}
619
620	return x86_setup_perfctr(event);
621}
622
623/*
624 * Setup the hardware configuration for a given attr_type
625 */
626static int __x86_pmu_event_init(struct perf_event *event)
627{
628	int err;
629
630	if (!x86_pmu_initialized())
631		return -ENODEV;
632
633	err = x86_reserve_hardware();
634	if (err)
635		return err;
636
637	atomic_inc(&active_events);
638	event->destroy = hw_perf_event_destroy;
639
640	event->hw.idx = -1;
641	event->hw.last_cpu = -1;
642	event->hw.last_tag = ~0ULL;
643
644	/* mark unused */
645	event->hw.extra_reg.idx = EXTRA_REG_NONE;
646	event->hw.branch_reg.idx = EXTRA_REG_NONE;
647
648	return x86_pmu.hw_config(event);
649}
650
651void x86_pmu_disable_all(void)
652{
653	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
654	int idx;
655
656	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
657		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
658		u64 val;
659
660		if (!test_bit(idx, cpuc->active_mask))
661			continue;
662		rdmsrl(x86_pmu_config_addr(idx), val);
663		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
664			continue;
665		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
666		wrmsrl(x86_pmu_config_addr(idx), val);
667		if (is_counter_pair(hwc))
668			wrmsrl(x86_pmu_config_addr(idx + 1), 0);
669	}
670}
671
672/*
673 * There may be PMI landing after enabled=0. The PMI hitting could be before or
674 * after disable_all.
675 *
676 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
677 * It will not be re-enabled in the NMI handler again, because enabled=0. After
678 * handling the NMI, disable_all will be called, which will not change the
679 * state either. If PMI hits after disable_all, the PMU is already disabled
680 * before entering NMI handler. The NMI handler will not change the state
681 * either.
682 *
683 * So either situation is harmless.
684 */
685static void x86_pmu_disable(struct pmu *pmu)
686{
687	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
688
689	if (!x86_pmu_initialized())
690		return;
691
692	if (!cpuc->enabled)
693		return;
694
695	cpuc->n_added = 0;
696	cpuc->enabled = 0;
697	barrier();
698
699	static_call(x86_pmu_disable_all)();
700}
701
702void x86_pmu_enable_all(int added)
703{
704	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
705	int idx;
706
707	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
708		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
709
710		if (!test_bit(idx, cpuc->active_mask))
711			continue;
712
713		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
714	}
715}
716
717static inline int is_x86_event(struct perf_event *event)
718{
719	return event->pmu == &pmu;
720}
721
722struct pmu *x86_get_pmu(unsigned int cpu)
723{
724	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
725
726	/*
727	 * All CPUs of the hybrid type have been offline.
728	 * The x86_get_pmu() should not be invoked.
729	 */
730	if (WARN_ON_ONCE(!cpuc->pmu))
731		return &pmu;
732
733	return cpuc->pmu;
734}
735/*
736 * Event scheduler state:
737 *
738 * Assign events iterating over all events and counters, beginning
739 * with events with least weights first. Keep the current iterator
740 * state in struct sched_state.
741 */
742struct sched_state {
743	int	weight;
744	int	event;		/* event index */
745	int	counter;	/* counter index */
746	int	unassigned;	/* number of events to be assigned left */
747	int	nr_gp;		/* number of GP counters used */
748	u64	used;
749};
750
751/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
752#define	SCHED_STATES_MAX	2
753
754struct perf_sched {
755	int			max_weight;
756	int			max_events;
757	int			max_gp;
758	int			saved_states;
759	struct event_constraint	**constraints;
760	struct sched_state	state;
761	struct sched_state	saved[SCHED_STATES_MAX];
762};
763
764/*
765 * Initialize interator that runs through all events and counters.
766 */
767static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
768			    int num, int wmin, int wmax, int gpmax)
769{
770	int idx;
771
772	memset(sched, 0, sizeof(*sched));
773	sched->max_events	= num;
774	sched->max_weight	= wmax;
775	sched->max_gp		= gpmax;
776	sched->constraints	= constraints;
777
778	for (idx = 0; idx < num; idx++) {
779		if (constraints[idx]->weight == wmin)
780			break;
781	}
782
783	sched->state.event	= idx;		/* start with min weight */
784	sched->state.weight	= wmin;
785	sched->state.unassigned	= num;
786}
787
788static void perf_sched_save_state(struct perf_sched *sched)
789{
790	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
791		return;
792
793	sched->saved[sched->saved_states] = sched->state;
794	sched->saved_states++;
795}
796
797static bool perf_sched_restore_state(struct perf_sched *sched)
798{
799	if (!sched->saved_states)
800		return false;
801
802	sched->saved_states--;
803	sched->state = sched->saved[sched->saved_states];
804
805	/* this assignment didn't work out */
806	/* XXX broken vs EVENT_PAIR */
807	sched->state.used &= ~BIT_ULL(sched->state.counter);
808
809	/* try the next one */
810	sched->state.counter++;
811
812	return true;
813}
814
815/*
816 * Select a counter for the current event to schedule. Return true on
817 * success.
818 */
819static bool __perf_sched_find_counter(struct perf_sched *sched)
820{
821	struct event_constraint *c;
822	int idx;
823
824	if (!sched->state.unassigned)
825		return false;
826
827	if (sched->state.event >= sched->max_events)
828		return false;
829
830	c = sched->constraints[sched->state.event];
831	/* Prefer fixed purpose counters */
832	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
833		idx = INTEL_PMC_IDX_FIXED;
834		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
835			u64 mask = BIT_ULL(idx);
836
837			if (sched->state.used & mask)
838				continue;
839
840			sched->state.used |= mask;
841			goto done;
842		}
843	}
844
845	/* Grab the first unused counter starting with idx */
846	idx = sched->state.counter;
847	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
848		u64 mask = BIT_ULL(idx);
849
850		if (c->flags & PERF_X86_EVENT_PAIR)
851			mask |= mask << 1;
852
853		if (sched->state.used & mask)
854			continue;
855
856		if (sched->state.nr_gp++ >= sched->max_gp)
857			return false;
858
859		sched->state.used |= mask;
860		goto done;
861	}
862
863	return false;
864
865done:
866	sched->state.counter = idx;
867
868	if (c->overlap)
869		perf_sched_save_state(sched);
870
871	return true;
872}
873
874static bool perf_sched_find_counter(struct perf_sched *sched)
875{
876	while (!__perf_sched_find_counter(sched)) {
877		if (!perf_sched_restore_state(sched))
878			return false;
879	}
880
881	return true;
882}
883
884/*
885 * Go through all unassigned events and find the next one to schedule.
886 * Take events with the least weight first. Return true on success.
887 */
888static bool perf_sched_next_event(struct perf_sched *sched)
889{
890	struct event_constraint *c;
891
892	if (!sched->state.unassigned || !--sched->state.unassigned)
893		return false;
894
895	do {
896		/* next event */
897		sched->state.event++;
898		if (sched->state.event >= sched->max_events) {
899			/* next weight */
900			sched->state.event = 0;
901			sched->state.weight++;
902			if (sched->state.weight > sched->max_weight)
903				return false;
904		}
905		c = sched->constraints[sched->state.event];
906	} while (c->weight != sched->state.weight);
907
908	sched->state.counter = 0;	/* start with first counter */
909
910	return true;
911}
912
913/*
914 * Assign a counter for each event.
915 */
916int perf_assign_events(struct event_constraint **constraints, int n,
917			int wmin, int wmax, int gpmax, int *assign)
918{
919	struct perf_sched sched;
920
921	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
922
923	do {
924		if (!perf_sched_find_counter(&sched))
925			break;	/* failed */
926		if (assign)
927			assign[sched.state.event] = sched.state.counter;
928	} while (perf_sched_next_event(&sched));
929
930	return sched.state.unassigned;
931}
932EXPORT_SYMBOL_GPL(perf_assign_events);
933
934int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
935{
936	struct event_constraint *c;
937	struct perf_event *e;
938	int n0, i, wmin, wmax, unsched = 0;
939	struct hw_perf_event *hwc;
940	u64 used_mask = 0;
941
942	/*
943	 * Compute the number of events already present; see x86_pmu_add(),
944	 * validate_group() and x86_pmu_commit_txn(). For the former two
945	 * cpuc->n_events hasn't been updated yet, while for the latter
946	 * cpuc->n_txn contains the number of events added in the current
947	 * transaction.
948	 */
949	n0 = cpuc->n_events;
950	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
951		n0 -= cpuc->n_txn;
952
953	static_call_cond(x86_pmu_start_scheduling)(cpuc);
954
955	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
956		c = cpuc->event_constraint[i];
957
958		/*
959		 * Previously scheduled events should have a cached constraint,
960		 * while new events should not have one.
961		 */
962		WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));
963
964		/*
965		 * Request constraints for new events; or for those events that
966		 * have a dynamic constraint -- for those the constraint can
967		 * change due to external factors (sibling state, allow_tfa).
968		 */
969		if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
970			c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
971			cpuc->event_constraint[i] = c;
972		}
973
974		wmin = min(wmin, c->weight);
975		wmax = max(wmax, c->weight);
976	}
977
978	/*
979	 * fastpath, try to reuse previous register
980	 */
981	for (i = 0; i < n; i++) {
982		u64 mask;
983
984		hwc = &cpuc->event_list[i]->hw;
985		c = cpuc->event_constraint[i];
986
987		/* never assigned */
988		if (hwc->idx == -1)
989			break;
990
991		/* constraint still honored */
992		if (!test_bit(hwc->idx, c->idxmsk))
993			break;
994
995		mask = BIT_ULL(hwc->idx);
996		if (is_counter_pair(hwc))
997			mask |= mask << 1;
998
999		/* not already used */
1000		if (used_mask & mask)
1001			break;
1002
1003		used_mask |= mask;
1004
1005		if (assign)
1006			assign[i] = hwc->idx;
1007	}
1008
1009	/* slow path */
1010	if (i != n) {
1011		int gpmax = x86_pmu.num_counters;
1012
1013		/*
1014		 * Do not allow scheduling of more than half the available
1015		 * generic counters.
1016		 *
1017		 * This helps avoid counter starvation of sibling thread by
1018		 * ensuring at most half the counters cannot be in exclusive
1019		 * mode. There is no designated counters for the limits. Any
1020		 * N/2 counters can be used. This helps with events with
1021		 * specific counter constraints.
1022		 */
1023		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
1024		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
1025			gpmax /= 2;
1026
1027		/*
1028		 * Reduce the amount of available counters to allow fitting
1029		 * the extra Merge events needed by large increment events.
1030		 */
1031		if (x86_pmu.flags & PMU_FL_PAIR) {
1032			gpmax = x86_pmu.num_counters - cpuc->n_pair;
1033			WARN_ON(gpmax <= 0);
1034		}
1035
1036		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
1037					     wmax, gpmax, assign);
1038	}
1039
1040	/*
1041	 * In case of success (unsched = 0), mark events as committed,
1042	 * so we do not put_constraint() in case new events are added
1043	 * and fail to be scheduled
1044	 *
1045	 * We invoke the lower level commit callback to lock the resource
1046	 *
1047	 * We do not need to do all of this in case we are called to
1048	 * validate an event group (assign == NULL)
1049	 */
1050	if (!unsched && assign) {
1051		for (i = 0; i < n; i++) {
1052			e = cpuc->event_list[i];
1053			static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
1054		}
1055	} else {
1056		for (i = n0; i < n; i++) {
1057			e = cpuc->event_list[i];
1058
1059			/*
1060			 * release events that failed scheduling
1061			 */
1062			static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
1063
1064			cpuc->event_constraint[i] = NULL;
1065		}
1066	}
1067
1068	static_call_cond(x86_pmu_stop_scheduling)(cpuc);
1069
1070	return unsched ? -EINVAL : 0;
1071}
1072
1073static int add_nr_metric_event(struct cpu_hw_events *cpuc,
1074			       struct perf_event *event)
1075{
1076	if (is_metric_event(event)) {
1077		if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
1078			return -EINVAL;
1079		cpuc->n_metric++;
1080		cpuc->n_txn_metric++;
1081	}
1082
1083	return 0;
1084}
1085
1086static void del_nr_metric_event(struct cpu_hw_events *cpuc,
1087				struct perf_event *event)
1088{
1089	if (is_metric_event(event))
1090		cpuc->n_metric--;
1091}
1092
1093static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
1094			 int max_count, int n)
1095{
1096
1097	if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
1098		return -EINVAL;
1099
1100	if (n >= max_count + cpuc->n_metric)
1101		return -EINVAL;
1102
1103	cpuc->event_list[n] = event;
1104	if (is_counter_pair(&event->hw)) {
1105		cpuc->n_pair++;
1106		cpuc->n_txn_pair++;
1107	}
1108
1109	return 0;
1110}
1111
1112/*
1113 * dogrp: true if must collect siblings events (group)
1114 * returns total number of events and error code
1115 */
1116static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1117{
1118	struct perf_event *event;
1119	int n, max_count;
1120
1121	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
1122
1123	/* current number of events already accepted */
1124	n = cpuc->n_events;
1125	if (!cpuc->n_events)
1126		cpuc->pebs_output = 0;
1127
1128	if (!cpuc->is_fake && leader->attr.precise_ip) {
1129		/*
1130		 * For PEBS->PT, if !aux_event, the group leader (PT) went
1131		 * away, the group was broken down and this singleton event
1132		 * can't schedule any more.
1133		 */
1134		if (is_pebs_pt(leader) && !leader->aux_event)
1135			return -EINVAL;
1136
1137		/*
1138		 * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
1139		 */
1140		if (cpuc->pebs_output &&
1141		    cpuc->pebs_output != is_pebs_pt(leader) + 1)
1142			return -EINVAL;
1143
1144		cpuc->pebs_output = is_pebs_pt(leader) + 1;
1145	}
1146
1147	if (is_x86_event(leader)) {
1148		if (collect_event(cpuc, leader, max_count, n))
1149			return -EINVAL;
1150		n++;
1151	}
1152
1153	if (!dogrp)
1154		return n;
1155
1156	for_each_sibling_event(event, leader) {
1157		if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
1158			continue;
1159
1160		if (collect_event(cpuc, event, max_count, n))
1161			return -EINVAL;
1162
1163		n++;
1164	}
1165	return n;
1166}
1167
1168static inline void x86_assign_hw_event(struct perf_event *event,
1169				struct cpu_hw_events *cpuc, int i)
1170{
1171	struct hw_perf_event *hwc = &event->hw;
1172	int idx;
1173
1174	idx = hwc->idx = cpuc->assign[i];
1175	hwc->last_cpu = smp_processor_id();
1176	hwc->last_tag = ++cpuc->tags[i];
1177
1178	switch (hwc->idx) {
1179	case INTEL_PMC_IDX_FIXED_BTS:
1180	case INTEL_PMC_IDX_FIXED_VLBR:
1181		hwc->config_base = 0;
1182		hwc->event_base	= 0;
1183		break;
1184
1185	case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
1186		/* All the metric events are mapped onto the fixed counter 3. */
1187		idx = INTEL_PMC_IDX_FIXED_SLOTS;
1188		/* fall through */
1189	case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
1190		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1191		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
1192				(idx - INTEL_PMC_IDX_FIXED);
1193		hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
1194					INTEL_PMC_FIXED_RDPMC_BASE;
1195		break;
1196
1197	default:
1198		hwc->config_base = x86_pmu_config_addr(hwc->idx);
1199		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
1200		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
1201		break;
1202	}
1203}
1204
1205/**
1206 * x86_perf_rdpmc_index - Return PMC counter used for event
1207 * @event: the perf_event to which the PMC counter was assigned
1208 *
1209 * The counter assigned to this performance event may change if interrupts
1210 * are enabled. This counter should thus never be used while interrupts are
1211 * enabled. Before this function is used to obtain the assigned counter the
1212 * event should be checked for validity using, for example,
1213 * perf_event_read_local(), within the same interrupt disabled section in
1214 * which this counter is planned to be used.
1215 *
1216 * Return: The index of the performance monitoring counter assigned to
1217 * @perf_event.
1218 */
1219int x86_perf_rdpmc_index(struct perf_event *event)
1220{
1221	lockdep_assert_irqs_disabled();
1222
1223	return event->hw.event_base_rdpmc;
1224}
1225
1226static inline int match_prev_assignment(struct hw_perf_event *hwc,
1227					struct cpu_hw_events *cpuc,
1228					int i)
1229{
1230	return hwc->idx == cpuc->assign[i] &&
1231		hwc->last_cpu == smp_processor_id() &&
1232		hwc->last_tag == cpuc->tags[i];
1233}
1234
1235static void x86_pmu_start(struct perf_event *event, int flags);
1236
1237static void x86_pmu_enable(struct pmu *pmu)
1238{
1239	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1240	struct perf_event *event;
1241	struct hw_perf_event *hwc;
1242	int i, added = cpuc->n_added;
1243
1244	if (!x86_pmu_initialized())
1245		return;
1246
1247	if (cpuc->enabled)
1248		return;
1249
1250	if (cpuc->n_added) {
1251		int n_running = cpuc->n_events - cpuc->n_added;
1252		/*
1253		 * apply assignment obtained either from
1254		 * hw_perf_group_sched_in() or x86_pmu_enable()
1255		 *
1256		 * step1: save events moving to new counters
1257		 */
1258		for (i = 0; i < n_running; i++) {
1259			event = cpuc->event_list[i];
1260			hwc = &event->hw;
1261
1262			/*
1263			 * we can avoid reprogramming counter if:
1264			 * - assigned same counter as last time
1265			 * - running on same CPU as last time
1266			 * - no other event has used the counter since
1267			 */
1268			if (hwc->idx == -1 ||
1269			    match_prev_assignment(hwc, cpuc, i))
1270				continue;
1271
1272			/*
1273			 * Ensure we don't accidentally enable a stopped
1274			 * counter simply because we rescheduled.
1275			 */
1276			if (hwc->state & PERF_HES_STOPPED)
1277				hwc->state |= PERF_HES_ARCH;
1278
1279			x86_pmu_stop(event, PERF_EF_UPDATE);
1280		}
1281
1282		/*
1283		 * step2: reprogram moved events into new counters
1284		 */
1285		for (i = 0; i < cpuc->n_events; i++) {
1286			event = cpuc->event_list[i];
1287			hwc = &event->hw;
1288
1289			if (!match_prev_assignment(hwc, cpuc, i))
1290				x86_assign_hw_event(event, cpuc, i);
1291			else if (i < n_running)
1292				continue;
1293
1294			if (hwc->state & PERF_HES_ARCH)
1295				continue;
1296
1297			x86_pmu_start(event, PERF_EF_RELOAD);
1298		}
1299		cpuc->n_added = 0;
1300		perf_events_lapic_init();
1301	}
1302
1303	cpuc->enabled = 1;
1304	barrier();
1305
1306	static_call(x86_pmu_enable_all)(added);
1307}
1308
1309static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1310
1311/*
1312 * Set the next IRQ period, based on the hwc->period_left value.
1313 * To be called with the event disabled in hw:
1314 */
1315int x86_perf_event_set_period(struct perf_event *event)
1316{
1317	struct hw_perf_event *hwc = &event->hw;
1318	s64 left = local64_read(&hwc->period_left);
1319	s64 period = hwc->sample_period;
1320	int ret = 0, idx = hwc->idx;
1321
1322	if (unlikely(!hwc->event_base))
1323		return 0;
1324
1325	if (unlikely(is_topdown_count(event)) &&
1326	    x86_pmu.set_topdown_event_period)
1327		return x86_pmu.set_topdown_event_period(event);
1328
1329	/*
1330	 * If we are way outside a reasonable range then just skip forward:
1331	 */
1332	if (unlikely(left <= -period)) {
1333		left = period;
1334		local64_set(&hwc->period_left, left);
1335		hwc->last_period = period;
1336		ret = 1;
1337	}
1338
1339	if (unlikely(left <= 0)) {
1340		left += period;
1341		local64_set(&hwc->period_left, left);
1342		hwc->last_period = period;
1343		ret = 1;
1344	}
1345	/*
1346	 * Quirk: certain CPUs dont like it if just 1 hw_event is left:
1347	 */
1348	if (unlikely(left < 2))
1349		left = 2;
1350
1351	if (left > x86_pmu.max_period)
1352		left = x86_pmu.max_period;
1353
1354	if (x86_pmu.limit_period)
1355		left = x86_pmu.limit_period(event, left);
1356
1357	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1358
1359	/*
1360	 * The hw event starts counting from this event offset,
1361	 * mark it to be able to extra future deltas:
1362	 */
1363	local64_set(&hwc->prev_count, (u64)-left);
1364
1365	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1366
1367	/*
1368	 * Sign extend the Merge event counter's upper 16 bits since
1369	 * we currently declare a 48-bit counter width
1370	 */
1371	if (is_counter_pair(hwc))
1372		wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff);
1373
1374	/*
1375	 * Due to erratum on certan cpu we need
1376	 * a second write to be sure the register
1377	 * is updated properly
1378	 */
1379	if (x86_pmu.perfctr_second_write) {
1380		wrmsrl(hwc->event_base,
1381			(u64)(-left) & x86_pmu.cntval_mask);
1382	}
1383
1384	perf_event_update_userpage(event);
1385
1386	return ret;
1387}
1388
1389void x86_pmu_enable_event(struct perf_event *event)
1390{
1391	if (__this_cpu_read(cpu_hw_events.enabled))
1392		__x86_pmu_enable_event(&event->hw,
1393				       ARCH_PERFMON_EVENTSEL_ENABLE);
1394}
1395
1396/*
1397 * Add a single event to the PMU.
1398 *
1399 * The event is added to the group of enabled events
1400 * but only if it can be scheduled with existing events.
1401 */
1402static int x86_pmu_add(struct perf_event *event, int flags)
1403{
1404	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1405	struct hw_perf_event *hwc;
1406	int assign[X86_PMC_IDX_MAX];
1407	int n, n0, ret;
1408
1409	hwc = &event->hw;
1410
1411	n0 = cpuc->n_events;
1412	ret = n = collect_events(cpuc, event, false);
1413	if (ret < 0)
1414		goto out;
1415
1416	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1417	if (!(flags & PERF_EF_START))
1418		hwc->state |= PERF_HES_ARCH;
1419
1420	/*
1421	 * If group events scheduling transaction was started,
1422	 * skip the schedulability test here, it will be performed
1423	 * at commit time (->commit_txn) as a whole.
1424	 *
1425	 * If commit fails, we'll call ->del() on all events
1426	 * for which ->add() was called.
1427	 */
1428	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1429		goto done_collect;
1430
1431	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
1432	if (ret)
1433		goto out;
1434	/*
1435	 * copy new assignment, now we know it is possible
1436	 * will be used by hw_perf_enable()
1437	 */
1438	memcpy(cpuc->assign, assign, n*sizeof(int));
1439
1440done_collect:
1441	/*
1442	 * Commit the collect_events() state. See x86_pmu_del() and
1443	 * x86_pmu_*_txn().
1444	 */
1445	cpuc->n_events = n;
1446	cpuc->n_added += n - n0;
1447	cpuc->n_txn += n - n0;
1448
1449	/*
1450	 * This is before x86_pmu_enable() will call x86_pmu_start(),
1451	 * so we enable LBRs before an event needs them etc..
1452	 */
1453	static_call_cond(x86_pmu_add)(event);
1454
1455	ret = 0;
1456out:
1457	return ret;
1458}
1459
1460static void x86_pmu_start(struct perf_event *event, int flags)
1461{
1462	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1463	int idx = event->hw.idx;
1464
1465	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1466		return;
1467
1468	if (WARN_ON_ONCE(idx == -1))
1469		return;
1470
1471	if (flags & PERF_EF_RELOAD) {
1472		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1473		x86_perf_event_set_period(event);
1474	}
1475
1476	event->hw.state = 0;
1477
1478	cpuc->events[idx] = event;
1479	__set_bit(idx, cpuc->active_mask);
1480	__set_bit(idx, cpuc->running);
1481	static_call(x86_pmu_enable)(event);
1482	perf_event_update_userpage(event);
1483}
1484
1485void perf_event_print_debug(void)
1486{
1487	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1488	u64 pebs, debugctl;
1489	struct cpu_hw_events *cpuc;
1490	unsigned long flags;
1491	int cpu, idx;
1492
1493	if (!x86_pmu.num_counters)
1494		return;
1495
1496	local_irq_save(flags);
1497
1498	cpu = smp_processor_id();
1499	cpuc = &per_cpu(cpu_hw_events, cpu);
1500
1501	if (x86_pmu.version >= 2) {
1502		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1503		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1504		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1505		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1506
1507		pr_info("\n");
1508		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
1509		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
1510		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
1511		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1512		if (x86_pmu.pebs_constraints) {
1513			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1514			pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1515		}
1516		if (x86_pmu.lbr_nr) {
1517			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1518			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
1519		}
1520	}
1521	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1522
1523	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1524		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1525		rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1526
1527		prev_left = per_cpu(pmc_prev_left[idx], cpu);
1528
1529		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
1530			cpu, idx, pmc_ctrl);
1531		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
1532			cpu, idx, pmc_count);
1533		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1534			cpu, idx, prev_left);
1535	}
1536	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1537		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1538
1539		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1540			cpu, idx, pmc_count);
1541	}
1542	local_irq_restore(flags);
1543}
1544
1545void x86_pmu_stop(struct perf_event *event, int flags)
1546{
1547	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1548	struct hw_perf_event *hwc = &event->hw;
1549
1550	if (test_bit(hwc->idx, cpuc->active_mask)) {
1551		static_call(x86_pmu_disable)(event);
1552		__clear_bit(hwc->idx, cpuc->active_mask);
1553		cpuc->events[hwc->idx] = NULL;
1554		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1555		hwc->state |= PERF_HES_STOPPED;
1556	}
1557
1558	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1559		/*
1560		 * Drain the remaining delta count out of a event
1561		 * that we are disabling:
1562		 */
1563		x86_perf_event_update(event);
1564		hwc->state |= PERF_HES_UPTODATE;
1565	}
1566}
1567
1568static void x86_pmu_del(struct perf_event *event, int flags)
1569{
1570	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1571	int i;
1572
1573	/*
1574	 * If we're called during a txn, we only need to undo x86_pmu.add.
1575	 * The events never got scheduled and ->cancel_txn will truncate
1576	 * the event_list.
1577	 *
1578	 * XXX assumes any ->del() called during a TXN will only be on
1579	 * an event added during that same TXN.
1580	 */
1581	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1582		goto do_del;
1583
1584	/*
1585	 * Not a TXN, therefore cleanup properly.
1586	 */
1587	x86_pmu_stop(event, PERF_EF_UPDATE);
1588
1589	for (i = 0; i < cpuc->n_events; i++) {
1590		if (event == cpuc->event_list[i])
1591			break;
1592	}
1593
1594	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
1595		return;
1596
1597	/* If we have a newly added event; make sure to decrease n_added. */
1598	if (i >= cpuc->n_events - cpuc->n_added)
1599		--cpuc->n_added;
1600
1601	static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
1602
1603	/* Delete the array entry. */
1604	while (++i < cpuc->n_events) {
1605		cpuc->event_list[i-1] = cpuc->event_list[i];
1606		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
1607	}
1608	cpuc->event_constraint[i-1] = NULL;
1609	--cpuc->n_events;
1610	if (x86_pmu.intel_cap.perf_metrics)
1611		del_nr_metric_event(cpuc, event);
1612
1613	perf_event_update_userpage(event);
1614
1615do_del:
1616
1617	/*
1618	 * This is after x86_pmu_stop(); so we disable LBRs after any
1619	 * event can need them etc..
1620	 */
1621	static_call_cond(x86_pmu_del)(event);
1622}
1623
1624int x86_pmu_handle_irq(struct pt_regs *regs)
1625{
1626	struct perf_sample_data data;
1627	struct cpu_hw_events *cpuc;
1628	struct perf_event *event;
1629	int idx, handled = 0;
1630	u64 val;
1631
1632	cpuc = this_cpu_ptr(&cpu_hw_events);
1633
1634	/*
1635	 * Some chipsets need to unmask the LVTPC in a particular spot
1636	 * inside the nmi handler.  As a result, the unmasking was pushed
1637	 * into all the nmi handlers.
1638	 *
1639	 * This generic handler doesn't seem to have any issues where the
1640	 * unmasking occurs so it was left at the top.
1641	 */
1642	apic_write(APIC_LVTPC, APIC_DM_NMI);
1643
1644	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1645		if (!test_bit(idx, cpuc->active_mask))
1646			continue;
1647
1648		event = cpuc->events[idx];
1649
1650		val = x86_perf_event_update(event);
1651		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1652			continue;
1653
1654		/*
1655		 * event overflow
1656		 */
1657		handled++;
1658		perf_sample_data_init(&data, 0, event->hw.last_period);
1659
1660		if (!x86_perf_event_set_period(event))
1661			continue;
1662
1663		if (perf_event_overflow(event, &data, regs))
1664			x86_pmu_stop(event, 0);
1665	}
1666
1667	if (handled)
1668		inc_irq_stat(apic_perf_irqs);
1669
1670	return handled;
1671}
1672
1673void perf_events_lapic_init(void)
1674{
1675	if (!x86_pmu.apic || !x86_pmu_initialized())
1676		return;
1677
1678	/*
1679	 * Always use NMI for PMU
1680	 */
1681	apic_write(APIC_LVTPC, APIC_DM_NMI);
1682}
1683
1684static int
1685perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1686{
1687	u64 start_clock;
1688	u64 finish_clock;
1689	int ret;
1690
1691	/*
1692	 * All PMUs/events that share this PMI handler should make sure to
1693	 * increment active_events for their events.
1694	 */
1695	if (!atomic_read(&active_events))
1696		return NMI_DONE;
1697
1698	start_clock = sched_clock();
1699	ret = static_call(x86_pmu_handle_irq)(regs);
1700	finish_clock = sched_clock();
1701
1702	perf_sample_event_took(finish_clock - start_clock);
1703
1704	return ret;
1705}
1706NOKPROBE_SYMBOL(perf_event_nmi_handler);
1707
1708struct event_constraint emptyconstraint;
1709struct event_constraint unconstrained;
1710
1711static int x86_pmu_prepare_cpu(unsigned int cpu)
1712{
1713	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1714	int i;
1715
1716	for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
1717		cpuc->kfree_on_online[i] = NULL;
1718	if (x86_pmu.cpu_prepare)
1719		return x86_pmu.cpu_prepare(cpu);
1720	return 0;
1721}
1722
1723static int x86_pmu_dead_cpu(unsigned int cpu)
1724{
1725	if (x86_pmu.cpu_dead)
1726		x86_pmu.cpu_dead(cpu);
1727	return 0;
1728}
1729
1730static int x86_pmu_online_cpu(unsigned int cpu)
1731{
1732	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1733	int i;
1734
1735	for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
1736		kfree(cpuc->kfree_on_online[i]);
1737		cpuc->kfree_on_online[i] = NULL;
1738	}
1739	return 0;
1740}
1741
1742static int x86_pmu_starting_cpu(unsigned int cpu)
1743{
1744	if (x86_pmu.cpu_starting)
1745		x86_pmu.cpu_starting(cpu);
1746	return 0;
1747}
1748
1749static int x86_pmu_dying_cpu(unsigned int cpu)
1750{
1751	if (x86_pmu.cpu_dying)
1752		x86_pmu.cpu_dying(cpu);
1753	return 0;
1754}
1755
1756static void __init pmu_check_apic(void)
1757{
1758	if (boot_cpu_has(X86_FEATURE_APIC))
1759		return;
1760
1761	x86_pmu.apic = 0;
1762	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1763	pr_info("no hardware sampling interrupt available.\n");
1764
1765	/*
1766	 * If we have a PMU initialized but no APIC
1767	 * interrupts, we cannot sample hardware
1768	 * events (user-space has to fall back and
1769	 * sample via a hrtimer based software event):
1770	 */
1771	pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1772
1773}
1774
1775static struct attribute_group x86_pmu_format_group __ro_after_init = {
1776	.name = "format",
1777	.attrs = NULL,
1778};
1779
1780ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
1781{
1782	struct perf_pmu_events_attr *pmu_attr =
1783		container_of(attr, struct perf_pmu_events_attr, attr);
1784	u64 config = 0;
1785
1786	if (pmu_attr->id < x86_pmu.max_events)
1787		config = x86_pmu.event_map(pmu_attr->id);
1788
1789	/* string trumps id */
1790	if (pmu_attr->event_str)
1791		return sprintf(page, "%s", pmu_attr->event_str);
1792
1793	return x86_pmu.events_sysfs_show(page, config);
1794}
1795EXPORT_SYMBOL_GPL(events_sysfs_show);
1796
1797ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
1798			  char *page)
1799{
1800	struct perf_pmu_events_ht_attr *pmu_attr =
1801		container_of(attr, struct perf_pmu_events_ht_attr, attr);
1802
1803	/*
1804	 * Report conditional events depending on Hyper-Threading.
1805	 *
1806	 * This is overly conservative as usually the HT special
1807	 * handling is not needed if the other CPU thread is idle.
1808	 *
1809	 * Note this does not (and cannot) handle the case when thread
1810	 * siblings are invisible, for example with virtualization
1811	 * if they are owned by some other guest.  The user tool
1812	 * has to re-read when a thread sibling gets onlined later.
1813	 */
1814	return sprintf(page, "%s",
1815			topology_max_smt_threads() > 1 ?
1816			pmu_attr->event_str_ht :
1817			pmu_attr->event_str_noht);
1818}
1819
1820EVENT_ATTR(cpu-cycles,			CPU_CYCLES		);
1821EVENT_ATTR(instructions,		INSTRUCTIONS		);
1822EVENT_ATTR(cache-references,		CACHE_REFERENCES	);
1823EVENT_ATTR(cache-misses, 		CACHE_MISSES		);
1824EVENT_ATTR(branch-instructions,		BRANCH_INSTRUCTIONS	);
1825EVENT_ATTR(branch-misses,		BRANCH_MISSES		);
1826EVENT_ATTR(bus-cycles,			BUS_CYCLES		);
1827EVENT_ATTR(stalled-cycles-frontend,	STALLED_CYCLES_FRONTEND	);
1828EVENT_ATTR(stalled-cycles-backend,	STALLED_CYCLES_BACKEND	);
1829EVENT_ATTR(ref-cycles,			REF_CPU_CYCLES		);
1830
1831static struct attribute *empty_attrs;
1832
1833static struct attribute *events_attr[] = {
1834	EVENT_PTR(CPU_CYCLES),
1835	EVENT_PTR(INSTRUCTIONS),
1836	EVENT_PTR(CACHE_REFERENCES),
1837	EVENT_PTR(CACHE_MISSES),
1838	EVENT_PTR(BRANCH_INSTRUCTIONS),
1839	EVENT_PTR(BRANCH_MISSES),
1840	EVENT_PTR(BUS_CYCLES),
1841	EVENT_PTR(STALLED_CYCLES_FRONTEND),
1842	EVENT_PTR(STALLED_CYCLES_BACKEND),
1843	EVENT_PTR(REF_CPU_CYCLES),
1844	NULL,
1845};
1846
1847/*
1848 * Remove all undefined events (x86_pmu.event_map(id) == 0)
1849 * out of events_attr attributes.
1850 */
1851static umode_t
1852is_visible(struct kobject *kobj, struct attribute *attr, int idx)
1853{
1854	struct perf_pmu_events_attr *pmu_attr;
1855
1856	if (idx >= x86_pmu.max_events)
1857		return 0;
1858
1859	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
1860	/* str trumps id */
1861	return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
1862}
1863
1864static struct attribute_group x86_pmu_events_group __ro_after_init = {
1865	.name = "events",
1866	.attrs = events_attr,
1867	.is_visible = is_visible,
1868};
1869
1870ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1871{
1872	u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1873	u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1874	bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1875	bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1876	bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
1877	bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
1878	ssize_t ret;
1879
1880	/*
1881	* We have whole page size to spend and just little data
1882	* to write, so we can safely use sprintf.
1883	*/
1884	ret = sprintf(page, "event=0x%02llx", event);
1885
1886	if (umask)
1887		ret += sprintf(page + ret, ",umask=0x%02llx", umask);
1888
1889	if (edge)
1890		ret += sprintf(page + ret, ",edge");
1891
1892	if (pc)
1893		ret += sprintf(page + ret, ",pc");
1894
1895	if (any)
1896		ret += sprintf(page + ret, ",any");
1897
1898	if (inv)
1899		ret += sprintf(page + ret, ",inv");
1900
1901	if (cmask)
1902		ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
1903
1904	ret += sprintf(page + ret, "\n");
1905
1906	return ret;
1907}
1908
1909static struct attribute_group x86_pmu_attr_group;
1910static struct attribute_group x86_pmu_caps_group;
1911
1912static void x86_pmu_static_call_update(void)
1913{
1914	static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
1915	static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
1916	static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
1917	static_call_update(x86_pmu_enable, x86_pmu.enable);
1918	static_call_update(x86_pmu_disable, x86_pmu.disable);
1919
1920	static_call_update(x86_pmu_add, x86_pmu.add);
1921	static_call_update(x86_pmu_del, x86_pmu.del);
1922	static_call_update(x86_pmu_read, x86_pmu.read);
1923
1924	static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
1925	static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
1926	static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
1927
1928	static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
1929	static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
1930	static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
1931
1932	static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
1933	static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);
1934
1935	static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
1936	static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
1937}
1938
1939static void _x86_pmu_read(struct perf_event *event)
1940{
1941	x86_perf_event_update(event);
1942}
1943
1944static int __init init_hw_perf_events(void)
1945{
1946	struct x86_pmu_quirk *quirk;
1947	int err;
1948
1949	pr_info("Performance Events: ");
1950
1951	switch (boot_cpu_data.x86_vendor) {
1952	case X86_VENDOR_INTEL:
1953		err = intel_pmu_init();
1954		break;
1955	case X86_VENDOR_AMD:
1956		err = amd_pmu_init();
1957		break;
1958	case X86_VENDOR_HYGON:
1959		err = amd_pmu_init();
1960		x86_pmu.name = "HYGON";
1961		break;
1962	case X86_VENDOR_ZHAOXIN:
1963	case X86_VENDOR_CENTAUR:
1964		err = zhaoxin_pmu_init();
1965		break;
1966	default:
1967		err = -ENOTSUPP;
1968	}
1969	if (err != 0) {
1970		pr_cont("no PMU driver, software events only.\n");
1971		return 0;
1972	}
1973
1974	pmu_check_apic();
1975
1976	/* sanity check that the hardware exists or is emulated */
1977	if (!check_hw_exists())
1978		return 0;
1979
1980	pr_cont("%s PMU driver.\n", x86_pmu.name);
1981
1982	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1983
1984	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1985		quirk->func();
1986
1987	if (!x86_pmu.intel_ctrl)
1988		x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1989
1990	perf_events_lapic_init();
1991	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
1992
1993	unconstrained = (struct event_constraint)
1994		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1995				   0, x86_pmu.num_counters, 0, 0);
1996
1997	x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1998
1999	if (!x86_pmu.events_sysfs_show)
2000		x86_pmu_events_group.attrs = &empty_attrs;
2001
2002	pmu.attr_update = x86_pmu.attr_update;
2003
2004	pr_info("... version:                %d\n",     x86_pmu.version);
2005	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
2006	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
2007	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
2008	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
2009	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
2010	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
2011
2012	if (!x86_pmu.read)
2013		x86_pmu.read = _x86_pmu_read;
2014
2015	x86_pmu_static_call_update();
2016
2017	/*
2018	 * Install callbacks. Core will call them for each online
2019	 * cpu.
2020	 */
2021	err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
2022				x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
2023	if (err)
2024		return err;
2025
2026	err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
2027				"perf/x86:starting", x86_pmu_starting_cpu,
2028				x86_pmu_dying_cpu);
2029	if (err)
2030		goto out;
2031
2032	err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
2033				x86_pmu_online_cpu, NULL);
2034	if (err)
2035		goto out1;
2036
2037	err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
2038	if (err)
2039		goto out2;
2040
2041	return 0;
2042
2043out2:
2044	cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
2045out1:
2046	cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
2047out:
2048	cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
2049	return err;
2050}
2051early_initcall(init_hw_perf_events);
2052
2053static void x86_pmu_read(struct perf_event *event)
2054{
2055	static_call(x86_pmu_read)(event);
2056}
2057
2058/*
2059 * Start group events scheduling transaction
2060 * Set the flag to make pmu::enable() not perform the
2061 * schedulability test, it will be performed at commit time
2062 *
2063 * We only support PERF_PMU_TXN_ADD transactions. Save the
2064 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
2065 * transactions.
2066 */
2067static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
2068{
2069	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2070
2071	WARN_ON_ONCE(cpuc->txn_flags);		/* txn already in flight */
2072
2073	cpuc->txn_flags = txn_flags;
2074	if (txn_flags & ~PERF_PMU_TXN_ADD)
2075		return;
2076
2077	perf_pmu_disable(pmu);
2078	__this_cpu_write(cpu_hw_events.n_txn, 0);
2079	__this_cpu_write(cpu_hw_events.n_txn_pair, 0);
2080	__this_cpu_write(cpu_hw_events.n_txn_metric, 0);
2081}
2082
2083/*
2084 * Stop group events scheduling transaction
2085 * Clear the flag and pmu::enable() will perform the
2086 * schedulability test.
2087 */
2088static void x86_pmu_cancel_txn(struct pmu *pmu)
2089{
2090	unsigned int txn_flags;
2091	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2092
2093	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */
2094
2095	txn_flags = cpuc->txn_flags;
2096	cpuc->txn_flags = 0;
2097	if (txn_flags & ~PERF_PMU_TXN_ADD)
2098		return;
2099
2100	/*
2101	 * Truncate collected array by the number of events added in this
2102	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
2103	 */
2104	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
2105	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
2106	__this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
2107	__this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
2108	perf_pmu_enable(pmu);
2109}
2110
2111/*
2112 * Commit group events scheduling transaction
2113 * Perform the group schedulability test as a whole
2114 * Return 0 if success
2115 *
2116 * Does not cancel the transaction on failure; expects the caller to do this.
2117 */
2118static int x86_pmu_commit_txn(struct pmu *pmu)
2119{
2120	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2121	int assign[X86_PMC_IDX_MAX];
2122	int n, ret;
2123
2124	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */
2125
2126	if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
2127		cpuc->txn_flags = 0;
2128		return 0;
2129	}
2130
2131	n = cpuc->n_events;
2132
2133	if (!x86_pmu_initialized())
2134		return -EAGAIN;
2135
2136	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
2137	if (ret)
2138		return ret;
2139
2140	/*
2141	 * copy new assignment, now we know it is possible
2142	 * will be used by hw_perf_enable()
2143	 */
2144	memcpy(cpuc->assign, assign, n*sizeof(int));
2145
2146	cpuc->txn_flags = 0;
2147	perf_pmu_enable(pmu);
2148	return 0;
2149}
2150/*
2151 * a fake_cpuc is used to validate event groups. Due to
2152 * the extra reg logic, we need to also allocate a fake
2153 * per_core and per_cpu structure. Otherwise, group events
2154 * using extra reg may conflict without the kernel being
2155 * able to catch this when the last event gets added to
2156 * the group.
2157 */
2158static void free_fake_cpuc(struct cpu_hw_events *cpuc)
2159{
2160	intel_cpuc_finish(cpuc);
2161	kfree(cpuc);
2162}
2163
2164static struct cpu_hw_events *allocate_fake_cpuc(void)
2165{
2166	struct cpu_hw_events *cpuc;
2167	int cpu = raw_smp_processor_id();
2168
2169	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
2170	if (!cpuc)
2171		return ERR_PTR(-ENOMEM);
2172	cpuc->is_fake = 1;
2173
2174	if (intel_cpuc_prepare(cpuc, cpu))
2175		goto error;
2176
2177	return cpuc;
2178error:
2179	free_fake_cpuc(cpuc);
2180	return ERR_PTR(-ENOMEM);
2181}
2182
2183/*
2184 * validate that we can schedule this event
2185 */
2186static int validate_event(struct perf_event *event)
2187{
2188	struct cpu_hw_events *fake_cpuc;
2189	struct event_constraint *c;
2190	int ret = 0;
2191
2192	fake_cpuc = allocate_fake_cpuc();
2193	if (IS_ERR(fake_cpuc))
2194		return PTR_ERR(fake_cpuc);
2195
2196	c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);
2197
2198	if (!c || !c->weight)
2199		ret = -EINVAL;
2200
2201	if (x86_pmu.put_event_constraints)
2202		x86_pmu.put_event_constraints(fake_cpuc, event);
2203
2204	free_fake_cpuc(fake_cpuc);
2205
2206	return ret;
2207}
2208
2209/*
2210 * validate a single event group
2211 *
2212 * validation include:
2213 *	- check events are compatible which each other
2214 *	- events do not compete for the same counter
2215 *	- number of events <= number of counters
2216 *
2217 * validation ensures the group can be loaded onto the
2218 * PMU if it was the only group available.
2219 */
2220static int validate_group(struct perf_event *event)
2221{
2222	struct perf_event *leader = event->group_leader;
2223	struct cpu_hw_events *fake_cpuc;
2224	int ret = -EINVAL, n;
2225
2226	fake_cpuc = allocate_fake_cpuc();
2227	if (IS_ERR(fake_cpuc))
2228		return PTR_ERR(fake_cpuc);
2229	/*
2230	 * the event is not yet connected with its
2231	 * siblings therefore we must first collect
2232	 * existing siblings, then add the new event
2233	 * before we can simulate the scheduling
2234	 */
2235	n = collect_events(fake_cpuc, leader, true);
2236	if (n < 0)
2237		goto out;
2238
2239	fake_cpuc->n_events = n;
2240	n = collect_events(fake_cpuc, event, false);
2241	if (n < 0)
2242		goto out;
2243
2244	fake_cpuc->n_events = 0;
2245	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
2246
2247out:
2248	free_fake_cpuc(fake_cpuc);
2249	return ret;
2250}
2251
2252static int x86_pmu_event_init(struct perf_event *event)
2253{
2254	struct pmu *tmp;
2255	int err;
2256
2257	switch (event->attr.type) {
2258	case PERF_TYPE_RAW:
2259	case PERF_TYPE_HARDWARE:
2260	case PERF_TYPE_HW_CACHE:
2261		break;
2262
2263	default:
2264		return -ENOENT;
2265	}
2266
2267	err = __x86_pmu_event_init(event);
2268	if (!err) {
2269		/*
2270		 * we temporarily connect event to its pmu
2271		 * such that validate_group() can classify
2272		 * it as an x86 event using is_x86_event()
2273		 */
2274		tmp = event->pmu;
2275		event->pmu = &pmu;
2276
2277		if (event->group_leader != event)
2278			err = validate_group(event);
2279		else
2280			err = validate_event(event);
2281
2282		event->pmu = tmp;
2283	}
2284	if (err) {
2285		if (event->destroy)
2286			event->destroy(event);
2287		event->destroy = NULL;
2288	}
2289
2290	if (READ_ONCE(x86_pmu.attr_rdpmc) &&
2291	    !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
2292		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
2293
2294	return err;
2295}
2296
2297static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
2298{
2299	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2300		return;
2301
2302	/*
2303	 * This function relies on not being called concurrently in two
2304	 * tasks in the same mm.  Otherwise one task could observe
2305	 * perf_rdpmc_allowed > 1 and return all the way back to
2306	 * userspace with CR4.PCE clear while another task is still
2307	 * doing on_each_cpu_mask() to propagate CR4.PCE.
2308	 *
2309	 * For now, this can't happen because all callers hold mmap_lock
2310	 * for write.  If this changes, we'll need a different solution.
2311	 */
2312	mmap_assert_write_locked(mm);
2313
2314	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
2315		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
2316}
2317
2318static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
2319{
2320
2321	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2322		return;
2323
2324	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
2325		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
2326}
2327
2328static int x86_pmu_event_idx(struct perf_event *event)
2329{
2330	struct hw_perf_event *hwc = &event->hw;
2331
2332	if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2333		return 0;
2334
2335	if (is_metric_idx(hwc->idx))
2336		return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
2337	else
2338		return hwc->event_base_rdpmc + 1;
2339}
2340
2341static ssize_t get_attr_rdpmc(struct device *cdev,
2342			      struct device_attribute *attr,
2343			      char *buf)
2344{
2345	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
2346}
2347
2348static ssize_t set_attr_rdpmc(struct device *cdev,
2349			      struct device_attribute *attr,
2350			      const char *buf, size_t count)
2351{
2352	unsigned long val;
2353	ssize_t ret;
2354
2355	ret = kstrtoul(buf, 0, &val);
2356	if (ret)
2357		return ret;
2358
2359	if (val > 2)
2360		return -EINVAL;
2361
2362	if (x86_pmu.attr_rdpmc_broken)
2363		return -ENOTSUPP;
2364
2365	if (val != x86_pmu.attr_rdpmc) {
2366		/*
2367		 * Changing into or out of never available or always available,
2368		 * aka perf-event-bypassing mode. This path is extremely slow,
2369		 * but only root can trigger it, so it's okay.
2370		 */
2371		if (val == 0)
2372			static_branch_inc(&rdpmc_never_available_key);
2373		else if (x86_pmu.attr_rdpmc == 0)
2374			static_branch_dec(&rdpmc_never_available_key);
2375
2376		if (val == 2)
2377			static_branch_inc(&rdpmc_always_available_key);
2378		else if (x86_pmu.attr_rdpmc == 2)
2379			static_branch_dec(&rdpmc_always_available_key);
2380
2381		on_each_cpu(cr4_update_pce, NULL, 1);
2382		x86_pmu.attr_rdpmc = val;
2383	}
2384
2385	return count;
2386}
2387
2388static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
2389
2390static struct attribute *x86_pmu_attrs[] = {
2391	&dev_attr_rdpmc.attr,
2392	NULL,
2393};
2394
2395static struct attribute_group x86_pmu_attr_group __ro_after_init = {
2396	.attrs = x86_pmu_attrs,
2397};
2398
2399static ssize_t max_precise_show(struct device *cdev,
2400				  struct device_attribute *attr,
2401				  char *buf)
2402{
2403	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
2404}
2405
2406static DEVICE_ATTR_RO(max_precise);
2407
2408static struct attribute *x86_pmu_caps_attrs[] = {
2409	&dev_attr_max_precise.attr,
2410	NULL
2411};
2412
2413static struct attribute_group x86_pmu_caps_group __ro_after_init = {
2414	.name = "caps",
2415	.attrs = x86_pmu_caps_attrs,
2416};
2417
2418static const struct attribute_group *x86_pmu_attr_groups[] = {
2419	&x86_pmu_attr_group,
2420	&x86_pmu_format_group,
2421	&x86_pmu_events_group,
2422	&x86_pmu_caps_group,
2423	NULL,
2424};
2425
2426static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
2427{
2428	static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
2429}
2430
2431static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
2432				  struct perf_event_context *next)
2433{
2434	static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
2435}
2436
2437void perf_check_microcode(void)
2438{
2439	if (x86_pmu.check_microcode)
2440		x86_pmu.check_microcode();
2441}
2442
2443static int x86_pmu_check_period(struct perf_event *event, u64 value)
2444{
2445	if (x86_pmu.check_period && x86_pmu.check_period(event, value))
2446		return -EINVAL;
2447
2448	if (value && x86_pmu.limit_period) {
2449		if (x86_pmu.limit_period(event, value) > value)
2450			return -EINVAL;
2451	}
2452
2453	return 0;
2454}
2455
2456static int x86_pmu_aux_output_match(struct perf_event *event)
2457{
2458	if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
2459		return 0;
2460
2461	if (x86_pmu.aux_output_match)
2462		return x86_pmu.aux_output_match(event);
2463
2464	return 0;
2465}
2466
2467static struct pmu pmu = {
2468	.pmu_enable		= x86_pmu_enable,
2469	.pmu_disable		= x86_pmu_disable,
2470
2471	.attr_groups		= x86_pmu_attr_groups,
2472
2473	.event_init		= x86_pmu_event_init,
2474
2475	.event_mapped		= x86_pmu_event_mapped,
2476	.event_unmapped		= x86_pmu_event_unmapped,
2477
2478	.add			= x86_pmu_add,
2479	.del			= x86_pmu_del,
2480	.start			= x86_pmu_start,
2481	.stop			= x86_pmu_stop,
2482	.read			= x86_pmu_read,
2483
2484	.start_txn		= x86_pmu_start_txn,
2485	.cancel_txn		= x86_pmu_cancel_txn,
2486	.commit_txn		= x86_pmu_commit_txn,
2487
2488	.event_idx		= x86_pmu_event_idx,
2489	.sched_task		= x86_pmu_sched_task,
2490	.swap_task_ctx		= x86_pmu_swap_task_ctx,
2491	.check_period		= x86_pmu_check_period,
2492
2493	.aux_output_match	= x86_pmu_aux_output_match,
2494};
2495
2496void arch_perf_update_userpage(struct perf_event *event,
2497			       struct perf_event_mmap_page *userpg, u64 now)
2498{
2499	struct cyc2ns_data data;
2500	u64 offset;
2501
2502	userpg->cap_user_time = 0;
2503	userpg->cap_user_time_zero = 0;
2504	userpg->cap_user_rdpmc =
2505		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
2506	userpg->pmc_width = x86_pmu.cntval_bits;
2507
2508	if (!using_native_sched_clock() || !sched_clock_stable())
2509		return;
2510
2511	cyc2ns_read_begin(&data);
2512
2513	offset = data.cyc2ns_offset + __sched_clock_offset;
2514
2515	/*
2516	 * Internal timekeeping for enabled/running/stopped times
2517	 * is always in the local_clock domain.
2518	 */
2519	userpg->cap_user_time = 1;
2520	userpg->time_mult = data.cyc2ns_mul;
2521	userpg->time_shift = data.cyc2ns_shift;
2522	userpg->time_offset = offset - now;
2523
2524	/*
2525	 * cap_user_time_zero doesn't make sense when we're using a different
2526	 * time base for the records.
2527	 */
2528	if (!event->attr.use_clockid) {
2529		userpg->cap_user_time_zero = 1;
2530		userpg->time_zero = offset;
2531	}
2532
2533	cyc2ns_read_end();
2534}
2535
2536/*
2537 * Determine whether the regs were taken from an irq/exception handler rather
2538 * than from perf_arch_fetch_caller_regs().
2539 */
2540static bool perf_hw_regs(struct pt_regs *regs)
2541{
2542	return regs->flags & X86_EFLAGS_FIXED;
2543}
2544
2545void
2546perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2547{
2548	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
2549	struct unwind_state state;
2550	unsigned long addr;
2551
2552	if (guest_cbs && guest_cbs->is_in_guest()) {
2553		/* TODO: We don't support guest os callchain now */
2554		return;
2555	}
2556
2557	if (perf_callchain_store(entry, regs->ip))
2558		return;
2559
2560	if (perf_hw_regs(regs))
2561		unwind_start(&state, current, regs, NULL);
2562	else
2563		unwind_start(&state, current, NULL, (void *)regs->sp);
2564
2565	for (; !unwind_done(&state); unwind_next_frame(&state)) {
2566		addr = unwind_get_return_address(&state);
2567		if (!addr || perf_callchain_store(entry, addr))
2568			return;
2569	}
2570}
2571
2572static inline int
2573valid_user_frame(const void __user *fp, unsigned long size)
2574{
2575	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
2576}
2577
2578static unsigned long get_segment_base(unsigned int segment)
2579{
2580	struct desc_struct *desc;
2581	unsigned int idx = segment >> 3;
2582
2583	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2584#ifdef CONFIG_MODIFY_LDT_SYSCALL
2585		struct ldt_struct *ldt;
2586
2587		/* IRQs are off, so this synchronizes with smp_store_release */
2588		ldt = READ_ONCE(current->active_mm->context.ldt);
2589		if (!ldt || idx >= ldt->nr_entries)
2590			return 0;
2591
2592		desc = &ldt->entries[idx];
2593#else
2594		return 0;
2595#endif
2596	} else {
2597		if (idx >= GDT_ENTRIES)
2598			return 0;
2599
2600		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
2601	}
2602
2603	return get_desc_base(desc);
2604}
2605
2606#ifdef CONFIG_IA32_EMULATION
2607
2608#include <linux/compat.h>
2609
2610static inline int
2611perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2612{
2613	/* 32-bit process in 64-bit kernel. */
2614	unsigned long ss_base, cs_base;
2615	struct stack_frame_ia32 frame;
2616	const struct stack_frame_ia32 __user *fp;
2617
2618	if (!test_thread_flag(TIF_IA32))
2619		return 0;
2620
2621	cs_base = get_segment_base(regs->cs);
2622	ss_base = get_segment_base(regs->ss);
2623
2624	fp = compat_ptr(ss_base + regs->bp);
2625	pagefault_disable();
2626	while (entry->nr < entry->max_stack) {
2627		if (!valid_user_frame(fp, sizeof(frame)))
2628			break;
2629
2630		if (__get_user(frame.next_frame, &fp->next_frame))
2631			break;
2632		if (__get_user(frame.return_address, &fp->return_address))
2633			break;
2634
2635		perf_callchain_store(entry, cs_base + frame.return_address);
2636		fp = compat_ptr(ss_base + frame.next_frame);
2637	}
2638	pagefault_enable();
2639	return 1;
2640}
2641#else
2642static inline int
2643perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2644{
2645    return 0;
2646}
2647#endif
2648
2649void
2650perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2651{
2652	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
2653	struct stack_frame frame;
2654	const struct stack_frame __user *fp;
2655
2656	if (guest_cbs && guest_cbs->is_in_guest()) {
2657		/* TODO: We don't support guest os callchain now */
2658		return;
2659	}
2660
2661	/*
2662	 * We don't know what to do with VM86 stacks.. ignore them for now.
2663	 */
2664	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
2665		return;
2666
2667	fp = (void __user *)regs->bp;
2668
2669	perf_callchain_store(entry, regs->ip);
2670
2671	if (!nmi_uaccess_okay())
2672		return;
2673
2674	if (perf_callchain_user32(regs, entry))
2675		return;
2676
2677	pagefault_disable();
2678	while (entry->nr < entry->max_stack) {
2679		if (!valid_user_frame(fp, sizeof(frame)))
2680			break;
2681
2682		if (__get_user(frame.next_frame, &fp->next_frame))
2683			break;
2684		if (__get_user(frame.return_address, &fp->return_address))
2685			break;
2686
2687		perf_callchain_store(entry, frame.return_address);
2688		fp = (void __user *)frame.next_frame;
2689	}
2690	pagefault_enable();
2691}
2692
2693/*
2694 * Deal with code segment offsets for the various execution modes:
2695 *
2696 *   VM86 - the good olde 16 bit days, where the linear address is
2697 *          20 bits and we use regs->ip + 0x10 * regs->cs.
2698 *
2699 *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
2700 *          to figure out what the 32bit base address is.
2701 *
2702 *    X32 - has TIF_X32 set, but is running in x86_64
2703 *
2704 * X86_64 - CS,DS,SS,ES are all zero based.
2705 */
2706static unsigned long code_segment_base(struct pt_regs *regs)
2707{
2708	/*
2709	 * For IA32 we look at the GDT/LDT segment base to convert the
2710	 * effective IP to a linear address.
2711	 */
2712
2713#ifdef CONFIG_X86_32
2714	/*
2715	 * If we are in VM86 mode, add the segment offset to convert to a
2716	 * linear address.
2717	 */
2718	if (regs->flags & X86_VM_MASK)
2719		return 0x10 * regs->cs;
2720
2721	if (user_mode(regs) && regs->cs != __USER_CS)
2722		return get_segment_base(regs->cs);
2723#else
2724	if (user_mode(regs) && !user_64bit_mode(regs) &&
2725	    regs->cs != __USER32_CS)
2726		return get_segment_base(regs->cs);
2727#endif
2728	return 0;
2729}
2730
2731unsigned long perf_instruction_pointer(struct pt_regs *regs)
2732{
2733	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
2734
2735	if (guest_cbs && guest_cbs->is_in_guest())
2736		return guest_cbs->get_guest_ip();
2737
2738	return regs->ip + code_segment_base(regs);
2739}
2740
2741unsigned long perf_misc_flags(struct pt_regs *regs)
2742{
2743	struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
2744	int misc = 0;
2745
2746	if (guest_cbs && guest_cbs->is_in_guest()) {
2747		if (guest_cbs->is_user_mode())
2748			misc |= PERF_RECORD_MISC_GUEST_USER;
2749		else
2750			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
2751	} else {
2752		if (user_mode(regs))
2753			misc |= PERF_RECORD_MISC_USER;
2754		else
2755			misc |= PERF_RECORD_MISC_KERNEL;
2756	}
2757
2758	if (regs->flags & PERF_EFLAGS_EXACT)
2759		misc |= PERF_RECORD_MISC_EXACT_IP;
2760
2761	return misc;
2762}
2763
2764void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
2765{
2766	cap->version		= x86_pmu.version;
2767	cap->num_counters_gp	= x86_pmu.num_counters;
2768	cap->num_counters_fixed	= x86_pmu.num_counters_fixed;
2769	cap->bit_width_gp	= x86_pmu.cntval_bits;
2770	cap->bit_width_fixed	= x86_pmu.cntval_bits;
2771	cap->events_mask	= (unsigned int)x86_pmu.events_maskl;
2772	cap->events_mask_len	= x86_pmu.events_mask_len;
2773}
2774EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
2775