xref: /kernel/linux/linux-6.6/arch/x86/events/amd/ibs.c (revision 62306a36)
1/*
2 * Performance events - AMD IBS
3 *
4 *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
5 *
6 *  For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/perf_event.h>
10#include <linux/init.h>
11#include <linux/export.h>
12#include <linux/pci.h>
13#include <linux/ptrace.h>
14#include <linux/syscore_ops.h>
15#include <linux/sched/clock.h>
16
17#include <asm/apic.h>
18
19#include "../perf_event.h"
20
21static u32 ibs_caps;
22
23#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
24
25#include <linux/kprobes.h>
26#include <linux/hardirq.h>
27
28#include <asm/nmi.h>
29#include <asm/amd-ibs.h>
30
31#define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
32#define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT
33
34
35/*
36 * IBS states:
37 *
38 * ENABLED; tracks the pmu::add(), pmu::del() state; when set, the counter is taken
39 * and any further add()s must fail.
40 *
41 * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
42 * complicated by the fact that the IBS hardware can send late NMIs (ie. after
43 * we've cleared the EN bit).
44 *
45 * In order to consume these late NMIs we have the STOPPED state; any NMI that
46 * happens after we've cleared the EN bit will clear this bit and report the
47 * NMI as handled (this is fundamentally racy in the face of multiple NMI
48 * sources, someone else can consume our bit and our NMI will go unhandled).
49 *
50 * And since we cannot set/clear this separate bit together with the EN bit,
51 * there are races; if we cleared STARTED early, an NMI could land in
52 * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
53 * could happen if the period is small enough), and consume our STOPPED bit
54 * and trigger streams of unhandled NMIs.
55 *
56 * If, however, we clear STARTED late, an NMI can hit between clearing the
57 * EN bit and clearing STARTED, still see STARTED set and process the event.
58 * If this event has the VALID bit clear, we bail properly, but this
59 * is not a given. With VALID set we can end up calling pmu::stop() again
60 * (the throttle logic) and trigger the WARNs in there.
61 *
62 * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
63 * nesting, and clear STARTED late, so that we have a well defined state over
64 * the clearing of the EN bit.
65 *
66 * XXX: we could probably be using !atomic bitops for all this.
67 */
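/*
 * Summary of the ordering used below (derived from perf_ibs_start(),
 * perf_ibs_stop() and perf_ibs_handle_irq()):
 *
 *   start: set_bit(STARTED); clear_bit(STOPPING); enable hw
 *   stop:  set_bit(STOPPING); set_bit(STOPPED); disable hw; clear_bit(STARTED)
 *   NMI:   if STARTED is not set, consume STOPPED (a late NMI) or report the
 *          NMI as not handled
 */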
68
69enum ibs_states {
70	IBS_ENABLED	= 0,
71	IBS_STARTED	= 1,
72	IBS_STOPPING	= 2,
73	IBS_STOPPED	= 3,
74
75	IBS_MAX_STATES,
76};
77
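/*
 * Per-cpu IBS state: the single event currently occupying this cpu's IBS
 * counter (if any) and the enum ibs_states bitmap described above.
 */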
78struct cpu_perf_ibs {
79	struct perf_event	*event;
80	unsigned long		state[BITS_TO_LONGS(IBS_MAX_STATES)];
81};
82
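/*
 * One instance per IBS flavor (fetch and op): the pmu itself, the control
 * MSR, the config/count/enable/valid bit masks, the maximum period, a bitmap
 * describing which MSRs of the register block the NMI handler reads,
 * erratum workaround flags, the per-cpu state and a callback that extracts
 * the current count from the control register value.
 */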
83struct perf_ibs {
84	struct pmu			pmu;
85	unsigned int			msr;
86	u64				config_mask;
87	u64				cnt_mask;
88	u64				enable_mask;
89	u64				valid_mask;
90	u64				max_period;
91	unsigned long			offset_mask[1];
92	int				offset_max;
93	unsigned int			fetch_count_reset_broken : 1;
94	unsigned int			fetch_ignore_if_zero_rip : 1;
95	struct cpu_perf_ibs __percpu	*pcpu;
96
97	u64				(*get_count)(u64 config);
98};
99
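/*
 * Compute the next hardware period from the remaining software period
 * (hwc->period_left), clamped to [min, max]. Returns 1 if the software
 * period has expired and an overflow should be reported, 0 otherwise.
 */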
100static int
101perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
102{
103	s64 left = local64_read(&hwc->period_left);
104	s64 period = hwc->sample_period;
105	int overflow = 0;
106
107	/*
108	 * If we are way outside a reasonable range then just skip forward:
109	 */
110	if (unlikely(left <= -period)) {
111		left = period;
112		local64_set(&hwc->period_left, left);
113		hwc->last_period = period;
114		overflow = 1;
115	}
116
117	if (unlikely(left < (s64)min)) {
118		left += period;
119		local64_set(&hwc->period_left, left);
120		hwc->last_period = period;
121		overflow = 1;
122	}
123
124	/*
125	 * If the hw period that triggers the sw overflow is too short
126	 * we might hit the irq handler. This biases the results.
127	 * Thus we shorten the next-to-last period and set the last
128	 * period to the max period.
129	 */
130	if (left > max) {
131		left -= max;
132		if (left > max)
133			left = max;
134		else if (left < min)
135			left = min;
136	}
137
138	*hw_period = (u64)left;
139
140	return overflow;
141}
142
143static int
144perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
145{
146	struct hw_perf_event *hwc = &event->hw;
147	int shift = 64 - width;
148	u64 prev_raw_count;
149	u64 delta;
150
151	/*
152	 * Careful: an NMI might modify the previous event value.
153	 *
154	 * Our tactic to handle this is to first atomically read and
155	 * exchange a new raw count - then add that new-prev delta
156	 * count to the generic event atomically:
157	 */
158	prev_raw_count = local64_read(&hwc->prev_count);
159	if (!local64_try_cmpxchg(&hwc->prev_count,
160				 &prev_raw_count, new_raw_count))
161		return 0;
162
163	/*
164	 * Now we have the new raw value and have updated the prev
165	 * timestamp already. We can now calculate the elapsed delta
166	 * (event-)time and add that to the generic event.
167	 *
168	 * Careful, not all hw sign-extends above the physical width
169	 * of the count.
170	 */
171	delta = (new_raw_count << shift) - (prev_raw_count << shift);
172	delta >>= shift;
173
174	local64_add(delta, &event->count);
175	local64_sub(delta, &hwc->period_left);
176
177	return 1;
178}
179
180static struct perf_ibs perf_ibs_fetch;
181static struct perf_ibs perf_ibs_op;
182
183static struct perf_ibs *get_ibs_pmu(int type)
184{
185	if (perf_ibs_fetch.pmu.type == type)
186		return &perf_ibs_fetch;
187	if (perf_ibs_op.pmu.type == type)
188		return &perf_ibs_op;
189	return NULL;
190}
191
192/*
193 * core pmu config -> IBS config
194 *
195 *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
196 *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
197 *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
198 *
199 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
200 * MSRC001_1033) is used to select either cycle or micro-ops counting
201 * mode.
202 */
203static int core_pmu_ibs_config(struct perf_event *event, u64 *config)
204{
205	switch (event->attr.type) {
206	case PERF_TYPE_HARDWARE:
207		switch (event->attr.config) {
208		case PERF_COUNT_HW_CPU_CYCLES:
209			*config = 0;
210			return 0;
211		}
212		break;
213	case PERF_TYPE_RAW:
214		switch (event->attr.config) {
215		case 0x0076:
216			*config = 0;
217			return 0;
218		case 0x00C1:
219			*config = IBS_OP_CNT_CTL;
220			return 0;
221		}
222		break;
223	default:
224		return -ENOENT;
225	}
226
227	return -EOPNOTSUPP;
228}
229
230/*
231 * The rip of IBS samples has skid 0. Thus, IBS supports precise
232 * levels 1 and 2, and PERF_EFLAGS_EXACT is set. In rare cases the
233 * rip is invalid because IBS was not able to record it correctly.
234 * In that case we clear PERF_EFLAGS_EXACT and take the rip from pt_regs.
235 */
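/*
 * Note that we always return -ENOENT here: on a successful translation the
 * event's attr has been rewritten to target the IBS op PMU, and returning
 * -ENOENT lets the core retry event initialization against that PMU.
 */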
236int forward_event_to_ibs(struct perf_event *event)
237{
238	u64 config = 0;
239
240	if (!event->attr.precise_ip || event->attr.precise_ip > 2)
241		return -EOPNOTSUPP;
242
243	if (!core_pmu_ibs_config(event, &config)) {
244		event->attr.type = perf_ibs_op.pmu.type;
245		event->attr.config = config;
246	}
247	return -ENOENT;
248}
249
250/*
251 * Grouping of IBS events is not possible since IBS can have only
252 * one event active at any point in time.
253 */
254static int validate_group(struct perf_event *event)
255{
256	struct perf_event *sibling;
257
258	if (event->group_leader == event)
259		return 0;
260
261	if (event->group_leader->pmu == event->pmu)
262		return -EINVAL;
263
264	for_each_sibling_event(sibling, event->group_leader) {
265		if (sibling->pmu == event->pmu)
266			return -EINVAL;
267	}
268	return 0;
269}
270
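/*
 * pmu::event_init() for both IBS pmus: reject events that do not belong to
 * this pmu, validate the config bits and the group, and derive the sample
 * period. When a sample period is given, its lower 4 bits are not
 * programmable and get masked off; otherwise the period is derived from the
 * MaxCnt bits of the raw config, e.g. a raw MaxCnt of 0x1000 corresponds to
 * a sample period of 0x10000.
 */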
271static int perf_ibs_init(struct perf_event *event)
272{
273	struct hw_perf_event *hwc = &event->hw;
274	struct perf_ibs *perf_ibs;
275	u64 max_cnt, config;
276	int ret;
277
278	perf_ibs = get_ibs_pmu(event->attr.type);
279	if (!perf_ibs)
280		return -ENOENT;
281
282	config = event->attr.config;
283
284	if (event->pmu != &perf_ibs->pmu)
285		return -ENOENT;
286
287	if (config & ~perf_ibs->config_mask)
288		return -EINVAL;
289
290	ret = validate_group(event);
291	if (ret)
292		return ret;
293
294	if (hwc->sample_period) {
295		if (config & perf_ibs->cnt_mask)
296			/* raw max_cnt may not be set */
297			return -EINVAL;
298		if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
299			/*
300			 * lower 4 bits can not be set in ibs max cnt,
301			 * but we allow it in case we adjust the
302			 * sample period to set a frequency.
303			 */
304			return -EINVAL;
305		hwc->sample_period &= ~0x0FULL;
306		if (!hwc->sample_period)
307			hwc->sample_period = 0x10;
308	} else {
309		max_cnt = config & perf_ibs->cnt_mask;
310		config &= ~perf_ibs->cnt_mask;
311		event->attr.sample_period = max_cnt << 4;
312		hwc->sample_period = event->attr.sample_period;
313	}
314
315	if (!hwc->sample_period)
316		return -EINVAL;
317
318	/*
319	 * If we modify hwc->sample_period, we also need to update
320	 * hwc->last_period and hwc->period_left.
321	 */
322	hwc->last_period = hwc->sample_period;
323	local64_set(&hwc->period_left, hwc->sample_period);
324
325	hwc->config_base = perf_ibs->msr;
326	hwc->config = config;
327
328	return 0;
329}
330
331static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
332			       struct hw_perf_event *hwc, u64 *period)
333{
334	int overflow;
335
336	/* ignore lower 4 bits in min count: */
337	overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
338	local64_set(&hwc->prev_count, 0);
339
340	return overflow;
341}
342
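/*
 * IbsFetchCtl.FetchCnt holds the fetch counter without its lower 4 bits,
 * hence the shift to reconstruct the count.
 */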
343static u64 get_ibs_fetch_count(u64 config)
344{
345	union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config;
346
347	return fetch_ctl.fetch_cnt << 4;
348}
349
350static u64 get_ibs_op_count(u64 config)
351{
352	union ibs_op_ctl op_ctl = (union ibs_op_ctl)config;
353	u64 count = 0;
354
355	/*
356	 * If the internal 27-bit counter rolled over, the count is MaxCnt
357	 * and the lower 7 bits of CurCnt are randomized.
358	 * Otherwise CurCnt has the full 27-bit current counter value.
359	 */
360	if (op_ctl.op_val) {
361		count = op_ctl.opmaxcnt << 4;
362		if (ibs_caps & IBS_CAPS_OPCNTEXT)
363			count += op_ctl.opmaxcnt_ext << 20;
364	} else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
365		count = op_ctl.opcurcnt;
366	}
367
368	return count;
369}
370
371static void
372perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
373		      u64 *config)
374{
375	u64 count = perf_ibs->get_count(*config);
376
377	/*
378	 * Set width to 64 since we do not overflow on max width but
379	 * instead on max count. In perf_ibs_set_period() we clear
380	 * prev count manually on overflow.
381	 */
382	while (!perf_event_try_update(event, count, 64)) {
383		rdmsrl(event->hw.config_base, *config);
384		count = perf_ibs->get_count(*config);
385	}
386}
387
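/*
 * On chips with the fetch counter reset erratum (see perf_ibs_fetch_init())
 * write the new config once with the enable bit cleared first, so that the
 * subsequent write generates the 0-1 transition of IbsFetchEn that actually
 * resets the fetch count.
 */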
388static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
389					 struct hw_perf_event *hwc, u64 config)
390{
391	u64 tmp = hwc->config | config;
392
393	if (perf_ibs->fetch_count_reset_broken)
394		wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask);
395
396	wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask);
397}
398
399/*
400 * Erratum #420 Instruction-Based Sampling Engine May Generate
401 * Interrupt that Cannot Be Cleared:
402 *
403 * Must clear counter mask first, then clear the enable bit. See
404 * Revision Guide for AMD Family 10h Processors, Publication #41322.
405 */
406static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
407					  struct hw_perf_event *hwc, u64 config)
408{
409	config &= ~perf_ibs->cnt_mask;
410	if (boot_cpu_data.x86 == 0x10)
411		wrmsrl(hwc->config_base, config);
412	config &= ~perf_ibs->enable_mask;
413	wrmsrl(hwc->config_base, config);
414}
415
416/*
417 * We cannot restore the ibs pmu state, so we always need to update
418 * the event while stopping it and then reset the state when starting
419 * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags in
420 * perf_ibs_start()/perf_ibs_stop() and instead always do it.
421 */
422static void perf_ibs_start(struct perf_event *event, int flags)
423{
424	struct hw_perf_event *hwc = &event->hw;
425	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
426	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
427	u64 period, config = 0;
428
429	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
430		return;
431
432	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
433	hwc->state = 0;
434
435	perf_ibs_set_period(perf_ibs, hwc, &period);
436	if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
437		config |= period & IBS_OP_MAX_CNT_EXT_MASK;
438		period &= ~IBS_OP_MAX_CNT_EXT_MASK;
439	}
440	config |= period >> 4;
441
442	/*
443	 * Set STARTED before enabling the hardware, such that a subsequent NMI
444	 * must observe it.
445	 */
446	set_bit(IBS_STARTED,    pcpu->state);
447	clear_bit(IBS_STOPPING, pcpu->state);
448	perf_ibs_enable_event(perf_ibs, hwc, config);
449
450	perf_event_update_userpage(event);
451}
452
453static void perf_ibs_stop(struct perf_event *event, int flags)
454{
455	struct hw_perf_event *hwc = &event->hw;
456	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
457	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
458	u64 config;
459	int stopping;
460
461	if (test_and_set_bit(IBS_STOPPING, pcpu->state))
462		return;
463
464	stopping = test_bit(IBS_STARTED, pcpu->state);
465
466	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
467		return;
468
469	rdmsrl(hwc->config_base, config);
470
471	if (stopping) {
472		/*
473		 * Set STOPPED before disabling the hardware, such that it
474		 * must be visible to NMIs the moment we clear the EN bit,
475		 * at which point we can generate an !VALID sample which
476		 * we need to consume.
477		 */
478		set_bit(IBS_STOPPED, pcpu->state);
479		perf_ibs_disable_event(perf_ibs, hwc, config);
480		/*
481		 * Clear STARTED after disabling the hardware; if it were
482		 * cleared before, an NMI hitting after the clear but before
483		 * the EN bit is cleared might think it a spurious NMI and not
484		 * handle it.
485		 *
486		 * Clearing it after, however, creates the problem of the NMI
487		 * handler seeing STARTED but not having a valid sample.
488		 */
489		clear_bit(IBS_STARTED, pcpu->state);
490		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
491		hwc->state |= PERF_HES_STOPPED;
492	}
493
494	if (hwc->state & PERF_HES_UPTODATE)
495		return;
496
497	/*
498	 * Clear the valid bit so that rollovers are not counted on update;
499	 * rollovers are only accounted for in the irq handler.
500	 */
501	config &= ~perf_ibs->valid_mask;
502
503	perf_ibs_event_update(perf_ibs, event, &config);
504	hwc->state |= PERF_HES_UPTODATE;
505}
506
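/*
 * pmu::add(): claim the single IBS counter of this cpu via the ENABLED bit;
 * fail with -ENOSPC if it is already taken by another event.
 */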
507static int perf_ibs_add(struct perf_event *event, int flags)
508{
509	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
510	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
511
512	if (test_and_set_bit(IBS_ENABLED, pcpu->state))
513		return -ENOSPC;
514
515	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
516
517	pcpu->event = event;
518
519	if (flags & PERF_EF_START)
520		perf_ibs_start(event, PERF_EF_RELOAD);
521
522	return 0;
523}
524
525static void perf_ibs_del(struct perf_event *event, int flags)
526{
527	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
528	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
529
530	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
531		return;
532
533	perf_ibs_stop(event, PERF_EF_UPDATE);
534
535	pcpu->event = NULL;
536
537	perf_event_update_userpage(event);
538}
539
540static void perf_ibs_read(struct perf_event *event) { }
541
542/*
543 * We need to initialize with an empty group if all attributes in the
544 * group are dynamic.
545 */
546static struct attribute *attrs_empty[] = {
547	NULL,
548};
549
550static struct attribute_group empty_format_group = {
551	.name = "format",
552	.attrs = attrs_empty,
553};
554
555static struct attribute_group empty_caps_group = {
556	.name = "caps",
557	.attrs = attrs_empty,
558};
559
560static const struct attribute_group *empty_attr_groups[] = {
561	&empty_format_group,
562	&empty_caps_group,
563	NULL,
564};
565
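/*
 * sysfs format/caps attributes. rand_en exposes IbsFetchCtl.IbsRandEn
 * (bit 57) and cnt_ctl exposes IbsOpCtl.IbsOpCntCtl (bit 19, which selects
 * counting dispatched micro-ops instead of clock cycles), e.g.:
 *
 *   perf record -a -e ibs_op/cnt_ctl=1/ ...   # sample every MaxCnt micro-ops
 */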
566PMU_FORMAT_ATTR(rand_en,	"config:57");
567PMU_FORMAT_ATTR(cnt_ctl,	"config:19");
568PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
569PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
570PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
571
572static umode_t
573zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
574{
575	return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
576}
577
578static struct attribute *rand_en_attrs[] = {
579	&format_attr_rand_en.attr,
580	NULL,
581};
582
583static struct attribute *fetch_l3missonly_attrs[] = {
584	&fetch_l3missonly.attr.attr,
585	NULL,
586};
587
588static struct attribute *zen4_ibs_extensions_attrs[] = {
589	&zen4_ibs_extensions.attr.attr,
590	NULL,
591};
592
593static struct attribute_group group_rand_en = {
594	.name = "format",
595	.attrs = rand_en_attrs,
596};
597
598static struct attribute_group group_fetch_l3missonly = {
599	.name = "format",
600	.attrs = fetch_l3missonly_attrs,
601	.is_visible = zen4_ibs_extensions_is_visible,
602};
603
604static struct attribute_group group_zen4_ibs_extensions = {
605	.name = "caps",
606	.attrs = zen4_ibs_extensions_attrs,
607	.is_visible = zen4_ibs_extensions_is_visible,
608};
609
610static const struct attribute_group *fetch_attr_groups[] = {
611	&group_rand_en,
612	&empty_caps_group,
613	NULL,
614};
615
616static const struct attribute_group *fetch_attr_update[] = {
617	&group_fetch_l3missonly,
618	&group_zen4_ibs_extensions,
619	NULL,
620};
621
622static umode_t
623cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
624{
625	return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
626}
627
628static struct attribute *cnt_ctl_attrs[] = {
629	&format_attr_cnt_ctl.attr,
630	NULL,
631};
632
633static struct attribute *op_l3missonly_attrs[] = {
634	&op_l3missonly.attr.attr,
635	NULL,
636};
637
638static struct attribute_group group_cnt_ctl = {
639	.name = "format",
640	.attrs = cnt_ctl_attrs,
641	.is_visible = cnt_ctl_is_visible,
642};
643
644static struct attribute_group group_op_l3missonly = {
645	.name = "format",
646	.attrs = op_l3missonly_attrs,
647	.is_visible = zen4_ibs_extensions_is_visible,
648};
649
650static const struct attribute_group *op_attr_update[] = {
651	&group_cnt_ctl,
652	&group_op_l3missonly,
653	&group_zen4_ibs_extensions,
654	NULL,
655};
656
657static struct perf_ibs perf_ibs_fetch = {
658	.pmu = {
659		.task_ctx_nr	= perf_hw_context,
660
661		.event_init	= perf_ibs_init,
662		.add		= perf_ibs_add,
663		.del		= perf_ibs_del,
664		.start		= perf_ibs_start,
665		.stop		= perf_ibs_stop,
666		.read		= perf_ibs_read,
667		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
668	},
669	.msr			= MSR_AMD64_IBSFETCHCTL,
670	.config_mask		= IBS_FETCH_CONFIG_MASK,
671	.cnt_mask		= IBS_FETCH_MAX_CNT,
672	.enable_mask		= IBS_FETCH_ENABLE,
673	.valid_mask		= IBS_FETCH_VAL,
674	.max_period		= IBS_FETCH_MAX_CNT << 4,
675	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
676	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,
677
678	.get_count		= get_ibs_fetch_count,
679};
680
681static struct perf_ibs perf_ibs_op = {
682	.pmu = {
683		.task_ctx_nr	= perf_hw_context,
684
685		.event_init	= perf_ibs_init,
686		.add		= perf_ibs_add,
687		.del		= perf_ibs_del,
688		.start		= perf_ibs_start,
689		.stop		= perf_ibs_stop,
690		.read		= perf_ibs_read,
691		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
692	},
693	.msr			= MSR_AMD64_IBSOPCTL,
694	.config_mask		= IBS_OP_CONFIG_MASK,
695	.cnt_mask		= IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
696				  IBS_OP_CUR_CNT_RAND,
697	.enable_mask		= IBS_OP_ENABLE,
698	.valid_mask		= IBS_OP_VAL,
699	.max_period		= IBS_OP_MAX_CNT << 4,
700	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
701	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,
702
703	.get_count		= get_ibs_op_count,
704};
705
706static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3,
707				struct perf_sample_data *data)
708{
709	union perf_mem_data_src *data_src = &data->data_src;
710
711	data_src->mem_op = PERF_MEM_OP_NA;
712
713	if (op_data3->ld_op)
714		data_src->mem_op = PERF_MEM_OP_LOAD;
715	else if (op_data3->st_op)
716		data_src->mem_op = PERF_MEM_OP_STORE;
717}
718
719/*
720 * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 have
721 * finer-grained DataSrc encodings. Others have coarse ones.
722 */
723static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
724{
725	if (ibs_caps & IBS_CAPS_ZEN4)
726		return (op_data2->data_src_hi << 3) | op_data2->data_src_lo;
727
728	return op_data2->data_src_lo;
729}
730
731#define	L(x)		(PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
732#define	LN(x)		PERF_MEM_S(LVLNUM, x)
733#define	REM		PERF_MEM_S(REMOTE, REMOTE)
734#define	HOPS(x)		PERF_MEM_S(HOPS, x)
735
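/*
 * Tables mapping the IBS OP_DATA2 DataSrc encoding to perf_mem_data_src
 * level/level-number bits: g_data_src for the coarse pre-Zen4 encodings,
 * g_zen4_data_src for the extended Zen4 encodings.
 */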
736static u64 g_data_src[8] = {
737	[IBS_DATA_SRC_LOC_CACHE]	  = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
738	[IBS_DATA_SRC_DRAM]		  = L(LOC_RAM) | LN(RAM),
739	[IBS_DATA_SRC_REM_CACHE]	  = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
740	[IBS_DATA_SRC_IO]		  = L(IO) | LN(IO),
741};
742
743#define RMT_NODE_BITS			(1 << IBS_DATA_SRC_DRAM)
744#define RMT_NODE_APPLICABLE(x)		(RMT_NODE_BITS & (1 << x))
745
746static u64 g_zen4_data_src[32] = {
747	[IBS_DATA_SRC_EXT_LOC_CACHE]	  = L(L3) | LN(L3),
748	[IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
749	[IBS_DATA_SRC_EXT_DRAM]		  = L(LOC_RAM) | LN(RAM),
750	[IBS_DATA_SRC_EXT_FAR_CCX_CACHE]  = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
751	[IBS_DATA_SRC_EXT_PMEM]		  = LN(PMEM),
752	[IBS_DATA_SRC_EXT_IO]		  = L(IO) | LN(IO),
753	[IBS_DATA_SRC_EXT_EXT_MEM]	  = LN(CXL),
754};
755
756#define ZEN4_RMT_NODE_BITS		((1 << IBS_DATA_SRC_EXT_DRAM) | \
757					 (1 << IBS_DATA_SRC_EXT_PMEM) | \
758					 (1 << IBS_DATA_SRC_EXT_EXT_MEM))
759#define ZEN4_RMT_NODE_APPLICABLE(x)	(ZEN4_RMT_NODE_BITS & (1 << x))
760
761static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
762				  union ibs_op_data3 *op_data3,
763				  struct perf_sample_data *data)
764{
765	union perf_mem_data_src *data_src = &data->data_src;
766	u8 ibs_data_src = perf_ibs_data_src(op_data2);
767
768	data_src->mem_lvl = 0;
769	data_src->mem_lvl_num = 0;
770
771	/*
772	 * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
773	 * memory accesses. So, check DcUcMemAcc bit early.
774	 */
775	if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
776		return L(UNC) | LN(UNC);
777
778	/* L1 Hit */
779	if (op_data3->dc_miss == 0)
780		return L(L1) | LN(L1);
781
782	/* L2 Hit */
783	if (op_data3->l2_miss == 0) {
784		/* Erratum #1293 */
785		if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
786		    !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
787			return L(L2) | LN(L2);
788	}
789
790	/*
791	 * OP_DATA2 is valid only for load ops. Skip all checks which
792	 * use OP_DATA2[DataSrc].
793	 */
794	if (data_src->mem_op != PERF_MEM_OP_LOAD)
795		goto check_mab;
796
797	if (ibs_caps & IBS_CAPS_ZEN4) {
798		u64 val = g_zen4_data_src[ibs_data_src];
799
800		if (!val)
801			goto check_mab;
802
803		/* HOPS_1 because IBS doesn't provide remote socket detail */
804		if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
805			if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
806				val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
807			else
808				val |= REM | HOPS(1);
809		}
810
811		return val;
812	} else {
813		u64 val = g_data_src[ibs_data_src];
814
815		if (!val)
816			goto check_mab;
817
818		/* HOPS_1 because IBS doesn't provide remote socket detail */
819		if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
820			if (ibs_data_src == IBS_DATA_SRC_DRAM)
821				val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
822			else
823				val |= REM | HOPS(1);
824		}
825
826		return val;
827	}
828
829check_mab:
830	/*
831	 * MAB (Miss Address Buffer) Hit. The MAB keeps track of outstanding
832	 * DC misses. However, such data may come from any level in the
833	 * memory hierarchy. IBS provides detail about both the MAB and the
834	 * actual DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
835	 * MAB only when IBS fails to provide DataSrc.
836	 */
837	if (op_data3->dc_miss_no_mab_alloc)
838		return L(LFB) | LN(LFB);
839
840	/* Don't set HIT with NA */
841	return PERF_MEM_S(LVL, NA) | LN(NA);
842}
843
844static bool perf_ibs_cache_hit_st_valid(void)
845{
846	/* 0: Uninitialized, 1: Valid, -1: Invalid */
847	static int cache_hit_st_valid;
848
849	if (unlikely(!cache_hit_st_valid)) {
850		if (boot_cpu_data.x86 == 0x19 &&
851		    (boot_cpu_data.x86_model <= 0xF ||
852		    (boot_cpu_data.x86_model >= 0x20 &&
853		     boot_cpu_data.x86_model <= 0x5F))) {
854			cache_hit_st_valid = -1;
855		} else {
856			cache_hit_st_valid = 1;
857		}
858	}
859
860	return cache_hit_st_valid == 1;
861}
862
863static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2,
864				   struct perf_sample_data *data)
865{
866	union perf_mem_data_src *data_src = &data->data_src;
867	u8 ibs_data_src;
868
869	data_src->mem_snoop = PERF_MEM_SNOOP_NA;
870
871	if (!perf_ibs_cache_hit_st_valid() ||
872	    data_src->mem_op != PERF_MEM_OP_LOAD ||
873	    data_src->mem_lvl & PERF_MEM_LVL_L1 ||
874	    data_src->mem_lvl & PERF_MEM_LVL_L2 ||
875	    op_data2->cache_hit_st)
876		return;
877
878	ibs_data_src = perf_ibs_data_src(op_data2);
879
880	if (ibs_caps & IBS_CAPS_ZEN4) {
881		if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE ||
882		    ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE ||
883		    ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE)
884			data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
885	} else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
886		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
887	}
888}
889
890static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3,
891				 struct perf_sample_data *data)
892{
893	union perf_mem_data_src *data_src = &data->data_src;
894
895	data_src->mem_dtlb = PERF_MEM_TLB_NA;
896
897	if (!op_data3->dc_lin_addr_valid)
898		return;
899
900	if (!op_data3->dc_l1tlb_miss) {
901		data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT;
902		return;
903	}
904
905	if (!op_data3->dc_l2tlb_miss) {
906		data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT;
907		return;
908	}
909
910	data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS;
911}
912
913static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3,
914				  struct perf_sample_data *data)
915{
916	union perf_mem_data_src *data_src = &data->data_src;
917
918	data_src->mem_lock = PERF_MEM_LOCK_NA;
919
920	if (op_data3->dc_locked_op)
921		data_src->mem_lock = PERF_MEM_LOCK_LOCKED;
922}
923
924#define ibs_op_msr_idx(msr)	(msr - MSR_AMD64_IBSOPCTL)
925
926static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
927				  struct perf_sample_data *data,
928				  union ibs_op_data2 *op_data2,
929				  union ibs_op_data3 *op_data3)
930{
931	union perf_mem_data_src *data_src = &data->data_src;
932
933	data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
934	perf_ibs_get_mem_snoop(op_data2, data);
935	perf_ibs_get_tlb_lvl(op_data3, data);
936	perf_ibs_get_mem_lock(op_data3, data);
937}
938
939static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data,
940				   union ibs_op_data3 *op_data3)
941{
942	__u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)];
943
944	/* Erratum #1293 */
945	if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF &&
946	    (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
947		/*
948		 * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode.
949		 * DataSrc=0 is 'No valid status' and RmtNode is invalid when
950		 * DataSrc=0.
951		 */
952		val = 0;
953	}
954	return val;
955}
956
957static void perf_ibs_parse_ld_st_data(__u64 sample_type,
958				      struct perf_ibs_data *ibs_data,
959				      struct perf_sample_data *data)
960{
961	union ibs_op_data3 op_data3;
962	union ibs_op_data2 op_data2;
963	union ibs_op_data op_data;
964
965	data->data_src.val = PERF_MEM_NA;
966	op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
967
968	perf_ibs_get_mem_op(&op_data3, data);
969	if (data->data_src.mem_op != PERF_MEM_OP_LOAD &&
970	    data->data_src.mem_op != PERF_MEM_OP_STORE)
971		return;
972
973	op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3);
974
975	if (sample_type & PERF_SAMPLE_DATA_SRC) {
976		perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3);
977		data->sample_flags |= PERF_SAMPLE_DATA_SRC;
978	}
979
980	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss &&
981	    data->data_src.mem_op == PERF_MEM_OP_LOAD) {
982		op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];
983
984		if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
985			data->weight.var1_dw = op_data3.dc_miss_lat;
986			data->weight.var2_w = op_data.tag_to_ret_ctr;
987		} else if (sample_type & PERF_SAMPLE_WEIGHT) {
988			data->weight.full = op_data3.dc_miss_lat;
989		}
990		data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
991	}
992
993	if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) {
994		data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
995		data->sample_flags |= PERF_SAMPLE_ADDR;
996	}
997
998	if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) {
999		data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)];
1000		data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
1001	}
1002}
1003
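/*
 * Number of MSRs of the register block to read for a sample: all of them
 * when raw data or (for ibs_op) data source/weight/address samples are
 * requested, three registers when the rip needs to be checked for validity,
 * otherwise only the control register.
 */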
1004static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type,
1005				   int check_rip)
1006{
1007	if (sample_type & PERF_SAMPLE_RAW ||
1008	    (perf_ibs == &perf_ibs_op &&
1009	     (sample_type & PERF_SAMPLE_DATA_SRC ||
1010	      sample_type & PERF_SAMPLE_WEIGHT_TYPE ||
1011	      sample_type & PERF_SAMPLE_ADDR ||
1012	      sample_type & PERF_SAMPLE_PHYS_ADDR)))
1013		return perf_ibs->offset_max;
1014	else if (check_rip)
1015		return 3;
1016	return 1;
1017}
1018
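/*
 * NMI handler for one IBS pmu: consume late NMIs after the pmu has been
 * stopped, validate the sample via the valid bit, read the staggered sample
 * MSRs, hand the sample to perf and re-arm the hardware with the next period
 * (unless the event got throttled).
 */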
1019static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
1020{
1021	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
1022	struct perf_event *event = pcpu->event;
1023	struct hw_perf_event *hwc;
1024	struct perf_sample_data data;
1025	struct perf_raw_record raw;
1026	struct pt_regs regs;
1027	struct perf_ibs_data ibs_data;
1028	int offset, size, check_rip, offset_max, throttle = 0;
1029	unsigned int msr;
1030	u64 *buf, *config, period, new_config = 0;
1031
1032	if (!test_bit(IBS_STARTED, pcpu->state)) {
1033fail:
1034		/*
1035		 * Catch spurious interrupts after stopping IBS: After
1036		 * disabling IBS there could still be incoming NMIs
1037		 * with samples that even have the valid bit cleared.
1038		 * Mark all these NMIs as handled.
1039		 */
1040		if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
1041			return 1;
1042
1043		return 0;
1044	}
1045
1046	if (WARN_ON_ONCE(!event))
1047		goto fail;
1048
1049	hwc = &event->hw;
1050	msr = hwc->config_base;
1051	buf = ibs_data.regs;
1052	rdmsrl(msr, *buf);
1053	if (!(*buf++ & perf_ibs->valid_mask))
1054		goto fail;
1055
1056	config = &ibs_data.regs[0];
1057	perf_ibs_event_update(perf_ibs, event, config);
1058	perf_sample_data_init(&data, 0, hwc->last_period);
1059	if (!perf_ibs_set_period(perf_ibs, hwc, &period))
1060		goto out;	/* no sw counter overflow */
1061
1062	ibs_data.caps = ibs_caps;
1063	size = 1;
1064	offset = 1;
1065	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
1066
1067	offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip);
1068
1069	do {
1070		rdmsrl(msr + offset, *buf++);
1071		size++;
1072		offset = find_next_bit(perf_ibs->offset_mask,
1073				       perf_ibs->offset_max,
1074				       offset + 1);
1075	} while (offset < offset_max);
1076	/*
1077	 * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
1078	 * depending on their availability.
1079	 * They can't be added to offset_max as they are staggered.
1080	 */
1081	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
1082		if (perf_ibs == &perf_ibs_op) {
1083			if (ibs_caps & IBS_CAPS_BRNTRGT) {
1084				rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
1085				size++;
1086			}
1087			if (ibs_caps & IBS_CAPS_OPDATA4) {
1088				rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
1089				size++;
1090			}
1091		}
1092		if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
1093			rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++);
1094			size++;
1095		}
1096	}
1097	ibs_data.size = sizeof(u64) * size;
1098
1099	regs = *iregs;
1100	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
1101		regs.flags &= ~PERF_EFLAGS_EXACT;
1102	} else {
1103		/* Workaround for erratum #1197 */
1104		if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
1105			goto out;
1106
1107		set_linear_ip(&regs, ibs_data.regs[1]);
1108		regs.flags |= PERF_EFLAGS_EXACT;
1109	}
1110
1111	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
1112		raw = (struct perf_raw_record){
1113			.frag = {
1114				.size = sizeof(u32) + ibs_data.size,
1115				.data = ibs_data.data,
1116			},
1117		};
1118		perf_sample_save_raw_data(&data, &raw);
1119	}
1120
1121	if (perf_ibs == &perf_ibs_op)
1122		perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data);
1123
1124	/*
1125	 * The rip recorded by IbsOpRip will not be consistent with the rsp and
1126	 * rbp recorded as part of the interrupt regs. Thus we need to use the
1127	 * rip from the interrupt regs while unwinding the call stack.
1128	 */
1129	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
1130		perf_sample_save_callchain(&data, event, iregs);
1131
1132	throttle = perf_event_overflow(event, &data, &regs);
1133out:
1134	if (throttle) {
1135		perf_ibs_stop(event, 0);
1136	} else {
1137		if (perf_ibs == &perf_ibs_op) {
1138			if (ibs_caps & IBS_CAPS_OPCNTEXT) {
1139				new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
1140				period &= ~IBS_OP_MAX_CNT_EXT_MASK;
1141			}
1142			if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL))
1143				new_config |= *config & IBS_OP_CUR_CNT_RAND;
1144		}
1145		new_config |= period >> 4;
1146
1147		perf_ibs_enable_event(perf_ibs, hwc, new_config);
1148	}
1149
1150	perf_event_update_userpage(event);
1151
1152	return 1;
1153}
1154
1155static int
1156perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1157{
1158	u64 stamp = sched_clock();
1159	int handled = 0;
1160
1161	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
1162	handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
1163
1164	if (handled)
1165		inc_irq_stat(apic_perf_irqs);
1166
1167	perf_sample_event_took(sched_clock() - stamp);
1168
1169	return handled;
1170}
1171NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
1172
1173static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
1174{
1175	struct cpu_perf_ibs __percpu *pcpu;
1176	int ret;
1177
1178	pcpu = alloc_percpu(struct cpu_perf_ibs);
1179	if (!pcpu)
1180		return -ENOMEM;
1181
1182	perf_ibs->pcpu = pcpu;
1183
1184	ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
1185	if (ret) {
1186		perf_ibs->pcpu = NULL;
1187		free_percpu(pcpu);
1188	}
1189
1190	return ret;
1191}
1192
1193static __init int perf_ibs_fetch_init(void)
1194{
1195	/*
1196	 * Some chips fail to reset the fetch count when it is written; instead
1197	 * they need a 0-1 transition of IbsFetchEn.
1198	 */
1199	if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
1200		perf_ibs_fetch.fetch_count_reset_broken = 1;
1201
1202	if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
1203		perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
1204
1205	if (ibs_caps & IBS_CAPS_ZEN4)
1206		perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY;
1207
1208	perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups;
1209	perf_ibs_fetch.pmu.attr_update = fetch_attr_update;
1210
1211	return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
1212}
1213
1214static __init int perf_ibs_op_init(void)
1215{
1216	if (ibs_caps & IBS_CAPS_OPCNT)
1217		perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
1218
1219	if (ibs_caps & IBS_CAPS_OPCNTEXT) {
1220		perf_ibs_op.max_period  |= IBS_OP_MAX_CNT_EXT_MASK;
1221		perf_ibs_op.config_mask	|= IBS_OP_MAX_CNT_EXT_MASK;
1222		perf_ibs_op.cnt_mask    |= IBS_OP_MAX_CNT_EXT_MASK;
1223	}
1224
1225	if (ibs_caps & IBS_CAPS_ZEN4)
1226		perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
1227
1228	perf_ibs_op.pmu.attr_groups = empty_attr_groups;
1229	perf_ibs_op.pmu.attr_update = op_attr_update;
1230
1231	return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
1232}
1233
1234static __init int perf_event_ibs_init(void)
1235{
1236	int ret;
1237
1238	ret = perf_ibs_fetch_init();
1239	if (ret)
1240		return ret;
1241
1242	ret = perf_ibs_op_init();
1243	if (ret)
1244		goto err_op;
1245
1246	ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
1247	if (ret)
1248		goto err_nmi;
1249
1250	pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
1251	return 0;
1252
1253err_nmi:
1254	perf_pmu_unregister(&perf_ibs_op.pmu);
1255	free_percpu(perf_ibs_op.pcpu);
1256	perf_ibs_op.pcpu = NULL;
1257err_op:
1258	perf_pmu_unregister(&perf_ibs_fetch.pmu);
1259	free_percpu(perf_ibs_fetch.pcpu);
1260	perf_ibs_fetch.pcpu = NULL;
1261
1262	return ret;
1263}
1264
1265#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
1266
1267static __init int perf_event_ibs_init(void)
1268{
1269	return 0;
1270}
1271
1272#endif
1273
1274/* IBS - apic initialization, for perf and oprofile */
1275
1276static __init u32 __get_ibs_caps(void)
1277{
1278	u32 caps;
1279	unsigned int max_level;
1280
1281	if (!boot_cpu_has(X86_FEATURE_IBS))
1282		return 0;
1283
1284	/* check IBS cpuid feature flags */
1285	max_level = cpuid_eax(0x80000000);
1286	if (max_level < IBS_CPUID_FEATURES)
1287		return IBS_CAPS_DEFAULT;
1288
1289	caps = cpuid_eax(IBS_CPUID_FEATURES);
1290	if (!(caps & IBS_CAPS_AVAIL))
1291		/* cpuid flags not valid */
1292		return IBS_CAPS_DEFAULT;
1293
1294	return caps;
1295}
1296
1297u32 get_ibs_caps(void)
1298{
1299	return ibs_caps;
1300}
1301
1302EXPORT_SYMBOL(get_ibs_caps);
1303
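/*
 * Reserve/release an extended APIC interrupt LVT entry; the last argument of
 * setup_APIC_eilvt() is the mask bit, and a zero return from it indicates
 * success, hence the negation.
 */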
1304static inline int get_eilvt(int offset)
1305{
1306	return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
1307}
1308
1309static inline int put_eilvt(int offset)
1310{
1311	return !setup_APIC_eilvt(offset, 0, 0, 1);
1312}
1313
1314/*
1315 * Check and reserve APIC extended interrupt LVT offset for IBS if available.
1316 */
1317static inline int ibs_eilvt_valid(void)
1318{
1319	int offset;
1320	u64 val;
1321	int valid = 0;
1322
1323	preempt_disable();
1324
1325	rdmsrl(MSR_AMD64_IBSCTL, val);
1326	offset = val & IBSCTL_LVT_OFFSET_MASK;
1327
1328	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
1329		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
1330		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
1331		goto out;
1332	}
1333
1334	if (!get_eilvt(offset)) {
1335		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
1336		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
1337		goto out;
1338	}
1339
1340	valid = 1;
1341out:
1342	preempt_enable();
1343
1344	return valid;
1345}
1346
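/*
 * Program the chosen LVT offset into the IBS control register (IBSCTL) in
 * the northbridge PCI config space of every node and verify the write.
 */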
1347static int setup_ibs_ctl(int ibs_eilvt_off)
1348{
1349	struct pci_dev *cpu_cfg;
1350	int nodes;
1351	u32 value = 0;
1352
1353	nodes = 0;
1354	cpu_cfg = NULL;
1355	do {
1356		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
1357					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
1358					 cpu_cfg);
1359		if (!cpu_cfg)
1360			break;
1361		++nodes;
1362		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
1363				       | IBSCTL_LVT_OFFSET_VALID);
1364		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
1365		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
1366			pci_dev_put(cpu_cfg);
1367			pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
1368				 value);
1369			return -EINVAL;
1370		}
1371	} while (1);
1372
1373	if (!nodes) {
1374		pr_debug("No CPU node configured for IBS\n");
1375		return -ENODEV;
1376	}
1377
1378	return 0;
1379}
1380
1381/*
1382 * This runs only on the current cpu. We try to find an LVT offset and
1383 * set up the local APIC. For this we must disable preemption. On
1384 * success we initialize all nodes with this offset. This then updates
1385 * the offset in the per-node IBS_CTL msr. The per-cpu APIC setup of
1386 * the IBS interrupt vector is handled by the cpu hotplug callback
1387 * x86_pmu_amd_ibs_starting_cpu(), which uses the new offset.
1388 */
1389static void force_ibs_eilvt_setup(void)
1390{
1391	int offset;
1392	int ret;
1393
1394	preempt_disable();
1395	/* find the next free available EILVT entry, skip offset 0 */
1396	/* find the next available EILVT entry, skip offset 0 */
1397		if (get_eilvt(offset))
1398			break;
1399	}
1400	preempt_enable();
1401
1402	if (offset == APIC_EILVT_NR_MAX) {
1403		pr_debug("No EILVT entry available\n");
1404		return;
1405	}
1406
1407	ret = setup_ibs_ctl(offset);
1408	if (ret)
1409		goto out;
1410
1411	if (!ibs_eilvt_valid())
1412		goto out;
1413
1414	pr_info("LVT offset %d assigned\n", offset);
1415
1416	return;
1417out:
1418	preempt_disable();
1419	put_eilvt(offset);
1420	preempt_enable();
1421	return;
1422}
1423
1424static void ibs_eilvt_setup(void)
1425{
1426	/*
1427	 * Force LVT offset assignment for family 10h: The offsets are
1428	 * not assigned by the BIOS for this family, so the OS is
1429	 * responsible for doing it. If the OS assignment fails, fall
1430	 * back to the BIOS settings and try to set it up.
1431	 */
1432	if (boot_cpu_data.x86 == 0x10)
1433		force_ibs_eilvt_setup();
1434}
1435
1436static inline int get_ibs_lvt_offset(void)
1437{
1438	u64 val;
1439
1440	rdmsrl(MSR_AMD64_IBSCTL, val);
1441	if (!(val & IBSCTL_LVT_OFFSET_VALID))
1442		return -EINVAL;
1443
1444	return val & IBSCTL_LVT_OFFSET_MASK;
1445}
1446
1447static void setup_APIC_ibs(void)
1448{
1449	int offset;
1450
1451	offset = get_ibs_lvt_offset();
1452	if (offset < 0)
1453		goto failed;
1454
1455	if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
1456		return;
1457failed:
1458	pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
1459		smp_processor_id());
1460}
1461
1462static void clear_APIC_ibs(void)
1463{
1464	int offset;
1465
1466	offset = get_ibs_lvt_offset();
1467	if (offset >= 0)
1468		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
1469}
1470
1471static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
1472{
1473	setup_APIC_ibs();
1474	return 0;
1475}
1476
1477#ifdef CONFIG_PM
1478
1479static int perf_ibs_suspend(void)
1480{
1481	clear_APIC_ibs();
1482	return 0;
1483}
1484
1485static void perf_ibs_resume(void)
1486{
1487	ibs_eilvt_setup();
1488	setup_APIC_ibs();
1489}
1490
1491static struct syscore_ops perf_ibs_syscore_ops = {
1492	.resume		= perf_ibs_resume,
1493	.suspend	= perf_ibs_suspend,
1494};
1495
1496static void perf_ibs_pm_init(void)
1497{
1498	register_syscore_ops(&perf_ibs_syscore_ops);
1499}
1500
1501#else
1502
1503static inline void perf_ibs_pm_init(void) { }
1504
1505#endif
1506
1507static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
1508{
1509	clear_APIC_ibs();
1510	return 0;
1511}
1512
1513static __init int amd_ibs_init(void)
1514{
1515	u32 caps;
1516
1517	caps = __get_ibs_caps();
1518	if (!caps)
1519		return -ENODEV;	/* ibs not supported by the cpu */
1520
1521	ibs_eilvt_setup();
1522
1523	if (!ibs_eilvt_valid())
1524		return -EINVAL;
1525
1526	perf_ibs_pm_init();
1527
1528	ibs_caps = caps;
1529	/* make ibs_caps visible to other cpus: */
1530	smp_mb();
1531	/*
1532	 * x86_pmu_amd_ibs_starting_cpu will be called from core on
1533	 * all online cpus.
1534	 */
1535	cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
1536			  "perf/x86/amd/ibs:starting",
1537			  x86_pmu_amd_ibs_starting_cpu,
1538			  x86_pmu_amd_ibs_dying_cpu);
1539
1540	return perf_event_ibs_init();
1541}
1542
1543/* Since we need the pci subsystem to init ibs we can't do this earlier: */
1544device_initcall(amd_ibs_init);
1545