xref: /kernel/linux/linux-5.10/arch/x86/events/rapl.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Support Intel/AMD RAPL energy consumption counters
4 * Copyright (C) 2013 Google, Inc., Stephane Eranian
5 *
6 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
7 * section 14.7.1 (September 2013)
8 *
9 * AMD RAPL interface for Fam17h is described in the public PPR:
10 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
11 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the energy consumption free running
 * counters (pp0, pkg, dram, gpu, psys).
15 *
16 * Each of those counters increments in a power unit defined by the
17 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
18 * but it can vary.
19 *
20 * Counter to rapl events mappings:
21 *
22 *  pp0 counter: consumption of all physical cores (power plane 0)
23 * 	  event: rapl_energy_cores
24 *    perf code: 0x1
25 *
26 *  pkg counter: consumption of the whole processor package
27 *	  event: rapl_energy_pkg
28 *    perf code: 0x2
29 *
30 * dram counter: consumption of the dram domain (servers only)
31 *	  event: rapl_energy_dram
32 *    perf code: 0x3
33 *
34 * gpu counter: consumption of the builtin-gpu domain (client only)
35 *	  event: rapl_energy_gpu
36 *    perf code: 0x4
37 *
38 *  psys counter: consumption of the builtin-psys domain (client only)
39 *	  event: rapl_energy_psys
40 *    perf code: 0x5
41 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
44 *
45 * The events only support system-wide mode counting. There is no
46 * sampling support because it does not make sense and is not
47 * supported by the RAPL hardware.
48 *
49 * Because we want to avoid floating-point operations in the kernel,
50 * the events are all reported in fixed point arithmetic (32.32).
51 * Tools must adjust the counts to convert them to Watts using
52 * the duration of the measurement. Tools may use a function such as
53 * ldexp(raw_count, -32);
54 */
55
56#define pr_fmt(fmt) "RAPL PMU: " fmt
57
58#include <linux/module.h>
59#include <linux/slab.h>
60#include <linux/perf_event.h>
61#include <linux/nospec.h>
62#include <asm/cpu_device_id.h>
63#include <asm/intel-family.h>
64#include "perf_event.h"
65#include "probe.h"
66
67MODULE_LICENSE("GPL");
68
/*
 * RAPL energy status counters, one enumerator per power domain.
 *
 * The enumerator value is the domain index used for rapl_hw_unit[],
 * the per-vendor MSR tables and the bits of rapl_cntr_mask. The event
 * code passed by user space is this index plus one.
 */
enum perf_rapl_events {
	PERF_RAPL_PP0	= 0,	/* all cores */
	PERF_RAPL_PKG	= 1,	/* entire package */
	PERF_RAPL_RAM	= 2,	/* DRAM */
	PERF_RAPL_PP1	= 3,	/* gpu */
	PERF_RAPL_PSYS	= 4,	/* psys */

	PERF_RAPL_MAX,		/* number of domains above */
	NR_RAPL_DOMAINS	= PERF_RAPL_MAX,
};
82
83static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
84	"pp0-core",
85	"package",
86	"dram",
87	"pp1-gpu",
88	"psys",
89};
90
/*
 * The event code lives in the low 8 bits of attr->config; every other
 * bit is reserved and must be zero.
 */
#define RAPL_EVENT_MASK		0xFFULL

/* Number of bits shifted out when computing counter deltas. */
#define RAPL_CNTR_WIDTH		32
97
/*
 * Define a read-only (0444) sysfs event attribute called event_attr_<v>
 * whose file name is _name and whose contents are the string str.
 */
#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.event_str	= str,							\
	.id		= 0,							\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
};
104
105struct rapl_pmu {
106	raw_spinlock_t		lock;
107	int			n_active;
108	int			cpu;
109	struct list_head	active_list;
110	struct pmu		*pmu;
111	ktime_t			timer_interval;
112	struct hrtimer		hrtimer;
113};
114
115struct rapl_pmus {
116	struct pmu		pmu;
117	unsigned int		maxdie;
118	struct rapl_pmu		*pmus[];
119};
120
/* Model specific deviations from the energy unit in the power unit MSR. */
enum rapl_unit_quirk {
	/* all domains use the unit read from the MSR */
	RAPL_UNIT_QUIRK_NONE,
	/* DRAM domain uses a fixed unit */
	RAPL_UNIT_QUIRK_INTEL_HSW,
	/* DRAM and Psys domains use fixed units */
	RAPL_UNIT_QUIRK_INTEL_SPR,
};
126
127struct rapl_model {
128	struct perf_msr *rapl_msrs;
129	unsigned long	events;
130	unsigned int	msr_power_unit;
131	enum rapl_unit_quirk	unit_quirk;
132};
133
134 /* 1/2^hw_unit Joule */
135static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
136static struct rapl_pmus *rapl_pmus;
137static cpumask_t rapl_cpu_mask;
138static unsigned int rapl_cntr_mask;
139static u64 rapl_timer_ms;
140static struct perf_msr *rapl_msrs;
141
142static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
143{
144	unsigned int dieid = topology_logical_die_id(cpu);
145
146	/*
147	 * The unsigned check also catches the '-1' return value for non
148	 * existent mappings in the topology map.
149	 */
150	return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
151}
152
153static inline u64 rapl_read_counter(struct perf_event *event)
154{
155	u64 raw;
156	rdmsrl(event->hw.event_base, raw);
157	return raw;
158}
159
160static inline u64 rapl_scale(u64 v, int cfg)
161{
162	if (cfg > NR_RAPL_DOMAINS) {
163		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
164		return v;
165	}
166	/*
167	 * scale delta to smallest unit (1/2^32)
168	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
169	 * or use ldexp(count, -32).
170	 * Watts = Joules/Time delta
171	 */
172	return v << (32 - rapl_hw_unit[cfg - 1]);
173}
174
175static u64 rapl_event_update(struct perf_event *event)
176{
177	struct hw_perf_event *hwc = &event->hw;
178	u64 prev_raw_count, new_raw_count;
179	s64 delta, sdelta;
180	int shift = RAPL_CNTR_WIDTH;
181
182again:
183	prev_raw_count = local64_read(&hwc->prev_count);
184	rdmsrl(event->hw.event_base, new_raw_count);
185
186	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
187			    new_raw_count) != prev_raw_count) {
188		cpu_relax();
189		goto again;
190	}
191
192	/*
193	 * Now we have the new raw value and have updated the prev
194	 * timestamp already. We can now calculate the elapsed delta
195	 * (event-)time and add that to the generic event.
196	 *
197	 * Careful, not all hw sign-extends above the physical width
198	 * of the count.
199	 */
200	delta = (new_raw_count << shift) - (prev_raw_count << shift);
201	delta >>= shift;
202
203	sdelta = rapl_scale(delta, event->hw.config);
204
205	local64_add(sdelta, &event->count);
206
207	return new_raw_count;
208}
209
210static void rapl_start_hrtimer(struct rapl_pmu *pmu)
211{
212       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
213		     HRTIMER_MODE_REL_PINNED);
214}
215
216static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
217{
218	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
219	struct perf_event *event;
220	unsigned long flags;
221
222	if (!pmu->n_active)
223		return HRTIMER_NORESTART;
224
225	raw_spin_lock_irqsave(&pmu->lock, flags);
226
227	list_for_each_entry(event, &pmu->active_list, active_entry)
228		rapl_event_update(event);
229
230	raw_spin_unlock_irqrestore(&pmu->lock, flags);
231
232	hrtimer_forward_now(hrtimer, pmu->timer_interval);
233
234	return HRTIMER_RESTART;
235}
236
237static void rapl_hrtimer_init(struct rapl_pmu *pmu)
238{
239	struct hrtimer *hr = &pmu->hrtimer;
240
241	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
242	hr->function = rapl_hrtimer_handle;
243}
244
245static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
246				   struct perf_event *event)
247{
248	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
249		return;
250
251	event->hw.state = 0;
252
253	list_add_tail(&event->active_entry, &pmu->active_list);
254
255	local64_set(&event->hw.prev_count, rapl_read_counter(event));
256
257	pmu->n_active++;
258	if (pmu->n_active == 1)
259		rapl_start_hrtimer(pmu);
260}
261
262static void rapl_pmu_event_start(struct perf_event *event, int mode)
263{
264	struct rapl_pmu *pmu = event->pmu_private;
265	unsigned long flags;
266
267	raw_spin_lock_irqsave(&pmu->lock, flags);
268	__rapl_pmu_event_start(pmu, event);
269	raw_spin_unlock_irqrestore(&pmu->lock, flags);
270}
271
272static void rapl_pmu_event_stop(struct perf_event *event, int mode)
273{
274	struct rapl_pmu *pmu = event->pmu_private;
275	struct hw_perf_event *hwc = &event->hw;
276	unsigned long flags;
277
278	raw_spin_lock_irqsave(&pmu->lock, flags);
279
280	/* mark event as deactivated and stopped */
281	if (!(hwc->state & PERF_HES_STOPPED)) {
282		WARN_ON_ONCE(pmu->n_active <= 0);
283		pmu->n_active--;
284		if (pmu->n_active == 0)
285			hrtimer_cancel(&pmu->hrtimer);
286
287		list_del(&event->active_entry);
288
289		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
290		hwc->state |= PERF_HES_STOPPED;
291	}
292
293	/* check if update of sw counter is necessary */
294	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
295		/*
296		 * Drain the remaining delta count out of a event
297		 * that we are disabling:
298		 */
299		rapl_event_update(event);
300		hwc->state |= PERF_HES_UPTODATE;
301	}
302
303	raw_spin_unlock_irqrestore(&pmu->lock, flags);
304}
305
306static int rapl_pmu_event_add(struct perf_event *event, int mode)
307{
308	struct rapl_pmu *pmu = event->pmu_private;
309	struct hw_perf_event *hwc = &event->hw;
310	unsigned long flags;
311
312	raw_spin_lock_irqsave(&pmu->lock, flags);
313
314	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
315
316	if (mode & PERF_EF_START)
317		__rapl_pmu_event_start(pmu, event);
318
319	raw_spin_unlock_irqrestore(&pmu->lock, flags);
320
321	return 0;
322}
323
324static void rapl_pmu_event_del(struct perf_event *event, int flags)
325{
326	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
327}
328
329static int rapl_pmu_event_init(struct perf_event *event)
330{
331	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
332	int bit, ret = 0;
333	struct rapl_pmu *pmu;
334
335	/* only look at RAPL events */
336	if (event->attr.type != rapl_pmus->pmu.type)
337		return -ENOENT;
338
339	/* check only supported bits are set */
340	if (event->attr.config & ~RAPL_EVENT_MASK)
341		return -EINVAL;
342
343	if (event->cpu < 0)
344		return -EINVAL;
345
346	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
347
348	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
349		return -EINVAL;
350
351	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
352	bit = cfg - 1;
353
354	/* check event supported */
355	if (!(rapl_cntr_mask & (1 << bit)))
356		return -EINVAL;
357
358	/* unsupported modes and filters */
359	if (event->attr.sample_period) /* no sampling */
360		return -EINVAL;
361
362	/* must be done before validate_group */
363	pmu = cpu_to_rapl_pmu(event->cpu);
364	if (!pmu)
365		return -EINVAL;
366	event->cpu = pmu->cpu;
367	event->pmu_private = pmu;
368	event->hw.event_base = rapl_msrs[bit].msr;
369	event->hw.config = cfg;
370	event->hw.idx = bit;
371
372	return ret;
373}
374
/* pmu->read callback: bring event->count up to date. */
static void rapl_pmu_event_read(struct perf_event *event)
{
	/* the raw value returned by the update is not needed here */
	(void)rapl_event_update(event);
}
379
380static ssize_t rapl_get_attr_cpumask(struct device *dev,
381				struct device_attribute *attr, char *buf)
382{
383	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
384}
385
386static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
387
388static struct attribute *rapl_pmu_attrs[] = {
389	&dev_attr_cpumask.attr,
390	NULL,
391};
392
393static struct attribute_group rapl_pmu_attr_group = {
394	.attrs = rapl_pmu_attrs,
395};
396
397RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
398RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
399RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
400RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
401RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");
402
403RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
404RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
405RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
406RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
407RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");
408
409/*
410 * we compute in 0.23 nJ increments regardless of MSR
411 */
412RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
413RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
414RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
415RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
416RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
417
418/*
419 * There are no default events, but we need to create
420 * "events" group (with empty attrs) before updating
421 * it with detected events.
422 */
423static struct attribute *attrs_empty[] = {
424	NULL,
425};
426
427static struct attribute_group rapl_pmu_events_group = {
428	.name = "events",
429	.attrs = attrs_empty,
430};
431
432PMU_FORMAT_ATTR(event, "config:0-7");
433static struct attribute *rapl_formats_attr[] = {
434	&format_attr_event.attr,
435	NULL,
436};
437
438static struct attribute_group rapl_pmu_format_group = {
439	.name = "format",
440	.attrs = rapl_formats_attr,
441};
442
443static const struct attribute_group *rapl_attr_groups[] = {
444	&rapl_pmu_attr_group,
445	&rapl_pmu_format_group,
446	&rapl_pmu_events_group,
447	NULL,
448};
449
450static struct attribute *rapl_events_cores[] = {
451	EVENT_PTR(rapl_cores),
452	EVENT_PTR(rapl_cores_unit),
453	EVENT_PTR(rapl_cores_scale),
454	NULL,
455};
456
457static umode_t
458rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i)
459{
460	return 0;
461}
462
463static struct attribute_group rapl_events_cores_group = {
464	.name  = "events",
465	.attrs = rapl_events_cores,
466	.is_visible = rapl_not_visible,
467};
468
469static struct attribute *rapl_events_pkg[] = {
470	EVENT_PTR(rapl_pkg),
471	EVENT_PTR(rapl_pkg_unit),
472	EVENT_PTR(rapl_pkg_scale),
473	NULL,
474};
475
476static struct attribute_group rapl_events_pkg_group = {
477	.name  = "events",
478	.attrs = rapl_events_pkg,
479	.is_visible = rapl_not_visible,
480};
481
482static struct attribute *rapl_events_ram[] = {
483	EVENT_PTR(rapl_ram),
484	EVENT_PTR(rapl_ram_unit),
485	EVENT_PTR(rapl_ram_scale),
486	NULL,
487};
488
489static struct attribute_group rapl_events_ram_group = {
490	.name  = "events",
491	.attrs = rapl_events_ram,
492	.is_visible = rapl_not_visible,
493};
494
495static struct attribute *rapl_events_gpu[] = {
496	EVENT_PTR(rapl_gpu),
497	EVENT_PTR(rapl_gpu_unit),
498	EVENT_PTR(rapl_gpu_scale),
499	NULL,
500};
501
502static struct attribute_group rapl_events_gpu_group = {
503	.name  = "events",
504	.attrs = rapl_events_gpu,
505	.is_visible = rapl_not_visible,
506};
507
508static struct attribute *rapl_events_psys[] = {
509	EVENT_PTR(rapl_psys),
510	EVENT_PTR(rapl_psys_unit),
511	EVENT_PTR(rapl_psys_scale),
512	NULL,
513};
514
515static struct attribute_group rapl_events_psys_group = {
516	.name  = "events",
517	.attrs = rapl_events_psys,
518	.is_visible = rapl_not_visible,
519};
520
521static bool test_msr(int idx, void *data)
522{
523	return test_bit(idx, (unsigned long *) data);
524}
525
526static struct perf_msr intel_rapl_msrs[] = {
527	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr },
528	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr },
529	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr },
530	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr },
531	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr },
532};
533
534/*
535 * Force to PERF_RAPL_MAX size due to:
536 * - perf_msr_probe(PERF_RAPL_MAX)
537 * - want to use same event codes across both architectures
538 */
539static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = {
540	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr },
541};
542
543
544static int rapl_cpu_offline(unsigned int cpu)
545{
546	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
547	int target;
548
549	/* Check if exiting cpu is used for collecting rapl events */
550	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
551		return 0;
552
553	pmu->cpu = -1;
554	/* Find a new cpu to collect rapl events */
555	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
556
557	/* Migrate rapl events to the new target */
558	if (target < nr_cpu_ids) {
559		cpumask_set_cpu(target, &rapl_cpu_mask);
560		pmu->cpu = target;
561		perf_pmu_migrate_context(pmu->pmu, cpu, target);
562	}
563	return 0;
564}
565
566static int rapl_cpu_online(unsigned int cpu)
567{
568	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
569	int target;
570
571	if (!pmu) {
572		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
573		if (!pmu)
574			return -ENOMEM;
575
576		raw_spin_lock_init(&pmu->lock);
577		INIT_LIST_HEAD(&pmu->active_list);
578		pmu->pmu = &rapl_pmus->pmu;
579		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
580		rapl_hrtimer_init(pmu);
581
582		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
583	}
584
585	/*
586	 * Check if there is an online cpu in the package which collects rapl
587	 * events already.
588	 */
589	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
590	if (target < nr_cpu_ids)
591		return 0;
592
593	cpumask_set_cpu(cpu, &rapl_cpu_mask);
594	pmu->cpu = cpu;
595	return 0;
596}
597
598static int rapl_check_hw_unit(struct rapl_model *rm)
599{
600	u64 msr_rapl_power_unit_bits;
601	int i;
602
603	/* protect rdmsrl() to handle virtualization */
604	if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
605		return -1;
606	for (i = 0; i < NR_RAPL_DOMAINS; i++)
607		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
608
609	switch (rm->unit_quirk) {
610	/*
611	 * DRAM domain on HSW server and KNL has fixed energy unit which can be
612	 * different than the unit from power unit MSR. See
613	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
614	 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
615	 */
616	case RAPL_UNIT_QUIRK_INTEL_HSW:
617		rapl_hw_unit[PERF_RAPL_RAM] = 16;
618		break;
619	/*
620	 * SPR shares the same DRAM domain energy unit as HSW, plus it
621	 * also has a fixed energy unit for Psys domain.
622	 */
623	case RAPL_UNIT_QUIRK_INTEL_SPR:
624		rapl_hw_unit[PERF_RAPL_RAM] = 16;
625		rapl_hw_unit[PERF_RAPL_PSYS] = 0;
626		break;
627	default:
628		break;
629	}
630
631
632	/*
633	 * Calculate the timer rate:
634	 * Use reference of 200W for scaling the timeout to avoid counter
635	 * overflows. 200W = 200 Joules/sec
636	 * Divide interval by 2 to avoid lockstep (2 * 100)
637	 * if hw unit is 32, then we use 2 ms 1/200/2
638	 */
639	rapl_timer_ms = 2;
640	if (rapl_hw_unit[0] < 32) {
641		rapl_timer_ms = (1000 / (2 * 100));
642		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
643	}
644	return 0;
645}
646
647static void __init rapl_advertise(void)
648{
649	int i;
650
651	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
652		hweight32(rapl_cntr_mask), rapl_timer_ms);
653
654	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
655		if (rapl_cntr_mask & (1 << i)) {
656			pr_info("hw unit of domain %s 2^-%d Joules\n",
657				rapl_domain_names[i], rapl_hw_unit[i]);
658		}
659	}
660}
661
662static void cleanup_rapl_pmus(void)
663{
664	int i;
665
666	for (i = 0; i < rapl_pmus->maxdie; i++)
667		kfree(rapl_pmus->pmus[i]);
668	kfree(rapl_pmus);
669}
670
671static const struct attribute_group *rapl_attr_update[] = {
672	&rapl_events_cores_group,
673	&rapl_events_pkg_group,
674	&rapl_events_ram_group,
675	&rapl_events_gpu_group,
676	&rapl_events_psys_group,
677	NULL,
678};
679
680static int __init init_rapl_pmus(void)
681{
682	int maxdie = topology_max_packages() * topology_max_die_per_package();
683	size_t size;
684
685	size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
686	rapl_pmus = kzalloc(size, GFP_KERNEL);
687	if (!rapl_pmus)
688		return -ENOMEM;
689
690	rapl_pmus->maxdie		= maxdie;
691	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
692	rapl_pmus->pmu.attr_update	= rapl_attr_update;
693	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
694	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
695	rapl_pmus->pmu.add		= rapl_pmu_event_add;
696	rapl_pmus->pmu.del		= rapl_pmu_event_del;
697	rapl_pmus->pmu.start		= rapl_pmu_event_start;
698	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
699	rapl_pmus->pmu.read		= rapl_pmu_event_read;
700	rapl_pmus->pmu.module		= THIS_MODULE;
701	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
702	return 0;
703}
704
705static struct rapl_model model_snb = {
706	.events		= BIT(PERF_RAPL_PP0) |
707			  BIT(PERF_RAPL_PKG) |
708			  BIT(PERF_RAPL_PP1),
709	.msr_power_unit = MSR_RAPL_POWER_UNIT,
710	.rapl_msrs      = intel_rapl_msrs,
711};
712
713static struct rapl_model model_snbep = {
714	.events		= BIT(PERF_RAPL_PP0) |
715			  BIT(PERF_RAPL_PKG) |
716			  BIT(PERF_RAPL_RAM),
717	.msr_power_unit = MSR_RAPL_POWER_UNIT,
718	.rapl_msrs      = intel_rapl_msrs,
719};
720
721static struct rapl_model model_hsw = {
722	.events		= BIT(PERF_RAPL_PP0) |
723			  BIT(PERF_RAPL_PKG) |
724			  BIT(PERF_RAPL_RAM) |
725			  BIT(PERF_RAPL_PP1),
726	.msr_power_unit = MSR_RAPL_POWER_UNIT,
727	.rapl_msrs      = intel_rapl_msrs,
728};
729
730static struct rapl_model model_hsx = {
731	.events		= BIT(PERF_RAPL_PP0) |
732			  BIT(PERF_RAPL_PKG) |
733			  BIT(PERF_RAPL_RAM),
734	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
735	.msr_power_unit = MSR_RAPL_POWER_UNIT,
736	.rapl_msrs      = intel_rapl_msrs,
737};
738
739static struct rapl_model model_knl = {
740	.events		= BIT(PERF_RAPL_PKG) |
741			  BIT(PERF_RAPL_RAM),
742	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
743	.msr_power_unit = MSR_RAPL_POWER_UNIT,
744	.rapl_msrs      = intel_rapl_msrs,
745};
746
747static struct rapl_model model_skl = {
748	.events		= BIT(PERF_RAPL_PP0) |
749			  BIT(PERF_RAPL_PKG) |
750			  BIT(PERF_RAPL_RAM) |
751			  BIT(PERF_RAPL_PP1) |
752			  BIT(PERF_RAPL_PSYS),
753	.msr_power_unit = MSR_RAPL_POWER_UNIT,
754	.rapl_msrs      = intel_rapl_msrs,
755};
756
757static struct rapl_model model_spr = {
758	.events		= BIT(PERF_RAPL_PP0) |
759			  BIT(PERF_RAPL_PKG) |
760			  BIT(PERF_RAPL_RAM) |
761			  BIT(PERF_RAPL_PSYS),
762	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
763	.msr_power_unit = MSR_RAPL_POWER_UNIT,
764	.rapl_msrs      = intel_rapl_msrs,
765};
766
767static struct rapl_model model_amd_fam17h = {
768	.events		= BIT(PERF_RAPL_PKG),
769	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
770	.rapl_msrs      = amd_rapl_msrs,
771};
772
773static const struct x86_cpu_id rapl_model_match[] __initconst = {
774	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&model_snb),
775	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&model_snbep),
776	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&model_snb),
777	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&model_snbep),
778	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&model_hsw),
779	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&model_hsx),
780	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&model_hsw),
781	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&model_hsw),
782	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&model_hsw),
783	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&model_hsw),
784	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&model_hsx),
785	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&model_hsx),
786	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&model_knl),
787	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&model_knl),
788	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&model_skl),
789	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&model_skl),
790	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&model_hsx),
791	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&model_skl),
792	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&model_skl),
793	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&model_skl),
794	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&model_hsw),
795	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&model_hsw),
796	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&model_hsw),
797	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&model_skl),
798	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&model_skl),
799	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&model_hsx),
800	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
801	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
802	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
803	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&model_spr),
804	X86_MATCH_VENDOR_FAM(AMD,	0x17,		&model_amd_fam17h),
805	X86_MATCH_VENDOR_FAM(HYGON,	0x18,		&model_amd_fam17h),
806	X86_MATCH_VENDOR_FAM(AMD,	0x19,		&model_amd_fam17h),
807	{},
808};
809MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
810
811static int __init rapl_pmu_init(void)
812{
813	const struct x86_cpu_id *id;
814	struct rapl_model *rm;
815	int ret;
816
817	id = x86_match_cpu(rapl_model_match);
818	if (!id)
819		return -ENODEV;
820
821	rm = (struct rapl_model *) id->driver_data;
822
823	rapl_msrs = rm->rapl_msrs;
824
825	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
826					false, (void *) &rm->events);
827
828	ret = rapl_check_hw_unit(rm);
829	if (ret)
830		return ret;
831
832	ret = init_rapl_pmus();
833	if (ret)
834		return ret;
835
836	/*
837	 * Install callbacks. Core will call them for each online cpu.
838	 */
839	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
840				"perf/x86/rapl:online",
841				rapl_cpu_online, rapl_cpu_offline);
842	if (ret)
843		goto out;
844
845	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
846	if (ret)
847		goto out1;
848
849	rapl_advertise();
850	return 0;
851
852out1:
853	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
854out:
855	pr_warn("Initialization failed (%d), disabled\n", ret);
856	cleanup_rapl_pmus();
857	return ret;
858}
859module_init(rapl_pmu_init);
860
861static void __exit intel_rapl_exit(void)
862{
863	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
864	perf_pmu_unregister(&rapl_pmus->pmu);
865	cleanup_rapl_pmus();
866}
867module_exit(intel_rapl_exit);
868