xref: /kernel/linux/linux-6.6/arch/x86/events/rapl.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Support Intel/AMD RAPL energy consumption counters
4 * Copyright (C) 2013 Google, Inc., Stephane Eranian
5 *
6 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
7 * section 14.7.1 (September 2013)
8 *
9 * AMD RAPL interface for Fam17h is described in the public PPR:
10 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
11 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the energy consumption free-running
 * counters (pp0, pkg, dram, gpu, psys).
15 *
16 * Each of those counters increments in a power unit defined by the
17 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
18 * but it can vary.
19 *
20 * Counter to rapl events mappings:
21 *
22 *  pp0 counter: consumption of all physical cores (power plane 0)
23 * 	  event: rapl_energy_cores
24 *    perf code: 0x1
25 *
26 *  pkg counter: consumption of the whole processor package
27 *	  event: rapl_energy_pkg
28 *    perf code: 0x2
29 *
30 * dram counter: consumption of the dram domain (servers only)
31 *	  event: rapl_energy_dram
32 *    perf code: 0x3
33 *
34 * gpu counter: consumption of the builtin-gpu domain (client only)
35 *	  event: rapl_energy_gpu
36 *    perf code: 0x4
37 *
38 *  psys counter: consumption of the builtin-psys domain (client only)
39 *	  event: rapl_energy_psys
40 *    perf code: 0x5
41 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
44 *
45 * The events only support system-wide mode counting. There is no
46 * sampling support because it does not make sense and is not
47 * supported by the RAPL hardware.
48 *
49 * Because we want to avoid floating-point operations in the kernel,
50 * the events are all reported in fixed point arithmetic (32.32).
51 * Tools must adjust the counts to convert them to Watts using
52 * the duration of the measurement. Tools may use a function such as
53 * ldexp(raw_count, -32);
54 */
55
/* Prefix prepended to the format string of the pr_*() messages in this file. */
#define pr_fmt(fmt) "RAPL PMU: " fmt
57
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/overflow.h>
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "perf_event.h"
#include "probe.h"
66
/* Module license declaration. */
MODULE_LICENSE("GPL");
68
/*
 * RAPL energy status counters
 *
 * Each enumerator is the zero-based index of one RAPL domain. The index is
 * used as the bit position in rapl_cntr_mask and in rapl_model::events, and
 * as the subscript into rapl_hw_unit[] and the per-model perf_msr tables.
 * The event code a user passes in attr->config is this index plus one
 * (see rapl_pmu_event_init()).
 */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	PERF_RAPL_MAX,			/* number of domains, not a domain itself */
	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};
82
/*
 * Human-readable domain names, indexed by enum perf_rapl_events. Only used
 * by rapl_advertise() when printing the detected hardware units.
 */
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};
90
/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
/* Number of significant bits in a raw counter value (see rapl_event_update()). */
#define RAPL_CNTR_WIDTH 32

/*
 * Define a read-only (0444) sysfs event attribute named event_attr_<v> whose
 * file name is _name and whose content is the fixed string str.
 */
#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};
104
/*
 * Per-die RAPL state. One instance is allocated in rapl_cpu_online() for
 * each logical die and stored in rapl_pmus->pmus[].
 */
struct rapl_pmu {
	/* taken around n_active/active_list updates and the timer's list walk */
	raw_spinlock_t		lock;
	/* number of events currently started on this die */
	int			n_active;
	/* CPU designated to collect events for this die, -1 if none */
	int			cpu;
	/* started events, linked through perf_event::active_entry */
	struct list_head	active_list;
	/* points at the shared rapl_pmus->pmu */
	struct pmu		*pmu;
	/* period of the polling timer, derived from rapl_timer_ms */
	ktime_t			timer_interval;
	/* periodic timer that folds counter deltas into the active events */
	struct hrtimer		hrtimer;
};
114
/*
 * Top-level container: the single struct pmu handed to perf plus the
 * per-die state array.
 */
struct rapl_pmus {
	struct pmu		pmu;
	/* number of slots in pmus[] */
	unsigned int		maxdie;
	/* per-die state, indexed by topology_logical_die_id() */
	struct rapl_pmu		*pmus[];
};
120
/*
 * Per-model overrides applied by rapl_check_hw_unit() after the common
 * energy unit has been read from the power-unit MSR.
 */
enum rapl_unit_quirk {
	RAPL_UNIT_QUIRK_NONE,		/* all domains use the MSR-reported unit */
	RAPL_UNIT_QUIRK_INTEL_HSW,	/* DRAM domain unit forced to 16 */
	RAPL_UNIT_QUIRK_INTEL_SPR,	/* PSYS domain unit forced to 0 */
};
126
/* Static description of one CPU model family, selected via rapl_model_match[]. */
struct rapl_model {
	/* MSR table indexed by enum perf_rapl_events */
	struct perf_msr *rapl_msrs;
	/* bitmask of BIT(PERF_RAPL_*) domains to probe on this model */
	unsigned long	events;
	/* address of the MSR holding the energy unit */
	unsigned int	msr_power_unit;
	/* model-specific unit override, see enum rapl_unit_quirk */
	enum rapl_unit_quirk	unit_quirk;
};
133
/* Per-domain energy unit exponent: one count is 1/2^hw_unit Joule. */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
/* Allocated by init_rapl_pmus(), freed by cleanup_rapl_pmus(). */
static struct rapl_pmus *rapl_pmus;
/* CPUs currently designated to collect RAPL events (shown in sysfs "cpumask"). */
static cpumask_t rapl_cpu_mask;
/* Bitmask of domains reported by perf_msr_probe(), bit N = enum value N. */
static unsigned int rapl_cntr_mask;
/* Polling period of the per-die hrtimer, computed in rapl_check_hw_unit(). */
static u64 rapl_timer_ms;
/* MSR table of the matched model, indexed by enum perf_rapl_events. */
static struct perf_msr *rapl_msrs;
141
142static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
143{
144	unsigned int dieid = topology_logical_die_id(cpu);
145
146	/*
147	 * The unsigned check also catches the '-1' return value for non
148	 * existent mappings in the topology map.
149	 */
150	return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
151}
152
153static inline u64 rapl_read_counter(struct perf_event *event)
154{
155	u64 raw;
156	rdmsrl(event->hw.event_base, raw);
157	return raw;
158}
159
160static inline u64 rapl_scale(u64 v, int cfg)
161{
162	if (cfg > NR_RAPL_DOMAINS) {
163		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
164		return v;
165	}
166	/*
167	 * scale delta to smallest unit (1/2^32)
168	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
169	 * or use ldexp(count, -32).
170	 * Watts = Joules/Time delta
171	 */
172	return v << (32 - rapl_hw_unit[cfg - 1]);
173}
174
175static u64 rapl_event_update(struct perf_event *event)
176{
177	struct hw_perf_event *hwc = &event->hw;
178	u64 prev_raw_count, new_raw_count;
179	s64 delta, sdelta;
180	int shift = RAPL_CNTR_WIDTH;
181
182again:
183	prev_raw_count = local64_read(&hwc->prev_count);
184	rdmsrl(event->hw.event_base, new_raw_count);
185
186	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
187			    new_raw_count) != prev_raw_count) {
188		cpu_relax();
189		goto again;
190	}
191
192	/*
193	 * Now we have the new raw value and have updated the prev
194	 * timestamp already. We can now calculate the elapsed delta
195	 * (event-)time and add that to the generic event.
196	 *
197	 * Careful, not all hw sign-extends above the physical width
198	 * of the count.
199	 */
200	delta = (new_raw_count << shift) - (prev_raw_count << shift);
201	delta >>= shift;
202
203	sdelta = rapl_scale(delta, event->hw.config);
204
205	local64_add(sdelta, &event->count);
206
207	return new_raw_count;
208}
209
210static void rapl_start_hrtimer(struct rapl_pmu *pmu)
211{
212       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
213		     HRTIMER_MODE_REL_PINNED);
214}
215
216static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
217{
218	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
219	struct perf_event *event;
220	unsigned long flags;
221
222	if (!pmu->n_active)
223		return HRTIMER_NORESTART;
224
225	raw_spin_lock_irqsave(&pmu->lock, flags);
226
227	list_for_each_entry(event, &pmu->active_list, active_entry)
228		rapl_event_update(event);
229
230	raw_spin_unlock_irqrestore(&pmu->lock, flags);
231
232	hrtimer_forward_now(hrtimer, pmu->timer_interval);
233
234	return HRTIMER_RESTART;
235}
236
237static void rapl_hrtimer_init(struct rapl_pmu *pmu)
238{
239	struct hrtimer *hr = &pmu->hrtimer;
240
241	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
242	hr->function = rapl_hrtimer_handle;
243}
244
245static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
246				   struct perf_event *event)
247{
248	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
249		return;
250
251	event->hw.state = 0;
252
253	list_add_tail(&event->active_entry, &pmu->active_list);
254
255	local64_set(&event->hw.prev_count, rapl_read_counter(event));
256
257	pmu->n_active++;
258	if (pmu->n_active == 1)
259		rapl_start_hrtimer(pmu);
260}
261
262static void rapl_pmu_event_start(struct perf_event *event, int mode)
263{
264	struct rapl_pmu *pmu = event->pmu_private;
265	unsigned long flags;
266
267	raw_spin_lock_irqsave(&pmu->lock, flags);
268	__rapl_pmu_event_start(pmu, event);
269	raw_spin_unlock_irqrestore(&pmu->lock, flags);
270}
271
272static void rapl_pmu_event_stop(struct perf_event *event, int mode)
273{
274	struct rapl_pmu *pmu = event->pmu_private;
275	struct hw_perf_event *hwc = &event->hw;
276	unsigned long flags;
277
278	raw_spin_lock_irqsave(&pmu->lock, flags);
279
280	/* mark event as deactivated and stopped */
281	if (!(hwc->state & PERF_HES_STOPPED)) {
282		WARN_ON_ONCE(pmu->n_active <= 0);
283		pmu->n_active--;
284		if (pmu->n_active == 0)
285			hrtimer_cancel(&pmu->hrtimer);
286
287		list_del(&event->active_entry);
288
289		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
290		hwc->state |= PERF_HES_STOPPED;
291	}
292
293	/* check if update of sw counter is necessary */
294	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
295		/*
296		 * Drain the remaining delta count out of a event
297		 * that we are disabling:
298		 */
299		rapl_event_update(event);
300		hwc->state |= PERF_HES_UPTODATE;
301	}
302
303	raw_spin_unlock_irqrestore(&pmu->lock, flags);
304}
305
306static int rapl_pmu_event_add(struct perf_event *event, int mode)
307{
308	struct rapl_pmu *pmu = event->pmu_private;
309	struct hw_perf_event *hwc = &event->hw;
310	unsigned long flags;
311
312	raw_spin_lock_irqsave(&pmu->lock, flags);
313
314	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
315
316	if (mode & PERF_EF_START)
317		__rapl_pmu_event_start(pmu, event);
318
319	raw_spin_unlock_irqrestore(&pmu->lock, flags);
320
321	return 0;
322}
323
/*
 * pmu::del callback: stop @event and fold in its final delta.
 * @flags is ignored; PERF_EF_UPDATE is always requested.
 */
static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}
328
329static int rapl_pmu_event_init(struct perf_event *event)
330{
331	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
332	int bit, ret = 0;
333	struct rapl_pmu *pmu;
334
335	/* only look at RAPL events */
336	if (event->attr.type != rapl_pmus->pmu.type)
337		return -ENOENT;
338
339	/* check only supported bits are set */
340	if (event->attr.config & ~RAPL_EVENT_MASK)
341		return -EINVAL;
342
343	if (event->cpu < 0)
344		return -EINVAL;
345
346	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
347
348	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
349		return -EINVAL;
350
351	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
352	bit = cfg - 1;
353
354	/* check event supported */
355	if (!(rapl_cntr_mask & (1 << bit)))
356		return -EINVAL;
357
358	/* unsupported modes and filters */
359	if (event->attr.sample_period) /* no sampling */
360		return -EINVAL;
361
362	/* must be done before validate_group */
363	pmu = cpu_to_rapl_pmu(event->cpu);
364	if (!pmu)
365		return -EINVAL;
366	event->cpu = pmu->cpu;
367	event->pmu_private = pmu;
368	event->hw.event_base = rapl_msrs[bit].msr;
369	event->hw.config = cfg;
370	event->hw.idx = bit;
371
372	return ret;
373}
374
/* pmu::read callback: bring @event->count up to date with the hardware. */
static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}
379
/*
 * sysfs show handler for the "cpumask" attribute: prints rapl_cpu_mask
 * into @buf in list format (first argument 'true') and returns whatever
 * cpumap_print_to_pagebuf() returns.
 */
static ssize_t rapl_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}
385
386static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
387
388static struct attribute *rapl_pmu_attrs[] = {
389	&dev_attr_cpumask.attr,
390	NULL,
391};
392
393static struct attribute_group rapl_pmu_attr_group = {
394	.attrs = rapl_pmu_attrs,
395};
396
/* Event selectors: the event code is the enum perf_rapl_events index + 1. */
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

/* Unit string reported for each event once the scale below is applied. */
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 * (the scale string below is 2^-32, matching rapl_scale())
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
417
/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

/* The event code occupies config bits 0-7, matching RAPL_EVENT_MASK. */
PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

/* Attribute groups installed as pmu.attr_groups in init_rapl_pmus(). */
static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};
449
/*
 * One "events" attribute group per RAPL domain, each holding the event
 * selector, its unit and its scale. The groups are referenced from the
 * per-model perf_msr tables and from rapl_attr_update[].
 */

/* PERF_RAPL_PP0: all cores */
static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

/* PERF_RAPL_PKG: entire package */
static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

/* PERF_RAPL_RAM: DRAM */
static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

/* PERF_RAPL_PP1: gpu */
static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

/* PERF_RAPL_PSYS: psys */
static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};
509
510static bool test_msr(int idx, void *data)
511{
512	return test_bit(idx, (unsigned long *) data);
513}
514
/* Only lower 32bits of the MSR represents the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF

/*
 * Default Intel MSR table, indexed by enum perf_rapl_events. Each entry
 * names the energy status MSR, the sysfs events group for the domain, the
 * probe filter and the mask of counter bits.
 */
static struct perf_msr intel_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

/*
 * Table used by model_spr. It differs from intel_rapl_msrs[] only in the
 * fourth field of the PSYS entry, which is 'true' here.
 */
static struct perf_msr intel_rapl_spr_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
};

/*
 * Force to PERF_RAPL_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_MAX)
 * - want to use same event codes across both architectures
 *
 * Only the package domain has an MSR here; the other entries are
 * placeholders with a zero MSR and no probe filter.
 */
static struct perf_msr amd_rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, 0, false, 0 },
	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   0, false, 0 },
	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   0, false, 0 },
	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  0, false, 0 },
};
546
547static int rapl_cpu_offline(unsigned int cpu)
548{
549	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
550	int target;
551
552	/* Check if exiting cpu is used for collecting rapl events */
553	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
554		return 0;
555
556	pmu->cpu = -1;
557	/* Find a new cpu to collect rapl events */
558	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
559
560	/* Migrate rapl events to the new target */
561	if (target < nr_cpu_ids) {
562		cpumask_set_cpu(target, &rapl_cpu_mask);
563		pmu->cpu = target;
564		perf_pmu_migrate_context(pmu->pmu, cpu, target);
565	}
566	return 0;
567}
568
569static int rapl_cpu_online(unsigned int cpu)
570{
571	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
572	int target;
573
574	if (!pmu) {
575		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
576		if (!pmu)
577			return -ENOMEM;
578
579		raw_spin_lock_init(&pmu->lock);
580		INIT_LIST_HEAD(&pmu->active_list);
581		pmu->pmu = &rapl_pmus->pmu;
582		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
583		rapl_hrtimer_init(pmu);
584
585		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
586	}
587
588	/*
589	 * Check if there is an online cpu in the package which collects rapl
590	 * events already.
591	 */
592	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
593	if (target < nr_cpu_ids)
594		return 0;
595
596	cpumask_set_cpu(cpu, &rapl_cpu_mask);
597	pmu->cpu = cpu;
598	return 0;
599}
600
601static int rapl_check_hw_unit(struct rapl_model *rm)
602{
603	u64 msr_rapl_power_unit_bits;
604	int i;
605
606	/* protect rdmsrl() to handle virtualization */
607	if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
608		return -1;
609	for (i = 0; i < NR_RAPL_DOMAINS; i++)
610		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
611
612	switch (rm->unit_quirk) {
613	/*
614	 * DRAM domain on HSW server and KNL has fixed energy unit which can be
615	 * different than the unit from power unit MSR. See
616	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
617	 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
618	 */
619	case RAPL_UNIT_QUIRK_INTEL_HSW:
620		rapl_hw_unit[PERF_RAPL_RAM] = 16;
621		break;
622	/* SPR uses a fixed energy unit for Psys domain. */
623	case RAPL_UNIT_QUIRK_INTEL_SPR:
624		rapl_hw_unit[PERF_RAPL_PSYS] = 0;
625		break;
626	default:
627		break;
628	}
629
630
631	/*
632	 * Calculate the timer rate:
633	 * Use reference of 200W for scaling the timeout to avoid counter
634	 * overflows. 200W = 200 Joules/sec
635	 * Divide interval by 2 to avoid lockstep (2 * 100)
636	 * if hw unit is 32, then we use 2 ms 1/200/2
637	 */
638	rapl_timer_ms = 2;
639	if (rapl_hw_unit[0] < 32) {
640		rapl_timer_ms = (1000 / (2 * 100));
641		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
642	}
643	return 0;
644}
645
646static void __init rapl_advertise(void)
647{
648	int i;
649
650	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
651		hweight32(rapl_cntr_mask), rapl_timer_ms);
652
653	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
654		if (rapl_cntr_mask & (1 << i)) {
655			pr_info("hw unit of domain %s 2^-%d Joules\n",
656				rapl_domain_names[i], rapl_hw_unit[i]);
657		}
658	}
659}
660
661static void cleanup_rapl_pmus(void)
662{
663	int i;
664
665	for (i = 0; i < rapl_pmus->maxdie; i++)
666		kfree(rapl_pmus->pmus[i]);
667	kfree(rapl_pmus);
668}
669
/*
 * Per-domain "events" groups, installed as pmu.attr_update in
 * init_rapl_pmus(). NOTE(review): presumably perf shows only the groups of
 * domains that passed perf_msr_probe() - confirm against probe.c.
 */
static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};
678
679static int __init init_rapl_pmus(void)
680{
681	int maxdie = topology_max_packages() * topology_max_die_per_package();
682	size_t size;
683
684	size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
685	rapl_pmus = kzalloc(size, GFP_KERNEL);
686	if (!rapl_pmus)
687		return -ENOMEM;
688
689	rapl_pmus->maxdie		= maxdie;
690	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
691	rapl_pmus->pmu.attr_update	= rapl_attr_update;
692	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
693	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
694	rapl_pmus->pmu.add		= rapl_pmu_event_add;
695	rapl_pmus->pmu.del		= rapl_pmu_event_del;
696	rapl_pmus->pmu.start		= rapl_pmu_event_start;
697	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
698	rapl_pmus->pmu.read		= rapl_pmu_event_read;
699	rapl_pmus->pmu.module		= THIS_MODULE;
700	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
701	return 0;
702}
703
/*
 * Per-model descriptions referenced from rapl_model_match[]. Each one lists
 * the domains to probe, the power-unit MSR, the MSR table and, where
 * needed, a unit quirk.
 */

/* cores, package and gpu */
static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package and DRAM */
static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package, DRAM and gpu */
static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package and DRAM, with the fixed DRAM unit quirk */
static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* package and DRAM only, with the fixed DRAM unit quirk */
static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* all five domains */
static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_msrs,
};

/* cores, package, DRAM and psys, with the fixed psys unit quirk */
static struct rapl_model model_spr = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PSYS),
	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
	.msr_power_unit = MSR_RAPL_POWER_UNIT,
	.rapl_msrs      = intel_rapl_spr_msrs,
};

/* package only, using the AMD power-unit MSR and MSR table */
static struct rapl_model model_amd_hygon = {
	.events		= BIT(PERF_RAPL_PKG),
	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
	.rapl_msrs      = amd_rapl_msrs,
};
771
/*
 * CPU match table: maps a CPU feature or Intel family-6 model to the
 * rapl_model that rapl_pmu_init() picks up through driver_data.
 * The first entry matches on X86_FEATURE_RAPL rather than on a model
 * number and selects model_amd_hygon.
 */
static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_MATCH_FEATURE(X86_FEATURE_RAPL,		&model_amd_hygon),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&model_snb),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&model_snbep),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&model_knl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&model_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&model_spr),
	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X,	&model_spr),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S,	&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE,		&model_skl),
	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L,	&model_skl),
	{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
818
819static int __init rapl_pmu_init(void)
820{
821	const struct x86_cpu_id *id;
822	struct rapl_model *rm;
823	int ret;
824
825	id = x86_match_cpu(rapl_model_match);
826	if (!id)
827		return -ENODEV;
828
829	rm = (struct rapl_model *) id->driver_data;
830
831	rapl_msrs = rm->rapl_msrs;
832
833	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
834					false, (void *) &rm->events);
835
836	ret = rapl_check_hw_unit(rm);
837	if (ret)
838		return ret;
839
840	ret = init_rapl_pmus();
841	if (ret)
842		return ret;
843
844	/*
845	 * Install callbacks. Core will call them for each online cpu.
846	 */
847	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
848				"perf/x86/rapl:online",
849				rapl_cpu_online, rapl_cpu_offline);
850	if (ret)
851		goto out;
852
853	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
854	if (ret)
855		goto out1;
856
857	rapl_advertise();
858	return 0;
859
860out1:
861	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
862out:
863	pr_warn("Initialization failed (%d), disabled\n", ret);
864	cleanup_rapl_pmus();
865	return ret;
866}
867module_init(rapl_pmu_init);
868
/*
 * Module exit: remove the hotplug state without invoking the offline
 * callback, unregister the PMU, then free all per-die state and the
 * rapl_pmus container.
 */
static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);
876