1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * intel_powerclamp.c - package c-state idle injection
4 *
5 * Copyright (c) 2012-2023, Intel Corporation.
6 *
7 * Authors:
8 *     Arjan van de Ven <arjan@linux.intel.com>
9 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
10 *
11 *	TODO:
12 *           1. better handle wakeup from external interrupts, currently a fixed
13 *              compensation is added to clamping duration when excessive amount
14 *              of wakeups are observed during idle time. the reason is that in
15 *              case of external interrupts without need for ack, clamping down
16 *              cpu in non-irq context does not reduce irq. for majority of the
17 *              cases, clamping down cpu does help reduce irq as well, we should
18 *              be able to differentiate the two cases and give a quantitative
19 *              solution for the irqs that we can control. perhaps based on
20 *              get_cpu_iowait_time_us()
21 *
22 *	     2. synchronization with other hw blocks
23 */
24
25#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
26
27#include <linux/module.h>
28#include <linux/kernel.h>
29#include <linux/delay.h>
30#include <linux/cpu.h>
31#include <linux/thermal.h>
32#include <linux/debugfs.h>
33#include <linux/seq_file.h>
34#include <linux/idle_inject.h>
35
36#include <asm/msr.h>
37#include <asm/mwait.h>
38#include <asm/cpu_device_id.h>
39
40#define MAX_TARGET_RATIO (100U)
41/* For each undisturbed clamping period (no extra wake ups during idle time),
42 * we increment the confidence counter for the given target ratio.
43 * CONFIDENCE_OK defines the level where runtime calibration results are
44 * valid.
45 */
46#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to
 * meet the target idle ratio. Similar to frequency modulation.
 */
50#define DEFAULT_DURATION_JIFFIES (6)
51
52static unsigned int target_mwait;
53static struct dentry *debug_dir;
54static bool poll_pkg_cstate_enable;
55
56/* Idle ratio observed using package C-state counters */
57static unsigned int current_ratio;
58
/* When true, idle injection is skipped for the current cycle */
60static bool should_skip;
61
/* Runtime state of the clamping controller, protected by powerclamp_lock */
struct powerclamp_data {
	unsigned int cpu;		/* NOTE(review): not referenced in this file view - confirm use */
	unsigned int count;		/* injection cycles completed since clamping started */
	unsigned int guard;		/* allowed overshoot above target before skipping injection */
	unsigned int window_size_now;	/* snapshot of window_size for the current round */
	unsigned int target_ratio;	/* requested idle percentage (cooling device state) */
	bool clamping;			/* true while idle injection is active */
};

static struct powerclamp_data powerclamp_data;
72
73static struct thermal_cooling_device *cooling_dev;
74
75static DEFINE_MUTEX(powerclamp_lock);
76
77/* This duration is in microseconds */
78static unsigned int duration;
79static unsigned int pkg_cstate_ratio_cur;
80static unsigned int window_size;
81
82static int duration_set(const char *arg, const struct kernel_param *kp)
83{
84	int ret = 0;
85	unsigned long new_duration;
86
87	ret = kstrtoul(arg, 10, &new_duration);
88	if (ret)
89		goto exit;
90	if (new_duration > 25 || new_duration < 6) {
91		pr_err("Out of recommended range %lu, between 6-25ms\n",
92			new_duration);
93		ret = -EINVAL;
94		goto exit;
95	}
96
97	mutex_lock(&powerclamp_lock);
98	duration = clamp(new_duration, 6ul, 25ul) * 1000;
99	mutex_unlock(&powerclamp_lock);
100exit:
101
102	return ret;
103}
104
105static int duration_get(char *buf, const struct kernel_param *kp)
106{
107	int ret;
108
109	mutex_lock(&powerclamp_lock);
110	ret = sysfs_emit(buf, "%d\n", duration / 1000);
111	mutex_unlock(&powerclamp_lock);
112
113	return ret;
114}
115
116static const struct kernel_param_ops duration_ops = {
117	.set = duration_set,
118	.get = duration_get,
119};
120
121module_param_cb(duration, &duration_ops, NULL, 0644);
122MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
123
124#define DEFAULT_MAX_IDLE	50
125#define MAX_ALL_CPU_IDLE	75
126
127static u8 max_idle = DEFAULT_MAX_IDLE;
128
129static cpumask_var_t idle_injection_cpu_mask;
130
131static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask)
132{
133	if (cpumask_available(idle_injection_cpu_mask))
134		goto copy_mask;
135
136	/* This mask is allocated only one time and freed during module exit */
137	if (!alloc_cpumask_var(&idle_injection_cpu_mask, GFP_KERNEL))
138		return -ENOMEM;
139
140copy_mask:
141	cpumask_copy(idle_injection_cpu_mask, copy_mask);
142
143	return 0;
144}
145
146/* Return true if the cpumask and idle percent combination is invalid */
147static bool check_invalid(cpumask_var_t mask, u8 idle)
148{
149	if (cpumask_equal(cpu_present_mask, mask) && idle > MAX_ALL_CPU_IDLE)
150		return true;
151
152	return false;
153}
154
155static int cpumask_set(const char *arg, const struct kernel_param *kp)
156{
157	cpumask_var_t new_mask;
158	int ret;
159
160	mutex_lock(&powerclamp_lock);
161
162	/* Can't set mask when cooling device is in use */
163	if (powerclamp_data.clamping) {
164		ret = -EAGAIN;
165		goto skip_cpumask_set;
166	}
167
168	ret = alloc_cpumask_var(&new_mask, GFP_KERNEL);
169	if (!ret)
170		goto skip_cpumask_set;
171
172	ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
173			   nr_cpumask_bits);
174	if (ret)
175		goto free_cpumask_set;
176
177	if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
178		ret = -EINVAL;
179		goto free_cpumask_set;
180	}
181
182	/*
183	 * When module parameters are passed from kernel command line
184	 * during insmod, the module parameter callback is called
185	 * before powerclamp_init(), so we can't assume that some
186	 * cpumask can be allocated and copied before here. Also
187	 * in this case this cpumask is used as the default mask.
188	 */
189	ret = allocate_copy_idle_injection_mask(new_mask);
190
191free_cpumask_set:
192	free_cpumask_var(new_mask);
193skip_cpumask_set:
194	mutex_unlock(&powerclamp_lock);
195
196	return ret;
197}
198
199static int cpumask_get(char *buf, const struct kernel_param *kp)
200{
201	if (!cpumask_available(idle_injection_cpu_mask))
202		return -ENODEV;
203
204	return bitmap_print_to_pagebuf(false, buf, cpumask_bits(idle_injection_cpu_mask),
205				       nr_cpumask_bits);
206}
207
208static const struct kernel_param_ops cpumask_ops = {
209	.set = cpumask_set,
210	.get = cpumask_get,
211};
212
213module_param_cb(cpumask, &cpumask_ops, NULL, 0644);
214MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection.");
215
/*
 * Set the maximum allowed injected idle percentage (1-100).
 *
 * Rejected while clamping is active, when the value exceeds
 * MAX_TARGET_RATIO, or when combined with the current cpumask it would
 * idle all present CPUs beyond MAX_ALL_CPU_IDLE percent.
 */
static int max_idle_set(const char *arg, const struct kernel_param *kp)
{
	u8 new_max_idle;
	int ret = 0;

	mutex_lock(&powerclamp_lock);

	/* Can't change the limit while the cooling device is in use */
	if (powerclamp_data.clamping) {
		ret = -EAGAIN;
		goto skip_limit_set;
	}

	ret = kstrtou8(arg, 10, &new_max_idle);
	if (ret)
		goto skip_limit_set;

	if (new_max_idle > MAX_TARGET_RATIO) {
		ret = -EINVAL;
		goto skip_limit_set;
	}

	/*
	 * This callback can run before powerclamp_init() (parameter on
	 * the kernel command line), so the mask may not exist yet;
	 * default to all present CPUs in that case.
	 */
	if (!cpumask_available(idle_injection_cpu_mask)) {
		ret = allocate_copy_idle_injection_mask(cpu_present_mask);
		if (ret)
			goto skip_limit_set;
	}

	if (check_invalid(idle_injection_cpu_mask, new_max_idle)) {
		ret = -EINVAL;
		goto skip_limit_set;
	}

	max_idle = new_max_idle;

skip_limit_set:
	mutex_unlock(&powerclamp_lock);

	return ret;
}
256
257static const struct kernel_param_ops max_idle_ops = {
258	.set = max_idle_set,
259	.get = param_get_byte,
260};
261
262module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
263MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");
264
/* Per-target-ratio runtime calibration record */
struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * gets incremented each time a clamping
				    * period is completed without extra wakeups
				    * once that counter is reached given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
				     * mostly from external interrupts.
				     */
};

/* One calibration entry per possible target idle percentage */
static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
281
282static int window_size_set(const char *arg, const struct kernel_param *kp)
283{
284	int ret = 0;
285	unsigned long new_window_size;
286
287	ret = kstrtoul(arg, 10, &new_window_size);
288	if (ret)
289		goto exit_win;
290	if (new_window_size > 10 || new_window_size < 2) {
291		pr_err("Out of recommended window size %lu, between 2-10\n",
292			new_window_size);
293		ret = -EINVAL;
294	}
295
296	window_size = clamp(new_window_size, 2ul, 10ul);
297	smp_mb();
298
299exit_win:
300
301	return ret;
302}
303
304static const struct kernel_param_ops window_size_ops = {
305	.set = window_size_set,
306	.get = param_get_int,
307};
308
309module_param_cb(window_size, &window_size_ops, &window_size, 0644);
310MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
311	"\tpowerclamp controls idle ratio within this window. larger\n"
312	"\twindow size results in slower response time but more smooth\n"
313	"\tclamping results. default to 2.");
314
/*
 * Find the deepest MWAIT hint supported by the CPU and cache it in
 * target_mwait. The hint encodes the highest C-state and its deepest
 * sub-state as reported by CPUID leaf 5 (MONITOR/MWAIT).
 */
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	/* CPUID leaf 5 must exist on this CPU */
	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	/* MWAIT extensions and interrupt-break capability are required */
	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	/* EDX holds a sub-state count per C-state; scan for the deepest */
	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	/* Encode C-state index and zero-based sub-state as an MWAIT hint */
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}
342
/* Descriptor for one package C-state residency MSR */
struct pkg_cstate_info {
	bool skip;	/* set once the MSR failed to read; skip thereafter */
	int msr_index;	/* MSR_PKG_Cx_RESIDENCY register number */
	int cstate_id;	/* package C-state number (2, 3, 6, ...) */
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

/* All known package C-state residency counters; zero-terminated */
static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};
364
365static bool has_pkg_state_counter(void)
366{
367	u64 val;
368	struct pkg_cstate_info *info = pkg_cstates;
369
370	/* check if any one of the counter msrs exists */
371	while (info->msr_index) {
372		if (!rdmsrl_safe(info->msr_index, &val))
373			return true;
374		info++;
375	}
376
377	return false;
378}
379
380static u64 pkg_state_counter(void)
381{
382	u64 val;
383	u64 count = 0;
384	struct pkg_cstate_info *info = pkg_cstates;
385
386	while (info->msr_index) {
387		if (!info->skip) {
388			if (!rdmsrl_safe(info->msr_index, &val))
389				count += val;
390			else
391				info->skip = true;
392		}
393		info++;
394	}
395
396	return count;
397}
398
/*
 * Look up the runtime calibration offset (in percentage points) to add
 * to @ratio so the observed package C-state residency matches the
 * requested idle ratio. Returns 0 unless the calibration entries for
 * the ratio and its neighbors have all reached CONFIDENCE_OK.
 *
 * NOTE(review): the interior branch indexes cal_data[ratio - 1], so
 * callers are expected to pass ratio >= 1 — the visible callers do
 * (target_ratio is > 0 while clamping), but confirm for new callers.
 */
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* Calibration data is only collected when pkg C-state polling works */
	if (!poll_pkg_cstate_enable)
		return 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		/* lowest ratio: average with the two entries above */
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		/* highest ratio: average with the two entries below */
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		/* interior ratio: average with both neighbors */
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}
435
/*
 * Update the steady-state compensation for @target_ratio from the
 * ratio error observed over the last window. Once an entry reaches
 * CONFIDENCE_OK it is frozen and consumed by get_compensation().
 */
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached.
	 */
	if (d->confidence >= CONFIDENCE_OK)
		return;

	/* shortfall between requested and achieved idle ratio */
	delta = powerclamp_data.target_ratio - current_ratio;
	/* filter out bad data: accept only a small shortfall, scaled by ratio */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			/* running average (rounded up) with the previous value */
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}
458
/*
 * Measure the package C-state vs TSC ratio over the last window,
 * store it in current_ratio and feed it to the calibration logic.
 *
 * Returns true when the measured idle ratio already meets or exceeds
 * target + guard, i.e. the caller should skip further injection.
 */
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	/* persist across calls so each window is measured incrementally */
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio; first call has no baseline */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);

	/* if we are above target+guard, skip */
	return powerclamp_data.target_ratio + guard <= current_ratio;
}
488
489/*
490 * This function calculates runtime from the current target ratio.
491 * This function gets called under powerclamp_lock.
492 */
493static unsigned int get_run_time(void)
494{
495	unsigned int compensated_ratio;
496	unsigned int runtime;
497
498	/*
499	 * make sure user selected ratio does not take effect until
500	 * the next round. adjust target_ratio if user has changed
501	 * target such that we can converge quickly.
502	 */
503	powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
504	powerclamp_data.window_size_now = window_size;
505
506	/*
507	 * systems may have different ability to enter package level
508	 * c-states, thus we need to compensate the injected idle ratio
509	 * to achieve the actual target reported by the HW.
510	 */
511	compensated_ratio = powerclamp_data.target_ratio +
512		get_compensation(powerclamp_data.target_ratio);
513	if (compensated_ratio <= 0)
514		compensated_ratio = 1;
515
516	runtime = duration * 100 / compensated_ratio - duration;
517
518	return runtime;
519}
520
521/*
522 * 1 HZ polling while clamping is active, useful for userspace
523 * to monitor actual idle ratio.
524 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
/*
 * Periodically sample the package C-state residency vs TSC ratio and
 * publish it in pkg_cstate_ratio_cur for userspace monitoring. The
 * work reschedules itself at 1 HZ while clamping is active.
 */
static void poll_pkg_cstate(struct work_struct *dummy)
{
	/* baselines from the previous sample; zero on first invocation */
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio; first call has no baseline */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	/* keep polling only while the cooling device is clamping */
	mutex_lock(&powerclamp_lock);
	if (powerclamp_data.clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
	mutex_unlock(&powerclamp_lock);
}
559
560static struct idle_inject_device *ii_dev;
561
562/*
563 * This function is called from idle injection core on timer expiry
564 * for the run duration. This allows powerclamp to readjust or skip
565 * injecting idle for this cycle.
566 */
567static bool idle_inject_update(void)
568{
569	bool update = false;
570
571	/* We can't sleep in this callback */
572	if (!mutex_trylock(&powerclamp_lock))
573		return true;
574
575	if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {
576
577		should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio,
578							 powerclamp_data.guard,
579							 powerclamp_data.window_size_now);
580		update = true;
581	}
582
583	if (update) {
584		unsigned int runtime = get_run_time();
585
586		idle_inject_set_duration(ii_dev, runtime, duration);
587	}
588
589	powerclamp_data.count++;
590
591	mutex_unlock(&powerclamp_lock);
592
593	if (should_skip)
594		return false;
595
596	return true;
597}
598
599/* This function starts idle injection by calling idle_inject_start() */
600static void trigger_idle_injection(void)
601{
602	unsigned int runtime = get_run_time();
603
604	idle_inject_set_duration(ii_dev, runtime, duration);
605	idle_inject_start(ii_dev);
606	powerclamp_data.clamping = true;
607}
608
609/*
610 * This function is called from start_power_clamp() to register
611 * CPUS with powercap idle injection register and set default
612 * idle duration and latency.
613 */
/*
 * Register the idle injection CPUs with the powercap idle injection
 * core and set default duration and latency.
 *
 * When all present CPUs are injected, register with the per-cycle
 * update callback; package C-state polling is additionally enabled on
 * single-package, single-die systems where the package counters
 * reflect the whole mask.
 */
static int powerclamp_idle_injection_register(void)
{
	poll_pkg_cstate_enable = false;
	if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
		ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update);
		if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
			poll_pkg_cstate_enable = true;
	} else {
		ii_dev = idle_inject_register(idle_injection_cpu_mask);
	}

	if (!ii_dev) {
		pr_err("powerclamp: idle_inject_register failed\n");
		return -EAGAIN;
	}

	idle_inject_set_duration(ii_dev, TICK_USEC, duration);
	idle_inject_set_latency(ii_dev, UINT_MAX);

	return 0;
}
635
636/*
637 * This function is called from end_power_clamp() to stop idle injection
638 * and unregister CPUS from powercap idle injection core.
639 */
640static void remove_idle_injection(void)
641{
642	if (!powerclamp_data.clamping)
643		return;
644
645	powerclamp_data.clamping = false;
646	idle_inject_stop(ii_dev);
647}
648
649/*
650 * This function is called when user change the cooling device
651 * state from zero to some other value.
652 */
653static int start_power_clamp(void)
654{
655	int ret;
656
657	ret = powerclamp_idle_injection_register();
658	if (!ret) {
659		trigger_idle_injection();
660		if (poll_pkg_cstate_enable)
661			schedule_delayed_work(&poll_pkg_cstate_work, 0);
662	}
663
664	return ret;
665}
666
667/*
668 * This function is called when user change the cooling device
669 * state from non zero value zero.
670 */
671static void end_power_clamp(void)
672{
673	if (powerclamp_data.clamping) {
674		remove_idle_injection();
675		idle_inject_unregister(ii_dev);
676	}
677}
678
679static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
680				 unsigned long *state)
681{
682	*state = MAX_TARGET_RATIO;
683
684	return 0;
685}
686
687static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
688				 unsigned long *state)
689{
690	mutex_lock(&powerclamp_lock);
691	*state = powerclamp_data.target_ratio;
692	mutex_unlock(&powerclamp_lock);
693
694	return 0;
695}
696
/*
 * Cooling device state change handler.
 *
 * Three transitions are handled under powerclamp_lock:
 *   0 -> N : start idle injection at ratio N
 *   N -> 0 : stop idle injection
 *   N -> M : adjust the injection duration of a running clamp
 * The requested ratio is clamped to [0, max_idle - 1].
 */
static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	mutex_lock(&powerclamp_lock);

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (max_idle - 1));

	/* nothing to do when the ratio is unchanged */
	if (powerclamp_data.target_ratio == new_target_ratio)
		goto exit_set;

	if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		powerclamp_data.target_ratio = new_target_ratio;
		ret = start_power_clamp();
		if (ret)
			/* roll back so a later retry is seen as 0 -> N */
			powerclamp_data.target_ratio = 0;
		goto exit_set;
	} else	if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		powerclamp_data.target_ratio = 0;
	} else	/* adjust currently running */ {
		unsigned int runtime;

		powerclamp_data.target_ratio = new_target_ratio;
		runtime = get_run_time();
		idle_inject_set_duration(ii_dev, runtime, duration);
	}

exit_set:
	mutex_unlock(&powerclamp_lock);

	return ret;
}
734
735/* bind to generic thermal layer as cooling device*/
736static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
737	.get_max_state = powerclamp_get_max_state,
738	.get_cur_state = powerclamp_get_cur_state,
739	.set_cur_state = powerclamp_set_cur_state,
740};
741
742static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
743	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
744	{}
745};
746MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
747
748static int __init powerclamp_probe(void)
749{
750
751	if (!x86_match_cpu(intel_powerclamp_ids)) {
752		pr_err("CPU does not support MWAIT\n");
753		return -ENODEV;
754	}
755
756	/* The goal for idle time alignment is to achieve package cstate. */
757	if (!has_pkg_state_counter()) {
758		pr_info("No package C-state available\n");
759		return -ENODEV;
760	}
761
762	/* find the deepest mwait value */
763	find_target_mwait();
764
765	return 0;
766}
767
768static int powerclamp_debug_show(struct seq_file *m, void *unused)
769{
770	int i = 0;
771
772	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
773	for (i = 0; i < MAX_TARGET_RATIO; i++) {
774		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
775			i,
776			cal_data[i].confidence,
777			cal_data[i].steady_comp,
778			cal_data[i].dynamic_comp);
779	}
780
781	return 0;
782}
783
784DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
785
786static inline void powerclamp_create_debug_files(void)
787{
788	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
789
790	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
791			    &powerclamp_debug_fops);
792}
793
/*
 * Module init: probe CPU features, make sure the injection cpumask
 * exists (module parameter callbacks may already have created it),
 * set defaults and register the thermal cooling device.
 */
static int __init powerclamp_init(void)
{
	int retval;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		return retval;

	/* default to all present CPUs unless the cpumask param set one */
	mutex_lock(&powerclamp_lock);
	if (!cpumask_available(idle_injection_cpu_mask))
		retval = allocate_copy_idle_injection_mask(cpu_present_mask);
	mutex_unlock(&powerclamp_lock);

	if (retval)
		return retval;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						      &powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev))
		return -ENODEV;

	/* keep a duration already set via the module parameter */
	if (!duration)
		duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;
}
826module_init(powerclamp_init);
827
/*
 * Module exit: stop clamping, then tear down in the reverse order of
 * init — cooling device, polling work, debugfs, and finally the
 * injection cpumask allocated by allocate_copy_idle_injection_mask().
 */
static void __exit powerclamp_exit(void)
{
	mutex_lock(&powerclamp_lock);
	end_power_clamp();
	mutex_unlock(&powerclamp_lock);

	thermal_cooling_device_unregister(cooling_dev);

	/* clamping is false by now, so the work will not rearm itself */
	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);

	if (cpumask_available(idle_injection_cpu_mask))
		free_cpumask_var(idle_injection_cpu_mask);
}
842module_exit(powerclamp_exit);
843
844MODULE_IMPORT_NS(IDLE_INJECT);
845
846MODULE_LICENSE("GPL");
847MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
848MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
849MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
850