// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. Better handle wakeups from external interrupts. Currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. The
 *              reason is that for external interrupts that need no ack,
 *              clamping down the cpu in non-irq context does not reduce the
 *              irq rate. In the majority of cases clamping down the cpu does
 *              help reduce irqs as well, so we should be able to
 *              differentiate the two cases and give a quantitative solution
 *              for the irqs that we can control, perhaps based on
 *              get_cpu_iowait_time_us().
 *
 *	     2. Synchronization with other hw blocks.
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/*
 * For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/*
 * Default idle injection duration. The driver adjusts the sleep time to meet
 * the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;
static bool poll_pkg_cstate_enable;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stat and update
				  * control parameters. default to BSP but BSP
				  * can be offlined.
				  */
static bool clamping;

struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping kthread worker
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

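/*
 * Setter for the "duration" module parameter: the forced idle time of each
 * injection attempt, in milliseconds. An out-of-range value is reported with
 * -EINVAL, and the value stored in "duration" is still clamped to the
 * recommended 6-25 ms range.
 */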
static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * that gets incremented each time a clamping
				    * period is completed without extra wakeups.
				    * once that counter reaches a given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensation for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");

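/*
 * Query the CPUID MWAIT leaf to find the deepest C-state hint supported by
 * the CPU. The result is cached in target_mwait as an MWAIT hint value
 * (cstate << MWAIT_SUBSTATE_SIZE | sub-cstate).
 */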
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

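/*
 * Sum the residency counters of all package C-states that are still marked
 * as present. An MSR that fails to read is skipped on all later calls.
 */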
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

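/*
 * Look up the idle injection compensation for a target ratio. Returns 0 when
 * package C-state polling is disabled. The calibrated steady state
 * compensation is used only when the ratio and its two nearest neighbors have
 * reached CONFIDENCE_OK (at the table edges the two nearest inner entries are
 * used); the three values are averaged. When reduce_irq is set, a simple
 * penalty of doubling the injected idle time (comp = ratio) is used instead.
 * The result is capped so that ratio + comp stays below MAX_TARGET_RATIO.
 */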
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	if (!poll_pkg_cstate_enable)
		return 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

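/*
 * Runtime calibration: when the confidence level has not been reached yet,
 * the last window was not disturbed by excessive wakeups, and the achieved
 * ratio tracked the target closely, fold the remaining delta into the steady
 * state compensation for this target ratio and bump its confidence counter.
 */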
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Do not adjust compensation if the confidence level has already been
	 * reached, or if there were too many wakeups during the last idle
	 * injection period: in that case we cannot trust the data.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

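/*
 * Evaluate the last control window: compute the achieved package C-state
 * residency as a percentage of TSC cycles, update the calibration data, and
 * decide whether the next injection should be skipped because we are already
 * above target + guard. Also latches reduce_irq when wakeups were excessive.
 */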
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set flag such
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

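/*
 * Per-CPU balancing work: latch the current user settings, work out how long
 * this injection cycle should be for the compensated ratio, align the start
 * to the next interval boundary, and queue the delayed idle injection work.
 */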
static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * make sure user selected ratio does not take effect until
	 * the next round. adjust target_ratio if user has changed
	 * target such that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * systems may have different ability to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

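/*
 * Delayed idle injection work: the controlling CPU re-evaluates the controls
 * once per window, then every CPU injects one idle period via play_idle()
 * unless the last evaluation decided to skip, and finally requeues the
 * balancing work for the next cycle.
 */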
static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

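/*
 * Create one FIFO-priority kthread worker bound to the given CPU, mark it in
 * cpu_clamping_mask, and kick off its balancing work. stop_power_clamp_worker()
 * undoes all of this and waits for queued work to finish.
 */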
static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all works that get queued after this point see
	 * the clamping disabled. The counterpart is not needed because
	 * there is an implicit memory barrier when the queued work
	 * is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work still might be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroy kthread
	 * will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = cpumask_first(cpu_online_mask);

	clamping = true;
	if (poll_pkg_cstate_enable)
		schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping worker for cpu %d alive, destroy\n",
				 i);
			stop_power_clamp_worker(i);
		}
	}
}

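/*
 * CPU hotplug callbacks: when clamping is active, a newly onlined CPU gets
 * its own clamping worker (and takes over as controlling CPU if it is the
 * BSP), and a CPU going down has its worker stopped, with the controlling
 * CPU role migrated to another online CPU if needed.
 */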
static int powerclamp_cpu_online(unsigned int cpu)
{
	if (!clamping)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (!clamping)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping) {
		if (poll_pkg_cstate_enable)
			*state = pkg_cstate_ratio_cur;
		else
			*state = set_target_ratio;
	} else {
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */
	}

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else /* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static enum cpuhp_state hp_state;

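/*
 * Module init: allocate the clamping CPU bitmap, verify CPU support, register
 * the hotplug callbacks and per-CPU worker data, enable package C-state
 * polling on single-package/single-die systems, and finally register the
 * thermal cooling device and debugfs files.
 */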
static int __init powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
		poll_pkg_cstate_enable = true;

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");