1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * intel_idle.c - native hardware idle loop for modern Intel processors
4 *
5 * Copyright (c) 2013 - 2020, Intel Corporation.
6 * Len Brown <len.brown@intel.com>
7 * Rafael J. Wysocki <rafael.j.wysocki@intel.com>
8 */
9
10/*
11 * intel_idle is a cpuidle driver that loads on all Intel CPUs with MWAIT
12 * in lieu of the legacy ACPI processor_idle driver.  The intent is to
13 * make Linux more efficient on these processors, as intel_idle knows
14 * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
15 */
16
17/*
18 * Design Assumptions
19 *
20 * All CPUs have same idle states as boot CPU
21 *
22 * Chipset BM_STS (bus master status) bit is a NOP
23 *	for preventing entry into deep C-states
24 *
25 * CPU will flush caches as needed when entering a C-state via MWAIT
26 *	(in contrast to entering ACPI C3, in which case the WBINVD
27 *	instruction needs to be executed to flush the caches)
28 */
29
30/*
31 * Known limitations
32 *
 * ACPI has a .suspend hack to turn off deep c-states during suspend
34 * to avoid complications with the lapic timer workaround.
35 * Have not seen issues with suspend, but may need same workaround here.
36 *
37 */
38
/*
 * un-comment DEBUG to enable pr_debug() statements.
 * Kept commented out by default: defining DEBUG here would force-enable
 * pr_debug() output for every build of this driver (and must not leak into
 * the kernel headers included below).
 */
/* #define DEBUG */
41
42#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
43
44#include <linux/acpi.h>
45#include <linux/kernel.h>
46#include <linux/cpuidle.h>
47#include <linux/tick.h>
48#include <trace/events/power.h>
49#include <linux/sched.h>
50#include <linux/sched/smt.h>
51#include <linux/notifier.h>
52#include <linux/cpu.h>
53#include <linux/moduleparam.h>
54#include <asm/cpu_device_id.h>
55#include <asm/intel-family.h>
56#include <asm/nospec-branch.h>
57#include <asm/mwait.h>
58#include <asm/msr.h>
59
60#define INTEL_IDLE_VERSION "0.5.1"
61
static struct cpuidle_driver intel_idle_driver = {
	.name = "intel_idle",
	.owner = THIS_MODULE,
};
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = CPUIDLE_STATE_MAX - 1;
/* Bitmask of state indices to start out disabled -- presumably a module
 * parameter; registration is not visible in this chunk. */
static unsigned int disabled_states_mask;

/* Per-CPU cpuidle devices backing intel_idle_driver. */
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;

/*
 * Boot-time copies of the quirk fields of the matched struct idle_cpu --
 * presumably filled in during init from the CPU model match; confirm at the
 * init site (not visible in this chunk).
 */
static unsigned long auto_demotion_disable_flags;
static bool disable_promotion_to_c1e;

/**
 * struct idle_cpu - per-CPU-model idle driver configuration.
 * @state_table: C-state table to use for this CPU model.
 * @auto_demotion_disable_flags: auto-demotion enable bits to clear (see below).
 * @byt_auto_demotion_disable_flag: Bay Trail / Cherry Trail specific quirk
 *	(set only by idle_cpu_byt and idle_cpu_cht below).
 * @disable_promotion_to_c1e: clear the hardware's C1E promotion setting.
 * @use_acpi: take ACPI idle state information into account for this model --
 *	TODO confirm exact semantics at the (not visible) init code.
 */
struct idle_cpu {
	struct cpuidle_state *state_table;

	/*
	 * Hardware C-state auto-demotion may not always be optimal.
	 * Indicate which enable bits to clear here.
	 */
	unsigned long auto_demotion_disable_flags;
	bool byt_auto_demotion_disable_flag;
	bool disable_promotion_to_c1e;
	bool use_acpi;
};

/* Match result and state table chosen for the boot CPU (init time only). */
static const struct idle_cpu *icpu __initdata;
static struct cpuidle_state *cpuidle_state_table __initdata;

/* MWAIT sub-state availability data -- presumably from CPUID; confirm at the
 * init site (not visible in this chunk). */
static unsigned int mwait_substates __initdata;
92
93/*
94 * Enable this state by default even if the ACPI _CST does not list it.
95 */
96#define CPUIDLE_FLAG_ALWAYS_ENABLE	BIT(15)
97
98/*
99 * Disable IBRS across idle (when KERNEL_IBRS), is exclusive vs IRQ_ENABLE
100 * above.
101 */
102#define CPUIDLE_FLAG_IBRS		BIT(16)
103
104/*
105 * MWAIT takes an 8-bit "hint" in EAX "suggesting"
106 * the C-state (top nibble) and sub-state (bottom nibble)
107 * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc.
108 *
109 * We store the hint at the top of our "flags" for each state.
110 */
111#define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)
112#define MWAIT2flg(eax) ((eax & 0xFF) << 24)
113
114/**
115 * intel_idle - Ask the processor to enter the given idle state.
116 * @dev: cpuidle device of the target CPU.
117 * @drv: cpuidle driver (assumed to point to intel_idle_driver).
118 * @index: Target idle state index.
119 *
120 * Use the MWAIT instruction to notify the processor that the CPU represented by
121 * @dev is idle and it can try to enter the idle state corresponding to @index.
122 *
123 * If the local APIC timer is not known to be reliable in the target idle state,
124 * enable one-shot tick broadcasting for the target CPU before executing MWAIT.
125 *
126 * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to
127 * flushing user TLBs.
128 *
129 * Must be called under local_irq_disable().
130 */
131static __cpuidle int intel_idle(struct cpuidle_device *dev,
132				struct cpuidle_driver *drv, int index)
133{
134	struct cpuidle_state *state = &drv->states[index];
135	unsigned long eax = flg2MWAIT(state->flags);
136	unsigned long ecx = 1; /* break on interrupt flag */
137
138	mwait_idle_with_hints(eax, ecx);
139
140	return index;
141}
142
/**
 * intel_idle_ibrs - intel_idle() variant for states with CPUIDLE_FLAG_IBRS.
 * @dev: cpuidle device of the target CPU.
 * @drv: cpuidle driver (assumed to point to intel_idle_driver).
 * @index: Target idle state index.
 *
 * If SMT is active, clear MSR_IA32_SPEC_CTRL before entering idle and restore
 * the previously read value on wakeup -- presumably so this thread's IBRS
 * setting does not affect the sibling while this CPU idles (confirm against
 * the commit introducing CPUIDLE_FLAG_IBRS).
 */
static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
				     struct cpuidle_driver *drv, int index)
{
	bool smt_active = sched_smt_active();
	u64 spec_ctrl = spec_ctrl_current();
	int ret;

	/* Save/clear order matters: spec_ctrl was captured above. */
	if (smt_active)
		wrmsrl(MSR_IA32_SPEC_CTRL, 0);

	ret = intel_idle(dev, drv, index);

	if (smt_active)
		wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);

	return ret;
}
160
161/**
162 * intel_idle_s2idle - Ask the processor to enter the given idle state.
163 * @dev: cpuidle device of the target CPU.
164 * @drv: cpuidle driver (assumed to point to intel_idle_driver).
165 * @index: Target idle state index.
166 *
167 * Use the MWAIT instruction to notify the processor that the CPU represented by
168 * @dev is idle and it can try to enter the idle state corresponding to @index.
169 *
170 * Invoked as a suspend-to-idle callback routine with frozen user space, frozen
171 * scheduler tick and suspended scheduler clock on the target CPU.
172 */
173static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev,
174				       struct cpuidle_driver *drv, int index)
175{
176	unsigned long eax = flg2MWAIT(drv->states[index].flags);
177	unsigned long ecx = 1; /* break on interrupt flag */
178
179	mwait_idle_with_hints(eax, ecx);
180
181	return 0;
182}
183
184/*
185 * States are indexed by the cstate number,
186 * which is also the index into the MWAIT hint array.
187 * Thus C0 is a dummy.
188 */
189static struct cpuidle_state nehalem_cstates[] __initdata = {
190	{
191		.name = "C1",
192		.desc = "MWAIT 0x00",
193		.flags = MWAIT2flg(0x00),
194		.exit_latency = 3,
195		.target_residency = 6,
196		.enter = &intel_idle,
197		.enter_s2idle = intel_idle_s2idle, },
198	{
199		.name = "C1E",
200		.desc = "MWAIT 0x01",
201		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
202		.exit_latency = 10,
203		.target_residency = 20,
204		.enter = &intel_idle,
205		.enter_s2idle = intel_idle_s2idle, },
206	{
207		.name = "C3",
208		.desc = "MWAIT 0x10",
209		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
210		.exit_latency = 20,
211		.target_residency = 80,
212		.enter = &intel_idle,
213		.enter_s2idle = intel_idle_s2idle, },
214	{
215		.name = "C6",
216		.desc = "MWAIT 0x20",
217		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
218		.exit_latency = 200,
219		.target_residency = 800,
220		.enter = &intel_idle,
221		.enter_s2idle = intel_idle_s2idle, },
222	{
223		.enter = NULL }
224};
225
/* Sandy Bridge. */
static struct cpuidle_state snb_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C3",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 80,
		.target_residency = 211,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 104,
		.target_residency = 345,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7",
		.desc = "MWAIT 0x30",
		.flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 109,
		.target_residency = 345,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
270
/* Bay Trail (Silvermont Atom) -- see the ATOM_SILVERMONT match below. */
static struct cpuidle_state byt_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6N",
		.desc = "MWAIT 0x58",
		.flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 300,
		.target_residency = 275,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6S",
		.desc = "MWAIT 0x52",
		.flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 500,
		.target_residency = 560,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7",
		.desc = "MWAIT 0x60",
		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 1200,
		.target_residency = 4000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7S",
		.desc = "MWAIT 0x64",
		.flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 10000,
		.target_residency = 20000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
315
/* Cherry Trail (Airmont Atom) -- same states as Bay Trail, lower latencies. */
static struct cpuidle_state cht_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6N",
		.desc = "MWAIT 0x58",
		.flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 80,
		.target_residency = 275,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6S",
		.desc = "MWAIT 0x52",
		.flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 200,
		.target_residency = 560,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7",
		.desc = "MWAIT 0x60",
		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 1200,
		.target_residency = 4000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7S",
		.desc = "MWAIT 0x64",
		.flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 10000,
		.target_residency = 20000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
360
/* Ivy Bridge (client). */
static struct cpuidle_state ivb_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C3",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 59,
		.target_residency = 156,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 80,
		.target_residency = 300,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7",
		.desc = "MWAIT 0x30",
		.flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 87,
		.target_residency = 300,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
405
/* Ivy Bridge Xeon (IVYBRIDGE_X), default (smaller-socket-count) table. */
static struct cpuidle_state ivt_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 80,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C3",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 59,
		.target_residency = 156,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 82,
		.target_residency = 300,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
442
/* Ivy Bridge Xeon, "_4s" variant (presumably 4-socket; selection logic is not
 * visible in this chunk) -- higher residencies than the default table. */
static struct cpuidle_state ivt_cstates_4s[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 250,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C3",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 59,
		.target_residency = 300,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 84,
		.target_residency = 400,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
479
/* Ivy Bridge Xeon, "_8s" variant (presumably 8-socket; selection logic is not
 * visible in this chunk) -- highest residencies of the three IVT tables. */
static struct cpuidle_state ivt_cstates_8s[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 500,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C3",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 59,
		.target_residency = 600,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 88,
		.target_residency = 700,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
516
/* Haswell. */
static struct cpuidle_state hsw_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C3",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 33,
		.target_residency = 100,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 133,
		.target_residency = 400,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7s",
		.desc = "MWAIT 0x32",
		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 166,
		.target_residency = 500,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C8",
		.desc = "MWAIT 0x40",
		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 300,
		.target_residency = 900,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C9",
		.desc = "MWAIT 0x50",
		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 600,
		.target_residency = 1800,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C10",
		.desc = "MWAIT 0x60",
		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 2600,
		.target_residency = 7700,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
/* Broadwell -- same state set as Haswell with a slower C3 exit. */
static struct cpuidle_state bdw_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C3",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 40,
		.target_residency = 100,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 133,
		.target_residency = 400,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7s",
		.desc = "MWAIT 0x32",
		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 166,
		.target_residency = 500,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C8",
		.desc = "MWAIT 0x40",
		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 300,
		.target_residency = 900,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C9",
		.desc = "MWAIT 0x50",
		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 600,
		.target_residency = 1800,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C10",
		.desc = "MWAIT 0x60",
		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 2600,
		.target_residency = 7700,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
653
/* Skylake/Kaby Lake client.  Deep states carry CPUIDLE_FLAG_IBRS, i.e. they
 * enter through intel_idle_ibrs() when KERNEL_IBRS is in use. */
static struct cpuidle_state skl_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C3",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 70,
		.target_residency = 100,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
		.exit_latency = 85,
		.target_residency = 200,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7s",
		.desc = "MWAIT 0x33",
		.flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
		.exit_latency = 124,
		.target_residency = 800,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C8",
		.desc = "MWAIT 0x40",
		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
		.exit_latency = 200,
		.target_residency = 800,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C9",
		.desc = "MWAIT 0x50",
		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
		.exit_latency = 480,
		.target_residency = 5000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C10",
		.desc = "MWAIT 0x60",
		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
		.exit_latency = 890,
		.target_residency = 5000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
722
/* Skylake-X (server). */
static struct cpuidle_state skx_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
		.exit_latency = 133,
		.target_residency = 600,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
751
/* Ice Lake Xeon (ICELAKE_X). */
static struct cpuidle_state icx_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 1,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 4,
		.target_residency = 4,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 128,
		.target_residency = 384,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
780
/* Original Atom (Bonnell/Saltwell).  Note: hint 0x00 maps to C1E here. */
static struct cpuidle_state atom_cstates[] __initdata = {
	{
		.name = "C1E",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C2",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10),
		.exit_latency = 20,
		.target_residency = 80,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C4",
		.desc = "MWAIT 0x30",
		.flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 100,
		.target_residency = 400,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x52",
		.flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 140,
		.target_residency = 560,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
/* Tangier/Merrifield (ATOM_SILVERMONT_MID). */
static struct cpuidle_state tangier_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 4,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C4",
		.desc = "MWAIT 0x30",
		.flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 100,
		.target_residency = 400,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x52",
		.flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 140,
		.target_residency = 560,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7",
		.desc = "MWAIT 0x60",
		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 1200,
		.target_residency = 4000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C9",
		.desc = "MWAIT 0x64",
		.flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 10000,
		.target_residency = 20000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
/* Avoton/Rangeley (ATOM_SILVERMONT_D). */
static struct cpuidle_state avn_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x51",
		.flags = MWAIT2flg(0x51) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 15,
		.target_residency = 45,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
/* Knights Landing (XEON_PHI_KNL). */
static struct cpuidle_state knl_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 1,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle },
	{
		.name = "C6",
		.desc = "MWAIT 0x10",
		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 120,
		.target_residency = 500,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle },
	{
		.enter = NULL }
};
901
/* Broxton family Atom -- presumably Apollo Lake/Gemini Lake; the matching
 * model entries are outside this chunk. */
static struct cpuidle_state bxt_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 133,
		.target_residency = 133,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C7s",
		.desc = "MWAIT 0x31",
		.flags = MWAIT2flg(0x31) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 155,
		.target_residency = 155,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C8",
		.desc = "MWAIT 0x40",
		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 1000,
		.target_residency = 1000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C9",
		.desc = "MWAIT 0x50",
		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 2000,
		.target_residency = 2000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C10",
		.desc = "MWAIT 0x60",
		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 10000,
		.target_residency = 10000,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
962
/* Denverton-class Atom -- presumably; the matching model entries are outside
 * this chunk. */
static struct cpuidle_state dnv_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "MWAIT 0x00",
		.flags = MWAIT2flg(0x00),
		.exit_latency = 2,
		.target_residency = 2,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C1E",
		.desc = "MWAIT 0x01",
		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
		.exit_latency = 10,
		.target_residency = 20,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.name = "C6",
		.desc = "MWAIT 0x20",
		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
		.exit_latency = 50,
		.target_residency = 500,
		.enter = &intel_idle,
		.enter_s2idle = intel_idle_s2idle, },
	{
		.enter = NULL }
};
991
/*
 * Per-model configurations: which C-state table to register and which
 * model-specific quirk flags apply (see struct idle_cpu above).
 */
static const struct idle_cpu idle_cpu_nehalem __initconst = {
	.state_table = nehalem_cstates,
	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
	.disable_promotion_to_c1e = true,
};

static const struct idle_cpu idle_cpu_nhx __initconst = {
	.state_table = nehalem_cstates,
	.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_atom __initconst = {
	.state_table = atom_cstates,
};

static const struct idle_cpu idle_cpu_tangier __initconst = {
	.state_table = tangier_cstates,
};

static const struct idle_cpu idle_cpu_lincroft __initconst = {
	.state_table = atom_cstates,
	.auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
};

static const struct idle_cpu idle_cpu_snb __initconst = {
	.state_table = snb_cstates,
	.disable_promotion_to_c1e = true,
};

static const struct idle_cpu idle_cpu_snx __initconst = {
	.state_table = snb_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_byt __initconst = {
	.state_table = byt_cstates,
	.disable_promotion_to_c1e = true,
	.byt_auto_demotion_disable_flag = true,
};

static const struct idle_cpu idle_cpu_cht __initconst = {
	.state_table = cht_cstates,
	.disable_promotion_to_c1e = true,
	.byt_auto_demotion_disable_flag = true,
};

static const struct idle_cpu idle_cpu_ivb __initconst = {
	.state_table = ivb_cstates,
	.disable_promotion_to_c1e = true,
};

static const struct idle_cpu idle_cpu_ivt __initconst = {
	.state_table = ivt_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_hsw __initconst = {
	.state_table = hsw_cstates,
	.disable_promotion_to_c1e = true,
};

static const struct idle_cpu idle_cpu_hsx __initconst = {
	.state_table = hsw_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_bdw __initconst = {
	.state_table = bdw_cstates,
	.disable_promotion_to_c1e = true,
};

static const struct idle_cpu idle_cpu_bdx __initconst = {
	.state_table = bdw_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_skl __initconst = {
	.state_table = skl_cstates,
	.disable_promotion_to_c1e = true,
};

static const struct idle_cpu idle_cpu_skx __initconst = {
	.state_table = skx_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_icx __initconst = {
	.state_table = icx_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_avn __initconst = {
	.state_table = avn_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_knl __initconst = {
	.state_table = knl_cstates,
	.use_acpi = true,
};

static const struct idle_cpu idle_cpu_bxt __initconst = {
	.state_table = bxt_cstates,
	.disable_promotion_to_c1e = true,
};

static const struct idle_cpu idle_cpu_dnv __initconst = {
	.state_table = dnv_cstates,
	.disable_promotion_to_c1e = true,
	.use_acpi = true,
};
1112
/*
 * CPU-model match table: maps family-6 model IDs to the idle_cpu
 * configuration above.  Consulted by intel_idle_init(); models absent
 * from this table may still be handled via intel_mwait_ids + ACPI _CST.
 */
static const struct x86_cpu_id intel_idle_ids[] __initconst = {
	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP,		&idle_cpu_nhx),
	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM,		&idle_cpu_nehalem),
	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_G,		&idle_cpu_nehalem),
	X86_MATCH_INTEL_FAM6_MODEL(WESTMERE,		&idle_cpu_nehalem),
	X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP,		&idle_cpu_nhx),
	X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX,		&idle_cpu_nhx),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_BONNELL,	&idle_cpu_atom),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_BONNELL_MID,	&idle_cpu_lincroft),
	X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX,		&idle_cpu_nhx),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&idle_cpu_snb),
	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&idle_cpu_snx),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL,	&idle_cpu_atom),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT,	&idle_cpu_byt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&idle_cpu_tangier),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT,	&idle_cpu_cht),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&idle_cpu_ivb),
	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&idle_cpu_ivt),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&idle_cpu_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&idle_cpu_hsx),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&idle_cpu_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&idle_cpu_hsw),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_D,	&idle_cpu_avn),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&idle_cpu_bdw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&idle_cpu_bdw),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&idle_cpu_bdx),
	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&idle_cpu_bdx),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&idle_cpu_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&idle_cpu_skl),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&idle_cpu_skl),
	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&idle_cpu_skl),
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&idle_cpu_skx),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&idle_cpu_icx),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&idle_cpu_knl),
	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&idle_cpu_knl),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&idle_cpu_bxt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&idle_cpu_bxt),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&idle_cpu_dnv),
	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&idle_cpu_dnv),
	{}
};
1154
/*
 * Fallback match: any Intel family-6 CPU with MWAIT.  Used by
 * intel_idle_init() when the model is not in intel_idle_ids[], in which
 * case the idle states come from ACPI _CST only (driver_data is NULL).
 */
static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
	X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
	{}
};
1159
1160static bool __init intel_idle_max_cstate_reached(int cstate)
1161{
1162	if (cstate + 1 > max_cstate) {
1163		pr_info("max_cstate %d reached\n", max_cstate);
1164		return true;
1165	}
1166	return false;
1167}
1168
1169static bool __init intel_idle_state_needs_timer_stop(struct cpuidle_state *state)
1170{
1171	unsigned long eax = flg2MWAIT(state->flags);
1172
1173	if (boot_cpu_has(X86_FEATURE_ARAT))
1174		return false;
1175
1176	/*
1177	 * Switch over to one-shot tick broadcast if the target C-state
1178	 * is deeper than C1.
1179	 */
1180	return !!((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK);
1181}
1182
#ifdef CONFIG_ACPI_PROCESSOR_CSTATE
#include <acpi/processor.h>

/* intel_idle.no_acpi=1: never consult ACPI _CST (checked in _cst_extract). */
static bool no_acpi __read_mostly;
module_param(no_acpi, bool, 0444);
MODULE_PARM_DESC(no_acpi, "Do not use ACPI _CST for building the idle states list");

/* intel_idle.use_acpi=1: use _CST even for models without .use_acpi set. */
static bool force_use_acpi __read_mostly; /* No effect if no_acpi is set. */
module_param_named(use_acpi, force_use_acpi, bool, 0444);
MODULE_PARM_DESC(use_acpi, "Use ACPI _CST for building the idle states list");

/* _CST data of one CPU, applied to all (see "Design Assumptions" above). */
static struct acpi_processor_power acpi_state_table __initdata;
1195
1196/**
1197 * intel_idle_cst_usable - Check if the _CST information can be used.
1198 *
1199 * Check if all of the C-states listed by _CST in the max_cstate range are
1200 * ACPI_CSTATE_FFH, which means that they should be entered via MWAIT.
1201 */
1202static bool __init intel_idle_cst_usable(void)
1203{
1204	int cstate, limit;
1205
1206	limit = min_t(int, min_t(int, CPUIDLE_STATE_MAX, max_cstate + 1),
1207		      acpi_state_table.count);
1208
1209	for (cstate = 1; cstate < limit; cstate++) {
1210		struct acpi_processor_cx *cx = &acpi_state_table.states[cstate];
1211
1212		if (cx->entry_method != ACPI_CSTATE_FFH)
1213			return false;
1214	}
1215
1216	return true;
1217}
1218
/**
 * intel_idle_acpi_cst_extract - Extract usable _CST data from ACPI.
 *
 * Evaluate _CST for the CPUs in turn and stop at the first one that yields
 * a usable list; the result is stored in acpi_state_table for all CPUs
 * (per the "Design Assumptions" above).  Returns 'true' on success.
 */
static bool __init intel_idle_acpi_cst_extract(void)
{
	unsigned int cpu;

	if (no_acpi) {
		pr_debug("Not allowed to use ACPI _CST\n");
		return false;
	}

	for_each_possible_cpu(cpu) {
		struct acpi_processor *pr = per_cpu(processors, cpu);

		if (!pr)
			continue;

		if (acpi_processor_evaluate_cst(pr->handle, cpu, &acpi_state_table))
			continue;

		/* Bump count so that index 0 (unused here) is covered too;
		 * users of the table iterate from index 1. */
		acpi_state_table.count++;

		if (!intel_idle_cst_usable())
			continue;

		/* Platform refused to hand over _CST control: give up. */
		if (!acpi_processor_claim_cst_control())
			break;

		return true;
	}

	/* Failure: present an empty table to subsequent users. */
	acpi_state_table.count = 0;
	pr_debug("ACPI _CST not found or not usable\n");
	return false;
}
1252
1253static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv)
1254{
1255	int cstate, limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count);
1256
1257	/*
1258	 * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of
1259	 * the interesting states are ACPI_CSTATE_FFH.
1260	 */
1261	for (cstate = 1; cstate < limit; cstate++) {
1262		struct acpi_processor_cx *cx;
1263		struct cpuidle_state *state;
1264
1265		if (intel_idle_max_cstate_reached(cstate - 1))
1266			break;
1267
1268		cx = &acpi_state_table.states[cstate];
1269
1270		state = &drv->states[drv->state_count++];
1271
1272		snprintf(state->name, CPUIDLE_NAME_LEN, "C%d_ACPI", cstate);
1273		strlcpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
1274		state->exit_latency = cx->latency;
1275		/*
1276		 * For C1-type C-states use the same number for both the exit
1277		 * latency and target residency, because that is the case for
1278		 * C1 in the majority of the static C-states tables above.
1279		 * For the other types of C-states, however, set the target
1280		 * residency to 3 times the exit latency which should lead to
1281		 * a reasonable balance between energy-efficiency and
1282		 * performance in the majority of interesting cases.
1283		 */
1284		state->target_residency = cx->latency;
1285		if (cx->type > ACPI_STATE_C1)
1286			state->target_residency *= 3;
1287
1288		state->flags = MWAIT2flg(cx->address);
1289		if (cx->type > ACPI_STATE_C2)
1290			state->flags |= CPUIDLE_FLAG_TLB_FLUSHED;
1291
1292		if (disabled_states_mask & BIT(cstate))
1293			state->flags |= CPUIDLE_FLAG_OFF;
1294
1295		if (intel_idle_state_needs_timer_stop(state))
1296			state->flags |= CPUIDLE_FLAG_TIMER_STOP;
1297
1298		state->enter = intel_idle;
1299		state->enter_s2idle = intel_idle_s2idle;
1300	}
1301}
1302
1303static bool __init intel_idle_off_by_default(u32 mwait_hint)
1304{
1305	int cstate, limit;
1306
1307	/*
1308	 * If there are no _CST C-states, do not disable any C-states by
1309	 * default.
1310	 */
1311	if (!acpi_state_table.count)
1312		return false;
1313
1314	limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count);
1315	/*
1316	 * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of
1317	 * the interesting states are ACPI_CSTATE_FFH.
1318	 */
1319	for (cstate = 1; cstate < limit; cstate++) {
1320		if (acpi_state_table.states[cstate].address == mwait_hint)
1321			return false;
1322	}
1323	return true;
1324}
#else /* !CONFIG_ACPI_PROCESSOR_CSTATE */
#define force_use_acpi	(false)

/* Without ACPI processor C-state support, _CST is never consulted. */
static inline bool intel_idle_acpi_cst_extract(void) { return false; }
static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; }
#endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
1332
1333/**
1334 * ivt_idle_state_table_update - Tune the idle states table for Ivy Town.
1335 *
1336 * Tune IVT multi-socket targets.
1337 * Assumption: num_sockets == (max_package_num + 1).
1338 */
1339static void __init ivt_idle_state_table_update(void)
1340{
1341	/* IVT uses a different table for 1-2, 3-4, and > 4 sockets */
1342	int cpu, package_num, num_sockets = 1;
1343
1344	for_each_online_cpu(cpu) {
1345		package_num = topology_physical_package_id(cpu);
1346		if (package_num + 1 > num_sockets) {
1347			num_sockets = package_num + 1;
1348
1349			if (num_sockets > 4) {
1350				cpuidle_state_table = ivt_cstates_8s;
1351				return;
1352			}
1353		}
1354	}
1355
1356	if (num_sockets > 2)
1357		cpuidle_state_table = ivt_cstates_4s;
1358
1359	/* else, 1 and 2 socket systems use default ivt_cstates */
1360}
1361
1362/**
1363 * irtl_2_usec - IRTL to microseconds conversion.
1364 * @irtl: IRTL MSR value.
1365 *
1366 * Translate the IRTL (Interrupt Response Time Limit) MSR value to microseconds.
1367 */
1368static unsigned long long __init irtl_2_usec(unsigned long long irtl)
1369{
1370	static const unsigned int irtl_ns_units[] __initconst = {
1371		1, 32, 1024, 32768, 1048576, 33554432, 0, 0
1372	};
1373	unsigned long long ns;
1374
1375	if (!irtl)
1376		return 0;
1377
1378	ns = irtl_ns_units[(irtl >> 10) & 0x7];
1379
1380	return div_u64((irtl & 0x3FF) * ns, NSEC_PER_USEC);
1381}
1382
1383/**
1384 * bxt_idle_state_table_update - Fix up the Broxton idle states table.
1385 *
1386 * On BXT, trust the IRTL (Interrupt Response Time Limit) MSR to show the
1387 * definitive maximum latency and use the same value for target_residency.
1388 */
1389static void __init bxt_idle_state_table_update(void)
1390{
1391	unsigned long long msr;
1392	unsigned int usec;
1393
1394	rdmsrl(MSR_PKGC6_IRTL, msr);
1395	usec = irtl_2_usec(msr);
1396	if (usec) {
1397		bxt_cstates[2].exit_latency = usec;
1398		bxt_cstates[2].target_residency = usec;
1399	}
1400
1401	rdmsrl(MSR_PKGC7_IRTL, msr);
1402	usec = irtl_2_usec(msr);
1403	if (usec) {
1404		bxt_cstates[3].exit_latency = usec;
1405		bxt_cstates[3].target_residency = usec;
1406	}
1407
1408	rdmsrl(MSR_PKGC8_IRTL, msr);
1409	usec = irtl_2_usec(msr);
1410	if (usec) {
1411		bxt_cstates[4].exit_latency = usec;
1412		bxt_cstates[4].target_residency = usec;
1413	}
1414
1415	rdmsrl(MSR_PKGC9_IRTL, msr);
1416	usec = irtl_2_usec(msr);
1417	if (usec) {
1418		bxt_cstates[5].exit_latency = usec;
1419		bxt_cstates[5].target_residency = usec;
1420	}
1421
1422	rdmsrl(MSR_PKGC10_IRTL, msr);
1423	usec = irtl_2_usec(msr);
1424	if (usec) {
1425		bxt_cstates[6].exit_latency = usec;
1426		bxt_cstates[6].target_residency = usec;
1427	}
1428
1429}
1430
1431/**
1432 * sklh_idle_state_table_update - Fix up the Sky Lake idle states table.
1433 *
1434 * On SKL-H (model 0x5e) skip C8 and C9 if C10 is enabled and SGX disabled.
1435 */
1436static void __init sklh_idle_state_table_update(void)
1437{
1438	unsigned long long msr;
1439	unsigned int eax, ebx, ecx, edx;
1440
1441
1442	/* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */
1443	if (max_cstate <= 7)
1444		return;
1445
1446	/* if PC10 not present in CPUID.MWAIT.EDX */
1447	if ((mwait_substates & (0xF << 28)) == 0)
1448		return;
1449
1450	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
1451
1452	/* PC10 is not enabled in PKG C-state limit */
1453	if ((msr & 0xF) != 8)
1454		return;
1455
1456	ecx = 0;
1457	cpuid(7, &eax, &ebx, &ecx, &edx);
1458
1459	/* if SGX is present */
1460	if (ebx & (1 << 2)) {
1461
1462		rdmsrl(MSR_IA32_FEAT_CTL, msr);
1463
1464		/* if SGX is enabled */
1465		if (msr & (1 << 18))
1466			return;
1467	}
1468
1469	skl_cstates[5].flags |= CPUIDLE_FLAG_UNUSABLE;	/* C8-SKL */
1470	skl_cstates[6].flags |= CPUIDLE_FLAG_UNUSABLE;	/* C9-SKL */
1471}
1472
1473static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
1474{
1475	unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
1476	unsigned int num_substates = (mwait_substates >> mwait_cstate * 4) &
1477					MWAIT_SUBSTATE_MASK;
1478
1479	/* Ignore the C-state if there are NO sub-states in CPUID for it. */
1480	if (num_substates == 0)
1481		return false;
1482
1483	if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
1484		mark_tsc_unstable("TSC halts in idle states deeper than C2");
1485
1486	return true;
1487}
1488
/*
 * intel_idle_init_cstates_icpu - Build the idle states list from the
 * built-in cpuidle_state_table selected for this CPU model.
 * @drv: cpuidle driver object to populate (state 0 already set up).
 */
static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
{
	int cstate;

	/* Apply model-specific fixups to the static tables first. */
	switch (boot_cpu_data.x86_model) {
	case INTEL_FAM6_IVYBRIDGE_X:
		ivt_idle_state_table_update();
		break;
	case INTEL_FAM6_ATOM_GOLDMONT:
	case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
		bxt_idle_state_table_update();
		break;
	case INTEL_FAM6_SKYLAKE:
		sklh_idle_state_table_update();
		break;
	}

	for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
		unsigned int mwait_hint;

		if (intel_idle_max_cstate_reached(cstate))
			break;

		/* The table ends at the first entry with no enter callbacks. */
		if (!cpuidle_state_table[cstate].enter &&
		    !cpuidle_state_table[cstate].enter_s2idle)
			break;

		/* If marked as unusable, skip this state. */
		if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) {
			pr_debug("state %s is disabled\n",
				 cpuidle_state_table[cstate].name);
			continue;
		}

		/* Skip states without MWAIT sub-state support in CPUID. */
		mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
		if (!intel_idle_verify_cstate(mwait_hint))
			continue;

		/* Structure copy. */
		drv->states[drv->state_count] = cpuidle_state_table[cstate];

		/* Use the IBRS-aware enter routine where the state needs it. */
		if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
		    cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) {
			drv->states[drv->state_count].enter = intel_idle_ibrs;
		}

		/*
		 * Disable by default when requested via states_off, or when
		 * ACPI _CST is in use and does not list this state (unless
		 * the state is flagged always-enable).
		 */
		if ((disabled_states_mask & BIT(drv->state_count)) ||
		    ((icpu->use_acpi || force_use_acpi) &&
		     intel_idle_off_by_default(mwait_hint) &&
		     !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE)))
			drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF;

		if (intel_idle_state_needs_timer_stop(&drv->states[drv->state_count]))
			drv->states[drv->state_count].flags |= CPUIDLE_FLAG_TIMER_STOP;

		drv->state_count++;
	}

	/* Bay Trail quirk: clear the C6/MC6 demotion policy MSRs. */
	if (icpu->byt_auto_demotion_disable_flag) {
		wrmsrl(MSR_CC6_DEMOTION_POLICY_CONFIG, 0);
		wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0);
	}
}
1552
1553/**
1554 * intel_idle_cpuidle_driver_init - Create the list of available idle states.
1555 * @drv: cpuidle driver structure to initialize.
1556 */
1557static void __init intel_idle_cpuidle_driver_init(struct cpuidle_driver *drv)
1558{
1559	cpuidle_poll_state_init(drv);
1560
1561	if (disabled_states_mask & BIT(0))
1562		drv->states[0].flags |= CPUIDLE_FLAG_OFF;
1563
1564	drv->state_count = 1;
1565
1566	if (icpu)
1567		intel_idle_init_cstates_icpu(drv);
1568	else
1569		intel_idle_init_cstates_acpi(drv);
1570}
1571
1572static void auto_demotion_disable(void)
1573{
1574	unsigned long long msr_bits;
1575
1576	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
1577	msr_bits &= ~auto_demotion_disable_flags;
1578	wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
1579}
1580
1581static void c1e_promotion_disable(void)
1582{
1583	unsigned long long msr_bits;
1584
1585	rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
1586	msr_bits &= ~0x2;
1587	wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
1588}
1589
1590/**
1591 * intel_idle_cpu_init - Register the target CPU with the cpuidle core.
1592 * @cpu: CPU to initialize.
1593 *
1594 * Register a cpuidle device object for @cpu and update its MSRs in accordance
1595 * with the processor model flags.
1596 */
1597static int intel_idle_cpu_init(unsigned int cpu)
1598{
1599	struct cpuidle_device *dev;
1600
1601	dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
1602	dev->cpu = cpu;
1603
1604	if (cpuidle_register_device(dev)) {
1605		pr_debug("cpuidle_register_device %d failed!\n", cpu);
1606		return -EIO;
1607	}
1608
1609	if (auto_demotion_disable_flags)
1610		auto_demotion_disable();
1611
1612	if (disable_promotion_to_c1e)
1613		c1e_promotion_disable();
1614
1615	return 0;
1616}
1617
1618static int intel_idle_cpu_online(unsigned int cpu)
1619{
1620	struct cpuidle_device *dev;
1621
1622	if (!boot_cpu_has(X86_FEATURE_ARAT))
1623		tick_broadcast_enable();
1624
1625	/*
1626	 * Some systems can hotplug a cpu at runtime after
1627	 * the kernel has booted, we have to initialize the
1628	 * driver in this case
1629	 */
1630	dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
1631	if (!dev->registered)
1632		return intel_idle_cpu_init(cpu);
1633
1634	return 0;
1635}
1636
1637/**
1638 * intel_idle_cpuidle_devices_uninit - Unregister all cpuidle devices.
1639 */
1640static void __init intel_idle_cpuidle_devices_uninit(void)
1641{
1642	int i;
1643
1644	for_each_online_cpu(i)
1645		cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
1646}
1647
/*
 * intel_idle_init - Driver entry point (device_initcall).
 *
 * Validate the CPU (model match or generic MWAIT support, MWAIT CPUID
 * leaf and break-on-interrupt extension), pick the C-state source
 * (built-in table and/or ACPI _CST), then register the cpuidle driver
 * and the CPU hotplug online callback.  Returns 0 on success or a
 * negative errno.
 */
static int __init intel_idle_init(void)
{
	const struct x86_cpu_id *id;
	unsigned int eax, ebx, ecx;
	int retval;

	/* Do not load intel_idle at all for now if idle= is passed */
	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
		return -ENODEV;

	/* intel_idle.max_cstate=0 disables the driver entirely. */
	if (max_cstate == 0) {
		pr_debug("disabled\n");
		return -EPERM;
	}

	id = x86_match_cpu(intel_idle_ids);
	if (id) {
		if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
			pr_debug("Please enable MWAIT in BIOS SETUP\n");
			return -ENODEV;
		}
	} else {
		/* Unknown model: fall back to any family-6 CPU with MWAIT. */
		id = x86_match_cpu(intel_mwait_ids);
		if (!id)
			return -ENODEV;
	}

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return -ENODEV;

	/* EDX holds the per-C-state sub-state counts. */
	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
	    !mwait_substates)
			return -ENODEV;

	pr_debug("MWAIT substates: 0x%x\n", mwait_substates);

	/* NULL driver_data (intel_mwait_ids match) means ACPI-only mode. */
	icpu = (const struct idle_cpu *)id->driver_data;
	if (icpu) {
		cpuidle_state_table = icpu->state_table;
		auto_demotion_disable_flags = icpu->auto_demotion_disable_flags;
		disable_promotion_to_c1e = icpu->disable_promotion_to_c1e;
		if (icpu->use_acpi || force_use_acpi)
			intel_idle_acpi_cst_extract();
	} else if (!intel_idle_acpi_cst_extract()) {
		return -ENODEV;
	}

	pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
		 boot_cpu_data.x86_model);

	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
	if (!intel_idle_cpuidle_devices)
		return -ENOMEM;

	intel_idle_cpuidle_driver_init(&intel_idle_driver);

	retval = cpuidle_register_driver(&intel_idle_driver);
	if (retval) {
		/* Another cpuidle driver got there first. */
		struct cpuidle_driver *drv = cpuidle_get_driver();
		printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
		       drv ? drv->name : "none");
		goto init_driver_fail;
	}

	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
				   intel_idle_cpu_online, NULL);
	if (retval < 0)
		goto hp_setup_fail;

	pr_debug("Local APIC timer is reliable in %s\n",
		 boot_cpu_has(X86_FEATURE_ARAT) ? "all C-states" : "C1");

	return 0;

hp_setup_fail:
	intel_idle_cpuidle_devices_uninit();
	cpuidle_unregister_driver(&intel_idle_driver);
init_driver_fail:
	free_percpu(intel_idle_cpuidle_devices);
	return retval;

}
/* Not modular: run after the cpuidle core at device_initcall level. */
device_initcall(intel_idle_init);

/*
 * We are not really modular, but we used to support that.  Meaning we also
 * support "intel_idle.max_cstate=..." at boot and also a read-only export of
 * it at /sys/module/intel_idle/parameters/max_cstate -- so using module_param
 * is the easiest way (currently) to continue doing that.
 */
module_param(max_cstate, int, 0444);
/*
 * The positions of the bits that are set in this number are the indices of the
 * idle states to be disabled by default (as reflected by the names of the
 * corresponding idle state directories in sysfs, "state0", "state1" ...
 * "state<i>" ..., where <i> is the index of the given state).
 */
module_param_named(states_off, disabled_states_mask, uint, 0444);
MODULE_PARM_DESC(states_off, "Mask of disabled idle states");
1750