xref: /kernel/linux/linux-5.10/arch/x86/kvm/vmx/vmx.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * Authors:
12 *   Avi Kivity   <avi@qumranet.com>
13 *   Yaniv Kamay  <yaniv@qumranet.com>
14 */
15
16#include <linux/highmem.h>
17#include <linux/hrtimer.h>
18#include <linux/kernel.h>
19#include <linux/kvm_host.h>
20#include <linux/module.h>
21#include <linux/moduleparam.h>
22#include <linux/mod_devicetable.h>
23#include <linux/mm.h>
24#include <linux/objtool.h>
25#include <linux/sched.h>
26#include <linux/sched/smt.h>
27#include <linux/slab.h>
28#include <linux/tboot.h>
29#include <linux/trace_events.h>
30#include <linux/entry-kvm.h>
31
32#include <asm/apic.h>
33#include <asm/asm.h>
34#include <asm/cpu.h>
35#include <asm/cpu_device_id.h>
36#include <asm/debugreg.h>
37#include <asm/desc.h>
38#include <asm/fpu/internal.h>
39#include <asm/idtentry.h>
40#include <asm/io.h>
41#include <asm/irq_remapping.h>
42#include <asm/kexec.h>
43#include <asm/perf_event.h>
44#include <asm/mce.h>
45#include <asm/mmu_context.h>
46#include <asm/mshyperv.h>
47#include <asm/mwait.h>
48#include <asm/spec-ctrl.h>
49#include <asm/virtext.h>
50#include <asm/vmx.h>
51
52#include "capabilities.h"
53#include "cpuid.h"
54#include "evmcs.h"
55#include "irq.h"
56#include "kvm_cache_regs.h"
57#include "lapic.h"
58#include "mmu.h"
59#include "nested.h"
60#include "pmu.h"
61#include "trace.h"
62#include "vmcs.h"
63#include "vmcs12.h"
64#include "vmx.h"
65#include "x86.h"
66
67MODULE_AUTHOR("Qumranet");
68MODULE_LICENSE("GPL");
69
70#ifdef MODULE
71static const struct x86_cpu_id vmx_cpu_id[] = {
72	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
73	{}
74};
75MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
76#endif
77
78bool __read_mostly enable_vpid = 1;
79module_param_named(vpid, enable_vpid, bool, 0444);
80
81static bool __read_mostly enable_vnmi = 1;
82module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
83
84bool __read_mostly flexpriority_enabled = 1;
85module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
86
87bool __read_mostly enable_ept = 1;
88module_param_named(ept, enable_ept, bool, S_IRUGO);
89
90bool __read_mostly enable_unrestricted_guest = 1;
91module_param_named(unrestricted_guest,
92			enable_unrestricted_guest, bool, S_IRUGO);
93
94bool __read_mostly enable_ept_ad_bits = 1;
95module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
96
97static bool __read_mostly emulate_invalid_guest_state = true;
98module_param(emulate_invalid_guest_state, bool, S_IRUGO);
99
100static bool __read_mostly fasteoi = 1;
101module_param(fasteoi, bool, S_IRUGO);
102
103bool __read_mostly enable_apicv = 1;
104module_param(enable_apicv, bool, S_IRUGO);
105
106/*
107 * If nested=1, nested virtualization is supported, i.e., guests may use
108 * VMX and act as hypervisors for their own guests. If nested=0, guests may not
109 * use VMX instructions.
110 */
111static bool __read_mostly nested = 1;
112module_param(nested, bool, S_IRUGO);
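
/*
 * Illustrative usage (not specific to any one parameter here): these are
 * ordinary module parameters, so they can be set on the kernel command
 * line, e.g.
 *
 *	kvm-intel.nested=1 kvm-intel.ept=1
 *
 * or at load time via "modprobe kvm_intel nested=1".  The S_IRUGO/0444
 * ones are read-only through sysfs once the module is loaded.
 */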
113
114bool __read_mostly enable_pml = 1;
115module_param_named(pml, enable_pml, bool, S_IRUGO);
116
117static bool __read_mostly dump_invalid_vmcs = 0;
118module_param(dump_invalid_vmcs, bool, 0644);
119
120#define MSR_BITMAP_MODE_X2APIC		1
121#define MSR_BITMAP_MODE_X2APIC_APICV	2
122
123#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
124
125/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
126static int __read_mostly cpu_preemption_timer_multi;
127static bool __read_mostly enable_preemption_timer = 1;
128#ifdef CONFIG_X86_64
129module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
130#endif
131
132extern bool __read_mostly allow_smaller_maxphyaddr;
133module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
134
135#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
136#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
137#define KVM_VM_CR0_ALWAYS_ON				\
138	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
139
140#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
141#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
142#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
143
144#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
145
146#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
147	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
148	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
149	RTIT_STATUS_BYTECNT))
150
151/*
152 * List of MSRs that can be directly passed to the guest.
153 * In addition to these, the x2APIC and PT MSRs are handled specially.
154 */
155static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
156	MSR_IA32_SPEC_CTRL,
157	MSR_IA32_PRED_CMD,
158	MSR_IA32_TSC,
159#ifdef CONFIG_X86_64
160	MSR_FS_BASE,
161	MSR_GS_BASE,
162	MSR_KERNEL_GS_BASE,
163#endif
164	MSR_IA32_SYSENTER_CS,
165	MSR_IA32_SYSENTER_ESP,
166	MSR_IA32_SYSENTER_EIP,
167	MSR_CORE_C1_RES,
168	MSR_CORE_C3_RESIDENCY,
169	MSR_CORE_C6_RESIDENCY,
170	MSR_CORE_C7_RESIDENCY,
171};
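
/*
 * For illustration: an MSR in the list above is typically exposed to the
 * guest by clearing its intercept in the MSR bitmap, along the lines of
 *
 *	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
 *
 * where MSR_TYPE_RW clears both the read and write intercept bits.
 */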
172
173/*
174 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
175 * ple_gap:    upper bound on the amount of time between two successive
176 *             executions of PAUSE in a loop. Also indicates whether PLE is enabled.
177 *             According to tests, this time is usually smaller than 128 cycles.
178 * ple_window: upper bound on the amount of time a guest is allowed to execute
179 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
180 *             less than 2^12 cycles.
181 * Time is measured based on a counter that runs at the same rate as the TSC,
182 * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
183 */
184static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
185module_param(ple_gap, uint, 0444);
186
187static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
188module_param(ple_window, uint, 0444);
189
190/* Default doubles per-vcpu window every exit. */
191static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
192module_param(ple_window_grow, uint, 0444);
193
194/* Default resets per-vcpu window every exit to ple_window. */
195static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
196module_param(ple_window_shrink, uint, 0444);
197
198/* Default is to compute the maximum so we can never overflow. */
199static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
200module_param(ple_window_max, uint, 0444);
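
/*
 * Illustrative sketch (the grow/shrink helpers live outside this file):
 * when a vCPU triggers a PAUSE-loop exit, its window grows roughly as
 *
 *	new = min(old * ple_window_grow, ple_window_max);
 *
 * and later shrinks back toward ple_window, with ple_window_shrink == 0
 * meaning "reset straight back to ple_window".
 */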
201
202/* Default is SYSTEM mode, 1 for host-guest mode */
203int __read_mostly pt_mode = PT_MODE_SYSTEM;
204module_param(pt_mode, int, S_IRUGO);
205
206static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
207static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
208static DEFINE_MUTEX(vmx_l1d_flush_mutex);
209
210/* Storage for pre module init parameter parsing */
211static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
212
213static const struct {
214	const char *option;
215	bool for_parse;
216} vmentry_l1d_param[] = {
217	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
218	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
219	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
220	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
221	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
222	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
223};
224
225#define L1D_CACHE_ORDER 4
226static void *vmx_l1d_flush_pages;
227
228/* Control for disabling CPU Fill buffer clear */
229static bool __read_mostly vmx_fb_clear_ctrl_available;
230
231static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
232{
233	struct page *page;
234	unsigned int i;
235
236	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
237		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
238		return 0;
239	}
240
241	if (!enable_ept) {
242		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
243		return 0;
244	}
245
246	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
247		u64 msr;
248
249		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
250		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
251			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
252			return 0;
253		}
254	}
255
256	/* If set to auto, use the default L1TF mitigation method */
257	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
258		switch (l1tf_mitigation) {
259		case L1TF_MITIGATION_OFF:
260			l1tf = VMENTER_L1D_FLUSH_NEVER;
261			break;
262		case L1TF_MITIGATION_FLUSH_NOWARN:
263		case L1TF_MITIGATION_FLUSH:
264		case L1TF_MITIGATION_FLUSH_NOSMT:
265			l1tf = VMENTER_L1D_FLUSH_COND;
266			break;
267		case L1TF_MITIGATION_FULL:
268		case L1TF_MITIGATION_FULL_FORCE:
269			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
270			break;
271		}
272	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
273		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
274	}
275
276	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
277	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
278		/*
279		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
280		 * lifetime and so should not be charged to a memcg.
281		 */
282		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
283		if (!page)
284			return -ENOMEM;
285		vmx_l1d_flush_pages = page_address(page);
286
287		/*
288		 * Initialize each page with a different pattern in
289		 * order to protect against KSM in the nested
290		 * virtualization case.
291		 */
292		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
293			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
294			       PAGE_SIZE);
295		}
296	}
297
298	l1tf_vmx_mitigation = l1tf;
299
300	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
301		static_branch_enable(&vmx_l1d_should_flush);
302	else
303		static_branch_disable(&vmx_l1d_should_flush);
304
305	if (l1tf == VMENTER_L1D_FLUSH_COND)
306		static_branch_enable(&vmx_l1d_flush_cond);
307	else
308		static_branch_disable(&vmx_l1d_flush_cond);
309	return 0;
310}
311
312static int vmentry_l1d_flush_parse(const char *s)
313{
314	unsigned int i;
315
316	if (s) {
317		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
318			if (vmentry_l1d_param[i].for_parse &&
319			    sysfs_streq(s, vmentry_l1d_param[i].option))
320				return i;
321		}
322	}
323	return -EINVAL;
324}
325
326static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
327{
328	int l1tf, ret;
329
330	l1tf = vmentry_l1d_flush_parse(s);
331	if (l1tf < 0)
332		return l1tf;
333
334	if (!boot_cpu_has(X86_BUG_L1TF))
335		return 0;
336
337	/*
338	 * Has vmx_init() run already? If not then this is the pre init
339	 * parameter parsing. In that case just store the value and let
340	 * vmx_init() do the proper setup after enable_ept has been
341	 * established.
342	 */
343	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
344		vmentry_l1d_flush_param = l1tf;
345		return 0;
346	}
347
348	mutex_lock(&vmx_l1d_flush_mutex);
349	ret = vmx_setup_l1d_flush(l1tf);
350	mutex_unlock(&vmx_l1d_flush_mutex);
351	return ret;
352}
353
354static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
355{
356	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
357		return sprintf(s, "???\n");
358
359	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
360}
361
362static void vmx_setup_fb_clear_ctrl(void)
363{
364	u64 msr;
365
366	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
367	    !boot_cpu_has_bug(X86_BUG_MDS) &&
368	    !boot_cpu_has_bug(X86_BUG_TAA)) {
369		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
370		if (msr & ARCH_CAP_FB_CLEAR_CTRL)
371			vmx_fb_clear_ctrl_available = true;
372	}
373}
374
375static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
376{
377	u64 msr;
378
379	if (!vmx->disable_fb_clear)
380		return;
381
382	msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
383	msr |= FB_CLEAR_DIS;
384	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
385	/* Cache the MSR value to avoid reading it later */
386	vmx->msr_ia32_mcu_opt_ctrl = msr;
387}
388
389static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
390{
391	if (!vmx->disable_fb_clear)
392		return;
393
394	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
395	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
396}
397
398static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
399{
400	vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
401
402	/*
403	 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
404	 * at VMEntry. Skip the MSR read/write when a guest has no use case to
405	 * execute VERW.
406	 */
407	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
408	   ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
409	    (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
410	    (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
411	    (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
412	    (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
413		vmx->disable_fb_clear = false;
414}
415
416static const struct kernel_param_ops vmentry_l1d_flush_ops = {
417	.set = vmentry_l1d_flush_set,
418	.get = vmentry_l1d_flush_get,
419};
420module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
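
/*
 * Example of administrative usage (not derived from this file): the L1D
 * flush policy can be chosen at boot with
 *
 *	kvm-intel.vmentry_l1d_flush=never|cond|always
 *
 * and, because the parameter is 0644, inspected or changed at runtime via
 * /sys/module/kvm_intel/parameters/vmentry_l1d_flush.
 */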
421
422static u32 vmx_segment_access_rights(struct kvm_segment *var);
423static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
424							  u32 msr, int type);
425
426void vmx_vmexit(void);
427
428#define vmx_insn_failed(fmt...)		\
429do {					\
430	WARN_ONCE(1, fmt);		\
431	pr_warn_ratelimited(fmt);	\
432} while (0)
433
434asmlinkage void vmread_error(unsigned long field, bool fault)
435{
436	if (fault)
437		kvm_spurious_fault();
438	else
439		vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
440}
441
442noinline void vmwrite_error(unsigned long field, unsigned long value)
443{
444	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
445			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
446}
447
448noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
449{
450	vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
451}
452
453noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
454{
455	vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
456}
457
458noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
459{
460	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
461			ext, vpid, gva);
462}
463
464noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
465{
466	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
467			ext, eptp, gpa);
468}
469
470static DEFINE_PER_CPU(struct vmcs *, vmxarea);
471DEFINE_PER_CPU(struct vmcs *, current_vmcs);
472/*
473 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
474 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
475 */
476static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
477
478static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
479static DEFINE_SPINLOCK(vmx_vpid_lock);
480
481struct vmcs_config vmcs_config;
482struct vmx_capability vmx_capability;
483
484#define VMX_SEGMENT_FIELD(seg)					\
485	[VCPU_SREG_##seg] = {                                   \
486		.selector = GUEST_##seg##_SELECTOR,		\
487		.base = GUEST_##seg##_BASE,		   	\
488		.limit = GUEST_##seg##_LIMIT,		   	\
489		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
490	}
491
492static const struct kvm_vmx_segment_field {
493	unsigned selector;
494	unsigned base;
495	unsigned limit;
496	unsigned ar_bytes;
497} kvm_vmx_segment_fields[] = {
498	VMX_SEGMENT_FIELD(CS),
499	VMX_SEGMENT_FIELD(DS),
500	VMX_SEGMENT_FIELD(ES),
501	VMX_SEGMENT_FIELD(FS),
502	VMX_SEGMENT_FIELD(GS),
503	VMX_SEGMENT_FIELD(SS),
504	VMX_SEGMENT_FIELD(TR),
505	VMX_SEGMENT_FIELD(LDTR),
506};
507
508static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
509{
510	vmx->segment_cache.bitmask = 0;
511}
512
513static unsigned long host_idt_base;
514
515/*
516 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
517 * will emulate SYSCALL in legacy mode if the vendor string in guest
518 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
519 * support this emulation, IA32_STAR must always be included in
520 * vmx_uret_msrs_list[], even in i386 builds.
521 */
522static const u32 vmx_uret_msrs_list[] = {
523#ifdef CONFIG_X86_64
524	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
525#endif
526	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
527	MSR_IA32_TSX_CTRL,
528};
529
530#if IS_ENABLED(CONFIG_HYPERV)
531static bool __read_mostly enlightened_vmcs = true;
532module_param(enlightened_vmcs, bool, 0444);
533
534/* check_ept_pointer_match() should be called under protection of ept_pointer_lock. */
535static void check_ept_pointer_match(struct kvm *kvm)
536{
537	struct kvm_vcpu *vcpu;
538	u64 tmp_eptp = INVALID_PAGE;
539	int i;
540
541	kvm_for_each_vcpu(i, vcpu, kvm) {
542		if (!VALID_PAGE(tmp_eptp)) {
543			tmp_eptp = to_vmx(vcpu)->ept_pointer;
544		} else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
545			to_kvm_vmx(kvm)->ept_pointers_match
546				= EPT_POINTERS_MISMATCH;
547			return;
548		}
549	}
550
551	to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
552}
553
554static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
555		void *data)
556{
557	struct kvm_tlb_range *range = data;
558
559	return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
560			range->pages);
561}
562
563static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
564		struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
565{
566	u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
567
568	/*
569	 * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address
570	 * of the base of the EPT PML4 table, so strip off the EPT
571	 * configuration information.
572	 */
573	if (range)
574		return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
575				kvm_fill_hv_flush_list_func, (void *)range);
576	else
577		return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
578}
579
580static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
581		struct kvm_tlb_range *range)
582{
583	struct kvm_vcpu *vcpu;
584	int ret = 0, i;
585
586	spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
587
588	if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
589		check_ept_pointer_match(kvm);
590
591	if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
592		kvm_for_each_vcpu(i, vcpu, kvm) {
593			/* If ept_pointer is an invalid pointer, bypass the flush request. */
594			if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
595				ret |= __hv_remote_flush_tlb_with_range(
596					kvm, vcpu, range);
597		}
598	} else {
599		ret = __hv_remote_flush_tlb_with_range(kvm,
600				kvm_get_vcpu(kvm, 0), range);
601	}
602
603	spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
604	return ret;
605}
606static int hv_remote_flush_tlb(struct kvm *kvm)
607{
608	return hv_remote_flush_tlb_with_range(kvm, NULL);
609}
610
611static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
612{
613	struct hv_enlightened_vmcs *evmcs;
614	struct hv_partition_assist_pg **p_hv_pa_pg =
615			&vcpu->kvm->arch.hyperv.hv_pa_pg;
616	/*
617	 * Synthetic VM-Exit is not enabled in the current code, so all
618	 * eVMCSs in a single VM share the same assist page.
619	 */
620	if (!*p_hv_pa_pg)
621		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
622
623	if (!*p_hv_pa_pg)
624		return -ENOMEM;
625
626	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
627
628	evmcs->partition_assist_page =
629		__pa(*p_hv_pa_pg);
630	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
631	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
632
633	return 0;
634}
635
636#endif /* IS_ENABLED(CONFIG_HYPERV) */
637
638/*
639 * Comment format: document - errata name - stepping - processor name.
640 * Referenced from
641 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
642 */
643static u32 vmx_preemption_cpu_tfms[] = {
644/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
6450x000206E6,
646/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
647/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
648/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
6490x00020652,
650/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
6510x00020655,
652/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
653/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
654/*
655 * 320767.pdf - AAP86  - B1 -
656 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
657 */
6580x000106E5,
659/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
6600x000106A0,
661/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
6620x000106A1,
663/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
6640x000106A4,
665 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
666 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
667 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
6680x000106A5,
669 /* Xeon E3-1220 V2 */
6700x000306A8,
671};
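
/*
 * Each entry above is a raw CPUID.01H:EAX signature (stepping, model,
 * family and the extended fields), compared against the host value after
 * cpu_has_broken_vmx_preemption_timer() below masks off the reserved
 * bits (15:14 and 31:28).
 */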
672
673static inline bool cpu_has_broken_vmx_preemption_timer(void)
674{
675	u32 eax = cpuid_eax(0x00000001), i;
676
677	/* Clear the reserved bits */
678	eax &= ~(0x3U << 14 | 0xfU << 28);
679	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
680		if (eax == vmx_preemption_cpu_tfms[i])
681			return true;
682
683	return false;
684}
685
686static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
687{
688	return flexpriority_enabled && lapic_in_kernel(vcpu);
689}
690
691static inline bool report_flexpriority(void)
692{
693	return flexpriority_enabled;
694}
695
696static int possible_passthrough_msr_slot(u32 msr)
697{
698	u32 i;
699
700	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
701		if (vmx_possible_passthrough_msrs[i] == msr)
702			return i;
703
704	return -ENOENT;
705}
706
707static bool is_valid_passthrough_msr(u32 msr)
708{
709	bool r;
710
711	switch (msr) {
712	case 0x800 ... 0x8ff:
713		/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
714		return true;
715	case MSR_IA32_RTIT_STATUS:
716	case MSR_IA32_RTIT_OUTPUT_BASE:
717	case MSR_IA32_RTIT_OUTPUT_MASK:
718	case MSR_IA32_RTIT_CR3_MATCH:
719	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
720		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
721		return true;
722	}
723
724	r = possible_passthrough_msr_slot(msr) != -ENOENT;
725
726	WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
727
728	return r;
729}
730
731static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
732{
733	int i;
734
735	for (i = 0; i < vmx->nr_uret_msrs; ++i)
736		if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
737			return i;
738	return -1;
739}
740
741struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
742{
743	int i;
744
745	i = __vmx_find_uret_msr(vmx, msr);
746	if (i >= 0)
747		return &vmx->guest_uret_msrs[i];
748	return NULL;
749}
750
751static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
752				  struct vmx_uret_msr *msr, u64 data)
753{
754	int ret = 0;
755
756	u64 old_msr_data = msr->data;
757	msr->data = data;
758	if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
759		preempt_disable();
760		ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
761		preempt_enable();
762		if (ret)
763			msr->data = old_msr_data;
764	}
765	return ret;
766}
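
/*
 * Background note: kvm_set_user_return_msr() writes the guest value to the
 * hardware MSR immediately, while the original host value is restored
 * lazily by KVM's user-return notifier when the CPU eventually returns to
 * userspace, so these MSRs do not need to be switched on every VM-exit.
 */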
767
768#ifdef CONFIG_KEXEC_CORE
769static void crash_vmclear_local_loaded_vmcss(void)
770{
771	int cpu = raw_smp_processor_id();
772	struct loaded_vmcs *v;
773
774	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
775			    loaded_vmcss_on_cpu_link)
776		vmcs_clear(v->vmcs);
777}
778#endif /* CONFIG_KEXEC_CORE */
779
780static void __loaded_vmcs_clear(void *arg)
781{
782	struct loaded_vmcs *loaded_vmcs = arg;
783	int cpu = raw_smp_processor_id();
784
785	if (loaded_vmcs->cpu != cpu)
786		return; /* vcpu migration can race with cpu offline */
787	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
788		per_cpu(current_vmcs, cpu) = NULL;
789
790	vmcs_clear(loaded_vmcs->vmcs);
791	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
792		vmcs_clear(loaded_vmcs->shadow_vmcs);
793
794	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
795
796	/*
797	 * Ensure all writes to loaded_vmcs, including deleting it from its
798	 * current percpu list, complete before setting loaded_vmcs->cpu to
799	 * -1, otherwise a different cpu can see cpu == -1 first and add
800	 * loaded_vmcs to its percpu list before it's deleted from this cpu's
801	 * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
802	 */
803	smp_wmb();
804
805	loaded_vmcs->cpu = -1;
806	loaded_vmcs->launched = 0;
807}
808
809void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
810{
811	int cpu = loaded_vmcs->cpu;
812
813	if (cpu != -1)
814		smp_call_function_single(cpu,
815			 __loaded_vmcs_clear, loaded_vmcs, 1);
816}
817
818static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
819				       unsigned field)
820{
821	bool ret;
822	u32 mask = 1 << (seg * SEG_FIELD_NR + field);
823
824	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
825		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
826		vmx->segment_cache.bitmask = 0;
827	}
828	ret = vmx->segment_cache.bitmask & mask;
829	vmx->segment_cache.bitmask |= mask;
830	return ret;
831}
832
833static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
834{
835	u16 *p = &vmx->segment_cache.seg[seg].selector;
836
837	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
838		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
839	return *p;
840}
841
842static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
843{
844	ulong *p = &vmx->segment_cache.seg[seg].base;
845
846	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
847		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
848	return *p;
849}
850
851static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
852{
853	u32 *p = &vmx->segment_cache.seg[seg].limit;
854
855	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
856		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
857	return *p;
858}
859
860static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
861{
862	u32 *p = &vmx->segment_cache.seg[seg].ar;
863
864	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
865		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
866	return *p;
867}
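
/*
 * For illustration: the segment cache keeps one validity bit per
 * (segment, field) pair at position seg * SEG_FIELD_NR + field, so a
 * second vmx_read_guest_seg_ar(vmx, VCPU_SREG_CS) in the same cache
 * epoch is served from vmx->segment_cache without a VMREAD, until
 * vmx_segment_cache_clear() resets the bitmask.
 */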
868
869void update_exception_bitmap(struct kvm_vcpu *vcpu)
870{
871	u32 eb;
872
873	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
874	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
875	/*
876	 * Guest access to VMware backdoor ports could legitimately
877	 * trigger #GP because of TSS I/O permission bitmap.
878	 * We intercept those #GP and allow access to them anyway
879	 * as VMware does.
880	 */
881	if (enable_vmware_backdoor)
882		eb |= (1u << GP_VECTOR);
883	if ((vcpu->guest_debug &
884	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
885	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
886		eb |= 1u << BP_VECTOR;
887	if (to_vmx(vcpu)->rmode.vm86_active)
888		eb = ~0;
889	if (!vmx_need_pf_intercept(vcpu))
890		eb &= ~(1u << PF_VECTOR);
891
892	/* When we are running a nested L2 guest and L1 specified for it a
893	 * certain exception bitmap, we must trap the same exceptions and pass
894	 * them to L1. When running L2, we will only handle the exceptions
895	 * specified above if L1 did not want them.
896	 */
897	if (is_guest_mode(vcpu))
898		eb |= get_vmcs12(vcpu)->exception_bitmap;
899	else {
900		/*
901		 * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
902		 * between guest and host.  In that case we only care about present
903		 * faults.  For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in
904		 * prepare_vmcs02_rare.
905		 */
906		bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR));
907		int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0;
908		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
909		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask);
910	}
911
912	vmcs_write32(EXCEPTION_BITMAP, eb);
913}
914
915/*
916 * Check if MSR is intercepted for currently loaded MSR bitmap.
917 */
918static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
919{
920	unsigned long *msr_bitmap;
921	int f = sizeof(unsigned long);
922
923	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
924		return true;
925
926	msr_bitmap = vmx->loaded_vmcs->msr_bitmap;
927
928	if (msr <= 0x1fff) {
929		return !!test_bit(msr, msr_bitmap + 0x800 / f);
930	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
931		msr &= 0x1fff;
932		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
933	}
934
935	return true;
936}
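
/*
 * Layout assumed above (per the SDM's MSR-bitmap definition): the 4K
 * bitmap is split into four 1K regions,
 *
 *	0x000: read  bitmap for MSRs 0x00000000 - 0x00001fff
 *	0x400: read  bitmap for MSRs 0xc0000000 - 0xc0001fff
 *	0x800: write bitmap for MSRs 0x00000000 - 0x00001fff
 *	0xc00: write bitmap for MSRs 0xc0000000 - 0xc0001fff
 *
 * which is why the write checks above index at 0x800 and 0xc00.
 */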
937
938unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
939{
940	unsigned int flags = 0;
941
942	if (vmx->loaded_vmcs->launched)
943		flags |= VMX_RUN_VMRESUME;
944
945	/*
946	 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
947	 * to change it directly without causing a vmexit.  In that case read
948	 * it after vmexit and store it in vmx->spec_ctrl.
949	 */
950	if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
951		flags |= VMX_RUN_SAVE_SPEC_CTRL;
952
953	return flags;
954}
955
956static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
957		unsigned long entry, unsigned long exit)
958{
959	vm_entry_controls_clearbit(vmx, entry);
960	vm_exit_controls_clearbit(vmx, exit);
961}
962
963int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
964{
965	unsigned int i;
966
967	for (i = 0; i < m->nr; ++i) {
968		if (m->val[i].index == msr)
969			return i;
970	}
971	return -ENOENT;
972}
973
974static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
975{
976	int i;
977	struct msr_autoload *m = &vmx->msr_autoload;
978
979	switch (msr) {
980	case MSR_EFER:
981		if (cpu_has_load_ia32_efer()) {
982			clear_atomic_switch_msr_special(vmx,
983					VM_ENTRY_LOAD_IA32_EFER,
984					VM_EXIT_LOAD_IA32_EFER);
985			return;
986		}
987		break;
988	case MSR_CORE_PERF_GLOBAL_CTRL:
989		if (cpu_has_load_perf_global_ctrl()) {
990			clear_atomic_switch_msr_special(vmx,
991					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
992					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
993			return;
994		}
995		break;
996	}
997	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
998	if (i < 0)
999		goto skip_guest;
1000	--m->guest.nr;
1001	m->guest.val[i] = m->guest.val[m->guest.nr];
1002	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1003
1004skip_guest:
1005	i = vmx_find_loadstore_msr_slot(&m->host, msr);
1006	if (i < 0)
1007		return;
1008
1009	--m->host.nr;
1010	m->host.val[i] = m->host.val[m->host.nr];
1011	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1012}
1013
1014static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1015		unsigned long entry, unsigned long exit,
1016		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1017		u64 guest_val, u64 host_val)
1018{
1019	vmcs_write64(guest_val_vmcs, guest_val);
1020	if (host_val_vmcs != HOST_IA32_EFER)
1021		vmcs_write64(host_val_vmcs, host_val);
1022	vm_entry_controls_setbit(vmx, entry);
1023	vm_exit_controls_setbit(vmx, exit);
1024}
1025
1026static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1027				  u64 guest_val, u64 host_val, bool entry_only)
1028{
1029	int i, j = 0;
1030	struct msr_autoload *m = &vmx->msr_autoload;
1031
1032	switch (msr) {
1033	case MSR_EFER:
1034		if (cpu_has_load_ia32_efer()) {
1035			add_atomic_switch_msr_special(vmx,
1036					VM_ENTRY_LOAD_IA32_EFER,
1037					VM_EXIT_LOAD_IA32_EFER,
1038					GUEST_IA32_EFER,
1039					HOST_IA32_EFER,
1040					guest_val, host_val);
1041			return;
1042		}
1043		break;
1044	case MSR_CORE_PERF_GLOBAL_CTRL:
1045		if (cpu_has_load_perf_global_ctrl()) {
1046			add_atomic_switch_msr_special(vmx,
1047					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1048					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1049					GUEST_IA32_PERF_GLOBAL_CTRL,
1050					HOST_IA32_PERF_GLOBAL_CTRL,
1051					guest_val, host_val);
1052			return;
1053		}
1054		break;
1055	case MSR_IA32_PEBS_ENABLE:
1056		/* PEBS needs a quiescent period after being disabled (to write
1057		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
1058		 * provide that period, so a CPU could write host's record into
1059		 * guest's memory.
1060		 */
1061		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1062	}
1063
1064	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1065	if (!entry_only)
1066		j = vmx_find_loadstore_msr_slot(&m->host, msr);
1067
1068	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
1069	    (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
1070		printk_once(KERN_WARNING "Not enough msr switch entries. "
1071				"Can't add msr %x\n", msr);
1072		return;
1073	}
1074	if (i < 0) {
1075		i = m->guest.nr++;
1076		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1077	}
1078	m->guest.val[i].index = msr;
1079	m->guest.val[i].value = guest_val;
1080
1081	if (entry_only)
1082		return;
1083
1084	if (j < 0) {
1085		j = m->host.nr++;
1086		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1087	}
1088	m->host.val[j].index = msr;
1089	m->host.val[j].value = host_val;
1090}
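
/*
 * Typical call pattern (illustrative variable names; actual callers appear
 * later in this file): switch an MSR value atomically around
 * VM-entry/VM-exit, e.g.
 *
 *	add_atomic_switch_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL,
 *			      guest_global_ctrl, host_global_ctrl, false);
 *
 * with entry_only == true when the host value does not need to be
 * restored via the VM-exit MSR-load list.
 */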
1091
1092static bool update_transition_efer(struct vcpu_vmx *vmx)
1093{
1094	u64 guest_efer = vmx->vcpu.arch.efer;
1095	u64 ignore_bits = 0;
1096	int i;
1097
1098	/* Shadow paging assumes NX to be available.  */
1099	if (!enable_ept)
1100		guest_efer |= EFER_NX;
1101
1102	/*
1103	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
1104	 */
1105	ignore_bits |= EFER_SCE;
1106#ifdef CONFIG_X86_64
1107	ignore_bits |= EFER_LMA | EFER_LME;
1108	/* SCE is meaningful only in long mode on Intel */
1109	if (guest_efer & EFER_LMA)
1110		ignore_bits &= ~(u64)EFER_SCE;
1111#endif
1112
1113	/*
1114	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
1115	 * On CPUs that support "load IA32_EFER", always switch EFER
1116	 * atomically, since it's faster than switching it manually.
1117	 */
1118	if (cpu_has_load_ia32_efer() ||
1119	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1120		if (!(guest_efer & EFER_LMA))
1121			guest_efer &= ~EFER_LME;
1122		if (guest_efer != host_efer)
1123			add_atomic_switch_msr(vmx, MSR_EFER,
1124					      guest_efer, host_efer, false);
1125		else
1126			clear_atomic_switch_msr(vmx, MSR_EFER);
1127		return false;
1128	}
1129
1130	i = __vmx_find_uret_msr(vmx, MSR_EFER);
1131	if (i < 0)
1132		return false;
1133
1134	clear_atomic_switch_msr(vmx, MSR_EFER);
1135
1136	guest_efer &= ~ignore_bits;
1137	guest_efer |= host_efer & ignore_bits;
1138
1139	vmx->guest_uret_msrs[i].data = guest_efer;
1140	vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1141
1142	return true;
1143}
1144
1145#ifdef CONFIG_X86_32
1146/*
1147 * On 32-bit kernels, VM exits still load the FS and GS bases from the
1148 * VMCS rather than the segment table.  KVM uses this helper to figure
1149 * out the current bases to poke them into the VMCS before entry.
1150 */
1151static unsigned long segment_base(u16 selector)
1152{
1153	struct desc_struct *table;
1154	unsigned long v;
1155
1156	if (!(selector & ~SEGMENT_RPL_MASK))
1157		return 0;
1158
1159	table = get_current_gdt_ro();
1160
1161	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1162		u16 ldt_selector = kvm_read_ldt();
1163
1164		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1165			return 0;
1166
1167		table = (struct desc_struct *)segment_base(ldt_selector);
1168	}
1169	v = get_desc_base(&table[selector >> 3]);
1170	return v;
1171}
1172#endif
1173
1174static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1175{
1176	return vmx_pt_mode_is_host_guest() &&
1177	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1178}
1179
1180static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1181{
1182	/* The base must be 128-byte aligned and a legal physical address. */
1183	return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
1184}
1185
1186static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1187{
1188	u32 i;
1189
1190	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1191	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1192	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1193	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1194	for (i = 0; i < addr_range; i++) {
1195		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1196		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1197	}
1198}
1199
1200static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1201{
1202	u32 i;
1203
1204	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1205	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1206	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1207	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1208	for (i = 0; i < addr_range; i++) {
1209		rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1210		rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1211	}
1212}
1213
1214static void pt_guest_enter(struct vcpu_vmx *vmx)
1215{
1216	if (vmx_pt_mode_is_system())
1217		return;
1218
1219	/*
1220	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1221	 * Save host state before VM entry.
1222	 */
1223	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1224	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1225		wrmsrl(MSR_IA32_RTIT_CTL, 0);
1226		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1227		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1228	}
1229}
1230
1231static void pt_guest_exit(struct vcpu_vmx *vmx)
1232{
1233	if (vmx_pt_mode_is_system())
1234		return;
1235
1236	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1237		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1238		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1239	}
1240
1241	/* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1242	wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1243}
1244
1245void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1246			unsigned long fs_base, unsigned long gs_base)
1247{
1248	if (unlikely(fs_sel != host->fs_sel)) {
1249		if (!(fs_sel & 7))
1250			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1251		else
1252			vmcs_write16(HOST_FS_SELECTOR, 0);
1253		host->fs_sel = fs_sel;
1254	}
1255	if (unlikely(gs_sel != host->gs_sel)) {
1256		if (!(gs_sel & 7))
1257			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1258		else
1259			vmcs_write16(HOST_GS_SELECTOR, 0);
1260		host->gs_sel = gs_sel;
1261	}
1262	if (unlikely(fs_base != host->fs_base)) {
1263		vmcs_writel(HOST_FS_BASE, fs_base);
1264		host->fs_base = fs_base;
1265	}
1266	if (unlikely(gs_base != host->gs_base)) {
1267		vmcs_writel(HOST_GS_BASE, gs_base);
1268		host->gs_base = gs_base;
1269	}
1270}
1271
1272void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1273{
1274	struct vcpu_vmx *vmx = to_vmx(vcpu);
1275	struct vmcs_host_state *host_state;
1276#ifdef CONFIG_X86_64
1277	int cpu = raw_smp_processor_id();
1278#endif
1279	unsigned long fs_base, gs_base;
1280	u16 fs_sel, gs_sel;
1281	int i;
1282
1283	vmx->req_immediate_exit = false;
1284
1285	/*
1286	 * Note that guest MSRs to be saved/restored can also be changed
1287	 * when guest state is loaded. This happens when guest transitions
1288	 * to/from long-mode by setting MSR_EFER.LMA.
1289	 */
1290	if (!vmx->guest_uret_msrs_loaded) {
1291		vmx->guest_uret_msrs_loaded = true;
1292		for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
1293			kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
1294						vmx->guest_uret_msrs[i].data,
1295						vmx->guest_uret_msrs[i].mask);
1296
1297	}
1298
1299	if (vmx->nested.need_vmcs12_to_shadow_sync)
1300		nested_sync_vmcs12_to_shadow(vcpu);
1301
1302	if (vmx->guest_state_loaded)
1303		return;
1304
1305	host_state = &vmx->loaded_vmcs->host_state;
1306
1307	/*
1308	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1309	 * allow segment selectors with cpl > 0 or ti == 1.
1310	 */
1311	host_state->ldt_sel = kvm_read_ldt();
1312
1313#ifdef CONFIG_X86_64
1314	savesegment(ds, host_state->ds_sel);
1315	savesegment(es, host_state->es_sel);
1316
1317	gs_base = cpu_kernelmode_gs_base(cpu);
1318	if (likely(is_64bit_mm(current->mm))) {
1319		current_save_fsgs();
1320		fs_sel = current->thread.fsindex;
1321		gs_sel = current->thread.gsindex;
1322		fs_base = current->thread.fsbase;
1323		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1324	} else {
1325		savesegment(fs, fs_sel);
1326		savesegment(gs, gs_sel);
1327		fs_base = read_msr(MSR_FS_BASE);
1328		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1329	}
1330
1331	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1332#else
1333	savesegment(fs, fs_sel);
1334	savesegment(gs, gs_sel);
1335	fs_base = segment_base(fs_sel);
1336	gs_base = segment_base(gs_sel);
1337#endif
1338
1339	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1340	vmx->guest_state_loaded = true;
1341}
1342
1343static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1344{
1345	struct vmcs_host_state *host_state;
1346
1347	if (!vmx->guest_state_loaded)
1348		return;
1349
1350	host_state = &vmx->loaded_vmcs->host_state;
1351
1352	++vmx->vcpu.stat.host_state_reload;
1353
1354#ifdef CONFIG_X86_64
1355	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1356#endif
1357	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1358		kvm_load_ldt(host_state->ldt_sel);
1359#ifdef CONFIG_X86_64
1360		load_gs_index(host_state->gs_sel);
1361#else
1362		loadsegment(gs, host_state->gs_sel);
1363#endif
1364	}
1365	if (host_state->fs_sel & 7)
1366		loadsegment(fs, host_state->fs_sel);
1367#ifdef CONFIG_X86_64
1368	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1369		loadsegment(ds, host_state->ds_sel);
1370		loadsegment(es, host_state->es_sel);
1371	}
1372#endif
1373	invalidate_tss_limit();
1374#ifdef CONFIG_X86_64
1375	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1376#endif
1377	load_fixmap_gdt(raw_smp_processor_id());
1378	vmx->guest_state_loaded = false;
1379	vmx->guest_uret_msrs_loaded = false;
1380}
1381
1382#ifdef CONFIG_X86_64
1383static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1384{
1385	preempt_disable();
1386	if (vmx->guest_state_loaded)
1387		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1388	preempt_enable();
1389	return vmx->msr_guest_kernel_gs_base;
1390}
1391
1392static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1393{
1394	preempt_disable();
1395	if (vmx->guest_state_loaded)
1396		wrmsrl(MSR_KERNEL_GS_BASE, data);
1397	preempt_enable();
1398	vmx->msr_guest_kernel_gs_base = data;
1399}
1400#endif
1401
1402void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
1403			struct loaded_vmcs *buddy)
1404{
1405	struct vcpu_vmx *vmx = to_vmx(vcpu);
1406	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1407	struct vmcs *prev;
1408
1409	if (!already_loaded) {
1410		loaded_vmcs_clear(vmx->loaded_vmcs);
1411		local_irq_disable();
1412
1413		/*
1414		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1415		 * this cpu's percpu list, otherwise it may not yet be deleted
1416		 * from its previous cpu's percpu list.  Pairs with the
1417		 * smp_wmb() in __loaded_vmcs_clear().
1418		 */
1419		smp_rmb();
1420
1421		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1422			 &per_cpu(loaded_vmcss_on_cpu, cpu));
1423		local_irq_enable();
1424	}
1425
1426	prev = per_cpu(current_vmcs, cpu);
1427	if (prev != vmx->loaded_vmcs->vmcs) {
1428		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1429		vmcs_load(vmx->loaded_vmcs->vmcs);
1430
1431		/*
1432		 * No indirect branch prediction barrier needed when switching
1433		 * the active VMCS within a vCPU, unless IBRS is advertised to
1434		 * the vCPU.  To minimize the number of IBPBs executed, KVM
1435		 * performs IBPB on nested VM-Exit (a single nested transition
1436		 * may switch the active VMCS multiple times).
1437		 */
1438		if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
1439			indirect_branch_prediction_barrier();
1440	}
1441
1442	if (!already_loaded) {
1443		void *gdt = get_current_gdt_ro();
1444		unsigned long sysenter_esp;
1445
1446		/*
1447		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
1448		 * TLB entries from its previous association with the vCPU.
1449		 */
1450		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1451
1452		/*
1453		 * Linux uses per-cpu TSS and GDT, so set these when switching
1454		 * processors.  See 22.2.4.
1455		 */
1456		vmcs_writel(HOST_TR_BASE,
1457			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1458		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
1459
1460		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1461		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1462
1463		vmx->loaded_vmcs->cpu = cpu;
1464	}
1465
1466	/* Setup TSC multiplier */
1467	if (kvm_has_tsc_control &&
1468	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
1469		decache_tsc_multiplier(vmx);
1470}
1471
1472/*
1473 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1474 * vcpu mutex is already taken.
1475 */
1476static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1477{
1478	struct vcpu_vmx *vmx = to_vmx(vcpu);
1479
1480	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
1481
1482	vmx_vcpu_pi_load(vcpu, cpu);
1483
1484	vmx->host_debugctlmsr = get_debugctlmsr();
1485}
1486
1487static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1488{
1489	vmx_vcpu_pi_put(vcpu);
1490
1491	vmx_prepare_switch_to_host(to_vmx(vcpu));
1492}
1493
1494static bool emulation_required(struct kvm_vcpu *vcpu)
1495{
1496	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1497}
1498
1499unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1500{
1501	struct vcpu_vmx *vmx = to_vmx(vcpu);
1502	unsigned long rflags, save_rflags;
1503
1504	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1505		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1506		rflags = vmcs_readl(GUEST_RFLAGS);
1507		if (vmx->rmode.vm86_active) {
1508			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1509			save_rflags = vmx->rmode.save_rflags;
1510			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1511		}
1512		vmx->rflags = rflags;
1513	}
1514	return vmx->rflags;
1515}
1516
1517void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1518{
1519	struct vcpu_vmx *vmx = to_vmx(vcpu);
1520	unsigned long old_rflags;
1521
1522	/*
1523	 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
1524	 * is an unrestricted guest in order to mark L2 as needing emulation
1525	 * if L1 runs L2 as a restricted guest.
1526	 */
1527	if (is_unrestricted_guest(vcpu)) {
1528		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1529		vmx->rflags = rflags;
1530		vmcs_writel(GUEST_RFLAGS, rflags);
1531		return;
1532	}
1533
1534	old_rflags = vmx_get_rflags(vcpu);
1535	vmx->rflags = rflags;
1536	if (vmx->rmode.vm86_active) {
1537		vmx->rmode.save_rflags = rflags;
1538		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1539	}
1540	vmcs_writel(GUEST_RFLAGS, rflags);
1541
1542	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1543		vmx->emulation_required = emulation_required(vcpu);
1544}
1545
1546u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1547{
1548	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1549	int ret = 0;
1550
1551	if (interruptibility & GUEST_INTR_STATE_STI)
1552		ret |= KVM_X86_SHADOW_INT_STI;
1553	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1554		ret |= KVM_X86_SHADOW_INT_MOV_SS;
1555
1556	return ret;
1557}
1558
1559void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1560{
1561	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1562	u32 interruptibility = interruptibility_old;
1563
1564	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1565
1566	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1567		interruptibility |= GUEST_INTR_STATE_MOV_SS;
1568	else if (mask & KVM_X86_SHADOW_INT_STI)
1569		interruptibility |= GUEST_INTR_STATE_STI;
1570
1571	if ((interruptibility != interruptibility_old))
1572		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1573}
1574
1575static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1576{
1577	struct vcpu_vmx *vmx = to_vmx(vcpu);
1578	unsigned long value;
1579
1580	/*
1581	 * Any MSR write that attempts to change bits marked reserved will
1582	 * cause a #GP fault.
1583	 */
1584	if (data & vmx->pt_desc.ctl_bitmask)
1585		return 1;
1586
1587	/*
1588	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1589	 * result in a #GP unless the same write also clears TraceEn.
1590	 */
1591	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1592		((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1593		return 1;
1594
1595	/*
1596	 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA
1597	 * and FabricEn would cause a #GP, if
1598	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1599	 */
1600	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1601		!(data & RTIT_CTL_FABRIC_EN) &&
1602		!intel_pt_validate_cap(vmx->pt_desc.caps,
1603					PT_CAP_single_range_output))
1604		return 1;
1605
1606	/*
1607	 * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
1608	 * utilizes an encoding marked reserved will cause a #GP fault.
1609	 */
1610	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1611	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1612			!test_bit((data & RTIT_CTL_MTC_RANGE) >>
1613			RTIT_CTL_MTC_RANGE_OFFSET, &value))
1614		return 1;
1615	value = intel_pt_validate_cap(vmx->pt_desc.caps,
1616						PT_CAP_cycle_thresholds);
1617	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1618			!test_bit((data & RTIT_CTL_CYC_THRESH) >>
1619			RTIT_CTL_CYC_THRESH_OFFSET, &value))
1620		return 1;
1621	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1622	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1623			!test_bit((data & RTIT_CTL_PSB_FREQ) >>
1624			RTIT_CTL_PSB_FREQ_OFFSET, &value))
1625		return 1;
1626
1627	/*
1628	 * If an ADDRx_CFG field is reserved or its encoding is greater than 2,
1629	 * the write will cause a #GP fault.
1630	 */
1631	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1632	if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
1633		return 1;
1634	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1635	if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
1636		return 1;
1637	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1638	if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
1639		return 1;
1640	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1641	if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
1642		return 1;
1643
1644	return 0;
1645}
1646
1647static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
1648{
1649	return true;
1650}
1651
1652static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1653{
1654	unsigned long rip, orig_rip;
1655
1656	/*
1657	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1658	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1659	 * set when EPT misconfig occurs.  In practice, real hardware updates
1660	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1661	 * (namely Hyper-V) don't set it due to it being undefined behavior,
1662	 * i.e. we end up advancing IP with some random value.
1663	 */
1664	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1665	    to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1666		orig_rip = kvm_rip_read(vcpu);
1667		rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1668#ifdef CONFIG_X86_64
1669		/*
1670		 * We need to mask out the high 32 bits of RIP if not in 64-bit
1671		 * mode, but just finding out that we are in 64-bit mode is
1672		 * quite expensive.  Only do it if there was a carry.
1673		 */
1674		if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1675			rip = (u32)rip;
1676#endif
1677		kvm_rip_write(vcpu, rip);
1678	} else {
1679		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1680			return 0;
1681	}
1682
1683	/* skipping an emulated instruction also counts */
1684	vmx_set_interrupt_shadow(vcpu, 0);
1685
1686	return 1;
1687}
1688
1689/*
1690 * Recognizes a pending MTF VM-exit and records the nested state for later
1691 * delivery.
1692 */
1693static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1694{
1695	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1696	struct vcpu_vmx *vmx = to_vmx(vcpu);
1697
1698	if (!is_guest_mode(vcpu))
1699		return;
1700
1701	/*
1702	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
1703	 * T-bit traps. As instruction emulation is completed (i.e. at the
1704	 * instruction boundary), any #DB exception pending delivery must be a
1705	 * debug-trap. Record the pending MTF state to be delivered in
1706	 * vmx_check_nested_events().
1707	 */
1708	if (nested_cpu_has_mtf(vmcs12) &&
1709	    (!vcpu->arch.exception.pending ||
1710	     vcpu->arch.exception.nr == DB_VECTOR))
1711		vmx->nested.mtf_pending = true;
1712	else
1713		vmx->nested.mtf_pending = false;
1714}
1715
1716static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1717{
1718	vmx_update_emulated_instruction(vcpu);
1719	return skip_emulated_instruction(vcpu);
1720}
1721
1722static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1723{
1724	/*
1725	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
1726	 * explicitly skip the instruction because if the HLT state is set,
1727	 * then the instruction is already executing and RIP has already been
1728	 * advanced.
1729	 */
1730	if (kvm_hlt_in_guest(vcpu->kvm) &&
1731			vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1732		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1733}
1734
1735static void vmx_queue_exception(struct kvm_vcpu *vcpu)
1736{
1737	struct vcpu_vmx *vmx = to_vmx(vcpu);
1738	unsigned nr = vcpu->arch.exception.nr;
1739	bool has_error_code = vcpu->arch.exception.has_error_code;
1740	u32 error_code = vcpu->arch.exception.error_code;
1741	u32 intr_info = nr | INTR_INFO_VALID_MASK;
1742
1743	kvm_deliver_exception_payload(vcpu);
1744
1745	if (has_error_code) {
1746		/*
1747		 * Despite the error code being architecturally defined as 32
1748		 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
1749		 * VMX don't actually support setting bits 31:16.  Hardware
1750		 * will (should) never provide a bogus error code, but AMD CPUs
1751		 * do generate error codes with bits 31:16 set, and so KVM's
1752		 * ABI lets userspace shove in arbitrary 32-bit values.  Drop
1753		 * the upper bits to avoid VM-Fail, losing information that
1754		 * does't really exist is preferable to killing the VM.
1755		 * doesn't really exist is preferable to killing the VM.
1756		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)error_code);
1757		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1758	}
1759
1760	if (vmx->rmode.vm86_active) {
1761		int inc_eip = 0;
1762		if (kvm_exception_is_soft(nr))
1763			inc_eip = vcpu->arch.event_exit_inst_len;
1764		kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
1765		return;
1766	}
1767
1768	WARN_ON_ONCE(vmx->emulation_required);
1769
1770	if (kvm_exception_is_soft(nr)) {
1771		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1772			     vmx->vcpu.arch.event_exit_inst_len);
1773		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1774	} else
1775		intr_info |= INTR_TYPE_HARD_EXCEPTION;
1776
1777	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1778
1779	vmx_clear_hlt(vcpu);
1780}
1781
1782static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
1783{
1784	struct vmx_uret_msr tmp;
1785	int from, to;
1786
1787	from = __vmx_find_uret_msr(vmx, msr);
1788	if (from < 0)
1789		return;
1790	to = vmx->nr_active_uret_msrs++;
1791
1792	tmp = vmx->guest_uret_msrs[to];
1793	vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
1794	vmx->guest_uret_msrs[from] = tmp;
1795}
1796
1797/*
1798 * Set up the vmcs to automatically save and restore system
1799 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
1800 * mode, as fiddling with msrs is very expensive.
1801 */
1802static void setup_msrs(struct vcpu_vmx *vmx)
1803{
1804	vmx->guest_uret_msrs_loaded = false;
1805	vmx->nr_active_uret_msrs = 0;
1806#ifdef CONFIG_X86_64
1807	/*
1808	 * The SYSCALL MSRs are only needed on long mode guests, and only
1809	 * when EFER.SCE is set.
1810	 */
1811	if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
1812		vmx_setup_uret_msr(vmx, MSR_STAR);
1813		vmx_setup_uret_msr(vmx, MSR_LSTAR);
1814		vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
1815	}
1816#endif
1817	if (update_transition_efer(vmx))
1818		vmx_setup_uret_msr(vmx, MSR_EFER);
1819
1820	if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
1821		vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
1822
1823	vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
1824
1825	if (cpu_has_vmx_msr_bitmap())
1826		vmx_update_msr_bitmap(&vmx->vcpu);
1827}
1828
1829static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1830{
1831	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1832	u64 g_tsc_offset = 0;
1833
1834	/*
1835	 * We're here if L1 chose not to trap WRMSR to TSC. According
1836	 * to the spec, this should set L1's TSC; The offset that L1
1837	 * set for L2 remains unchanged, and still needs to be added
1838	 * to the newly set TSC to get L2's TSC.
1839	 */
1840	if (is_guest_mode(vcpu) &&
1841	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
1842		g_tsc_offset = vmcs12->tsc_offset;
1843
1844	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1845				   vcpu->arch.tsc_offset - g_tsc_offset,
1846				   offset);
1847	vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
1848	return offset + g_tsc_offset;
1849}
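
/*
 * Illustrative arithmetic for the write above (a sketch, not code): while
 * L2 is active the hardware TSC offset is the sum of both levels, i.e.
 *
 *	L2_TSC = host_TSC + L1_tsc_offset + vmcs12.tsc_offset
 *
 * so an L1 write to the TSC replaces only the L1 component; the L2 delta
 * (g_tsc_offset) is preserved and re-added before the VMCS write.
 */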
1850
1851/*
1852 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1853 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1854 * all guests if the "nested" module option is off, and can also be disabled
1855 * for a single guest by disabling its VMX cpuid bit.
1856 */
1857bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1858{
1859	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
1860}
1861
1862static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
1863						 uint64_t val)
1864{
1865	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
1866
1867	return !(val & ~valid_bits);
1868}
1869
1870static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1871{
1872	switch (msr->index) {
1873	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1874		if (!nested)
1875			return 1;
1876		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1877	case MSR_IA32_PERF_CAPABILITIES:
1878		msr->data = vmx_get_perf_capabilities();
1879		return 0;
1880	default:
1881		return KVM_MSR_RET_INVALID;
1882	}
1883}
1884
1885/*
1886 * Reads the MSR specified by msr_info->index into msr_info->data.
1887 * Returns 0 on success, non-0 otherwise.
1888 * Assumes vcpu_load() was already called.
1889 */
1890static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1891{
1892	struct vcpu_vmx *vmx = to_vmx(vcpu);
1893	struct vmx_uret_msr *msr;
1894	u32 index;
1895
1896	switch (msr_info->index) {
1897#ifdef CONFIG_X86_64
1898	case MSR_FS_BASE:
1899		msr_info->data = vmcs_readl(GUEST_FS_BASE);
1900		break;
1901	case MSR_GS_BASE:
1902		msr_info->data = vmcs_readl(GUEST_GS_BASE);
1903		break;
1904	case MSR_KERNEL_GS_BASE:
1905		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1906		break;
1907#endif
1908	case MSR_EFER:
1909		return kvm_get_msr_common(vcpu, msr_info);
1910	case MSR_IA32_TSX_CTRL:
1911		if (!msr_info->host_initiated &&
1912		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1913			return 1;
1914		goto find_uret_msr;
1915	case MSR_IA32_UMWAIT_CONTROL:
1916		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1917			return 1;
1918
1919		msr_info->data = vmx->msr_ia32_umwait_control;
1920		break;
1921	case MSR_IA32_SPEC_CTRL:
1922		if (!msr_info->host_initiated &&
1923		    !guest_has_spec_ctrl_msr(vcpu))
1924			return 1;
1925
1926		msr_info->data = to_vmx(vcpu)->spec_ctrl;
1927		break;
1928	case MSR_IA32_SYSENTER_CS:
1929		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
1930		break;
1931	case MSR_IA32_SYSENTER_EIP:
1932		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
1933		break;
1934	case MSR_IA32_SYSENTER_ESP:
1935		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
1936		break;
1937	case MSR_IA32_BNDCFGS:
1938		if (!kvm_mpx_supported() ||
1939		    (!msr_info->host_initiated &&
1940		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
1941			return 1;
1942		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
1943		break;
1944	case MSR_IA32_MCG_EXT_CTL:
1945		if (!msr_info->host_initiated &&
1946		    !(vmx->msr_ia32_feature_control &
1947		      FEAT_CTL_LMCE_ENABLED))
1948			return 1;
1949		msr_info->data = vcpu->arch.mcg_ext_ctl;
1950		break;
1951	case MSR_IA32_FEAT_CTL:
1952		msr_info->data = vmx->msr_ia32_feature_control;
1953		break;
1954	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1955		if (!nested_vmx_allowed(vcpu))
1956			return 1;
1957		if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
1958				    &msr_info->data))
1959			return 1;
1960		/*
1961		 * Enlightened VMCS v1 doesn't have certain VMCS fields but
1962		 * instead of just ignoring the features, different Hyper-V
1963		 * versions are either trying to use them and fail or do some
1964		 * sanity checking and refuse to boot. Filter all unsupported
1965		 * features out.
1966		 */
1967		if (!msr_info->host_initiated &&
1968		    vmx->nested.enlightened_vmcs_enabled)
1969			nested_evmcs_filter_control_msr(msr_info->index,
1970							&msr_info->data);
1971		break;
1972	case MSR_IA32_RTIT_CTL:
1973		if (!vmx_pt_mode_is_host_guest())
1974			return 1;
1975		msr_info->data = vmx->pt_desc.guest.ctl;
1976		break;
1977	case MSR_IA32_RTIT_STATUS:
1978		if (!vmx_pt_mode_is_host_guest())
1979			return 1;
1980		msr_info->data = vmx->pt_desc.guest.status;
1981		break;
1982	case MSR_IA32_RTIT_CR3_MATCH:
1983		if (!vmx_pt_mode_is_host_guest() ||
1984			!intel_pt_validate_cap(vmx->pt_desc.caps,
1985						PT_CAP_cr3_filtering))
1986			return 1;
1987		msr_info->data = vmx->pt_desc.guest.cr3_match;
1988		break;
1989	case MSR_IA32_RTIT_OUTPUT_BASE:
1990		if (!vmx_pt_mode_is_host_guest() ||
1991			(!intel_pt_validate_cap(vmx->pt_desc.caps,
1992					PT_CAP_topa_output) &&
1993			 !intel_pt_validate_cap(vmx->pt_desc.caps,
1994					PT_CAP_single_range_output)))
1995			return 1;
1996		msr_info->data = vmx->pt_desc.guest.output_base;
1997		break;
1998	case MSR_IA32_RTIT_OUTPUT_MASK:
1999		if (!vmx_pt_mode_is_host_guest() ||
2000			(!intel_pt_validate_cap(vmx->pt_desc.caps,
2001					PT_CAP_topa_output) &&
2002			 !intel_pt_validate_cap(vmx->pt_desc.caps,
2003					PT_CAP_single_range_output)))
2004			return 1;
2005		msr_info->data = vmx->pt_desc.guest.output_mask;
2006		break;
2007	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2008		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2009		if (!vmx_pt_mode_is_host_guest() ||
2010			(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2011					PT_CAP_num_address_ranges)))
2012			return 1;
2013		if (index % 2)
2014			msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2015		else
2016			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2017		break;
2018	case MSR_TSC_AUX:
2019		if (!msr_info->host_initiated &&
2020		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2021			return 1;
2022		goto find_uret_msr;
2023	default:
2024	find_uret_msr:
2025		msr = vmx_find_uret_msr(vmx, msr_info->index);
2026		if (msr) {
2027			msr_info->data = msr->data;
2028			break;
2029		}
2030		return kvm_get_msr_common(vcpu, msr_info);
2031	}
2032
2033	return 0;
2034}
2035
2036static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2037						    u64 data)
2038{
2039#ifdef CONFIG_X86_64
2040	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2041		return (u32)data;
2042#endif
2043	return (unsigned long)data;
2044}
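
/*
 * Example of the truncation above (value is made up): if the guest lacks
 * X86_FEATURE_LM, a write of 0xffffffff12345678 to a SYSENTER address MSR
 * is stored as 0x12345678, matching the 32-bit width of those MSRs on
 * CPUs without long mode.
 */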
2045
2046/*
2047 * Writes msr value into the appropriate "register".
2048 * Returns 0 on success, non-0 otherwise.
2049 * Assumes vcpu_load() was already called.
2050 */
2051static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2052{
2053	struct vcpu_vmx *vmx = to_vmx(vcpu);
2054	struct vmx_uret_msr *msr;
2055	int ret = 0;
2056	u32 msr_index = msr_info->index;
2057	u64 data = msr_info->data;
2058	u32 index;
2059
2060	switch (msr_index) {
2061	case MSR_EFER:
2062		ret = kvm_set_msr_common(vcpu, msr_info);
2063		break;
2064#ifdef CONFIG_X86_64
2065	case MSR_FS_BASE:
2066		vmx_segment_cache_clear(vmx);
2067		vmcs_writel(GUEST_FS_BASE, data);
2068		break;
2069	case MSR_GS_BASE:
2070		vmx_segment_cache_clear(vmx);
2071		vmcs_writel(GUEST_GS_BASE, data);
2072		break;
2073	case MSR_KERNEL_GS_BASE:
2074		vmx_write_guest_kernel_gs_base(vmx, data);
2075		break;
2076#endif
2077	case MSR_IA32_SYSENTER_CS:
2078		if (is_guest_mode(vcpu))
2079			get_vmcs12(vcpu)->guest_sysenter_cs = data;
2080		vmcs_write32(GUEST_SYSENTER_CS, data);
2081		break;
2082	case MSR_IA32_SYSENTER_EIP:
2083		if (is_guest_mode(vcpu)) {
2084			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2085			get_vmcs12(vcpu)->guest_sysenter_eip = data;
2086		}
2087		vmcs_writel(GUEST_SYSENTER_EIP, data);
2088		break;
2089	case MSR_IA32_SYSENTER_ESP:
2090		if (is_guest_mode(vcpu)) {
2091			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2092			get_vmcs12(vcpu)->guest_sysenter_esp = data;
2093		}
2094		vmcs_writel(GUEST_SYSENTER_ESP, data);
2095		break;
2096	case MSR_IA32_DEBUGCTLMSR:
2097		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2098						VM_EXIT_SAVE_DEBUG_CONTROLS)
2099			get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2100
2101		ret = kvm_set_msr_common(vcpu, msr_info);
2102		break;
2103
2104	case MSR_IA32_BNDCFGS:
2105		if (!kvm_mpx_supported() ||
2106		    (!msr_info->host_initiated &&
2107		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2108			return 1;
2109		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2110		    (data & MSR_IA32_BNDCFGS_RSVD))
2111			return 1;
2112		vmcs_write64(GUEST_BNDCFGS, data);
2113		break;
2114	case MSR_IA32_UMWAIT_CONTROL:
2115		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2116			return 1;
2117
2118		/* Reserved bit 1 and the upper bits [63:32] must be zero */
2119		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2120			return 1;
2121
2122		vmx->msr_ia32_umwait_control = data;
2123		break;
2124	case MSR_IA32_SPEC_CTRL:
2125		if (!msr_info->host_initiated &&
2126		    !guest_has_spec_ctrl_msr(vcpu))
2127			return 1;
2128
2129		if (kvm_spec_ctrl_test_value(data))
2130			return 1;
2131
2132		vmx->spec_ctrl = data;
2133		if (!data)
2134			break;
2135
2136		/*
2137		 * For non-nested:
2138		 * When it's written (to non-zero) for the first time, pass
2139		 * it through.
2140		 *
2141		 * For nested:
2142		 * The handling of the MSR bitmap for L2 guests is done in
2143		 * nested_vmx_prepare_msr_bitmap. We should not touch the
2144		 * vmcs02.msr_bitmap here since it gets completely overwritten
2145		 * in the merging. We update the vmcs01 here for L1 as well
2146		 * since it will end up touching the MSR anyway now.
2147		 */
2148		vmx_disable_intercept_for_msr(vcpu,
2149					      MSR_IA32_SPEC_CTRL,
2150					      MSR_TYPE_RW);
2151		break;
2152	case MSR_IA32_TSX_CTRL:
2153		if (!msr_info->host_initiated &&
2154		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2155			return 1;
2156		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2157			return 1;
2158		goto find_uret_msr;
2159	case MSR_IA32_PRED_CMD:
2160		if (!msr_info->host_initiated &&
2161		    !guest_has_pred_cmd_msr(vcpu))
2162			return 1;
2163
2164		if (data & ~PRED_CMD_IBPB)
2165			return 1;
2166		if (!boot_cpu_has(X86_FEATURE_IBPB))
2167			return 1;
2168		if (!data)
2169			break;
2170
2171		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2172
2173		/*
2174		 * For non-nested:
2175		 * When it's written (to non-zero) for the first time, pass
2176		 * it through.
2177		 *
2178		 * For nested:
2179		 * The handling of the MSR bitmap for L2 guests is done in
2180		 * nested_vmx_prepare_msr_bitmap. We should not touch the
2181		 * vmcs02.msr_bitmap here since it gets completely overwritten
2182		 * in the merging.
2183		 */
2184		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
2185		break;
2186	case MSR_IA32_CR_PAT:
2187		if (!kvm_pat_valid(data))
2188			return 1;
2189
2190		if (is_guest_mode(vcpu) &&
2191		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2192			get_vmcs12(vcpu)->guest_ia32_pat = data;
2193
2194		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2195			vmcs_write64(GUEST_IA32_PAT, data);
2196			vcpu->arch.pat = data;
2197			break;
2198		}
2199		ret = kvm_set_msr_common(vcpu, msr_info);
2200		break;
2201	case MSR_IA32_TSC_ADJUST:
2202		ret = kvm_set_msr_common(vcpu, msr_info);
2203		break;
2204	case MSR_IA32_MCG_EXT_CTL:
2205		if ((!msr_info->host_initiated &&
2206		     !(to_vmx(vcpu)->msr_ia32_feature_control &
2207		       FEAT_CTL_LMCE_ENABLED)) ||
2208		    (data & ~MCG_EXT_CTL_LMCE_EN))
2209			return 1;
2210		vcpu->arch.mcg_ext_ctl = data;
2211		break;
2212	case MSR_IA32_FEAT_CTL:
2213		if (!vmx_feature_control_msr_valid(vcpu, data) ||
2214		    (to_vmx(vcpu)->msr_ia32_feature_control &
2215		     FEAT_CTL_LOCKED && !msr_info->host_initiated))
2216			return 1;
2217		vmx->msr_ia32_feature_control = data;
2218		if (msr_info->host_initiated && data == 0)
2219			vmx_leave_nested(vcpu);
2220		break;
2221	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2222		if (!msr_info->host_initiated)
2223			return 1; /* they are read-only */
2224		if (!nested_vmx_allowed(vcpu))
2225			return 1;
2226		return vmx_set_vmx_msr(vcpu, msr_index, data);
2227	case MSR_IA32_RTIT_CTL:
2228		if (!vmx_pt_mode_is_host_guest() ||
2229			vmx_rtit_ctl_check(vcpu, data) ||
2230			vmx->nested.vmxon)
2231			return 1;
2232		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2233		vmx->pt_desc.guest.ctl = data;
2234		pt_update_intercept_for_msr(vcpu);
2235		break;
2236	case MSR_IA32_RTIT_STATUS:
2237		if (!pt_can_write_msr(vmx))
2238			return 1;
2239		if (data & MSR_IA32_RTIT_STATUS_MASK)
2240			return 1;
2241		vmx->pt_desc.guest.status = data;
2242		break;
2243	case MSR_IA32_RTIT_CR3_MATCH:
2244		if (!pt_can_write_msr(vmx))
2245			return 1;
2246		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2247					   PT_CAP_cr3_filtering))
2248			return 1;
2249		vmx->pt_desc.guest.cr3_match = data;
2250		break;
2251	case MSR_IA32_RTIT_OUTPUT_BASE:
2252		if (!pt_can_write_msr(vmx))
2253			return 1;
2254		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2255					   PT_CAP_topa_output) &&
2256		    !intel_pt_validate_cap(vmx->pt_desc.caps,
2257					   PT_CAP_single_range_output))
2258			return 1;
2259		if (!pt_output_base_valid(vcpu, data))
2260			return 1;
2261		vmx->pt_desc.guest.output_base = data;
2262		break;
2263	case MSR_IA32_RTIT_OUTPUT_MASK:
2264		if (!pt_can_write_msr(vmx))
2265			return 1;
2266		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2267					   PT_CAP_topa_output) &&
2268		    !intel_pt_validate_cap(vmx->pt_desc.caps,
2269					   PT_CAP_single_range_output))
2270			return 1;
2271		vmx->pt_desc.guest.output_mask = data;
2272		break;
2273	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2274		if (!pt_can_write_msr(vmx))
2275			return 1;
2276		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2277		if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2278						       PT_CAP_num_address_ranges))
2279			return 1;
2280		if (is_noncanonical_address(data, vcpu))
2281			return 1;
2282		if (index % 2)
2283			vmx->pt_desc.guest.addr_b[index / 2] = data;
2284		else
2285			vmx->pt_desc.guest.addr_a[index / 2] = data;
2286		break;
2287	case MSR_TSC_AUX:
2288		if (!msr_info->host_initiated &&
2289		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2290			return 1;
2291		/* Check reserved bit, higher 32 bits should be zero */
2292		if ((data >> 32) != 0)
2293			return 1;
2294		goto find_uret_msr;
2295
2296	default:
2297	find_uret_msr:
2298		msr = vmx_find_uret_msr(vmx, msr_index);
2299		if (msr)
2300			ret = vmx_set_guest_uret_msr(vmx, msr, data);
2301		else
2302			ret = kvm_set_msr_common(vcpu, msr_info);
2303	}
2304
2305	/* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2306	if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2307		vmx_update_fb_clear_dis(vcpu, vmx);
2308
2309	return ret;
2310}
2311
2312static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2313{
2314	unsigned long guest_owned_bits;
2315
2316	kvm_register_mark_available(vcpu, reg);
2317
2318	switch (reg) {
2319	case VCPU_REGS_RSP:
2320		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2321		break;
2322	case VCPU_REGS_RIP:
2323		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2324		break;
2325	case VCPU_EXREG_PDPTR:
2326		if (enable_ept)
2327			ept_save_pdptrs(vcpu);
2328		break;
2329	case VCPU_EXREG_CR0:
2330		guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2331
2332		vcpu->arch.cr0 &= ~guest_owned_bits;
2333		vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2334		break;
2335	case VCPU_EXREG_CR3:
2336		if (is_unrestricted_guest(vcpu) ||
2337		    (enable_ept && is_paging(vcpu)))
2338			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2339		break;
2340	case VCPU_EXREG_CR4:
2341		guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2342
2343		vcpu->arch.cr4 &= ~guest_owned_bits;
2344		vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2345		break;
2346	default:
2347		WARN_ON_ONCE(1);
2348		break;
2349	}
2350}
2351
2352static __init int cpu_has_kvm_support(void)
2353{
2354	return cpu_has_vmx();
2355}
2356
2357static __init int vmx_disabled_by_bios(void)
2358{
2359	return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2360	       !boot_cpu_has(X86_FEATURE_VMX);
2361}
2362
2363static int kvm_cpu_vmxon(u64 vmxon_pointer)
2364{
2365	u64 msr;
2366
2367	cr4_set_bits(X86_CR4_VMXE);
2368	intel_pt_handle_vmx(1);
2369
2370	asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
2371			  _ASM_EXTABLE(1b, %l[fault])
2372			  : : [vmxon_pointer] "m"(vmxon_pointer)
2373			  : : fault);
2374	return 0;
2375
2376fault:
2377	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2378		  rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2379	intel_pt_handle_vmx(0);
2380	cr4_clear_bits(X86_CR4_VMXE);
2381
2382	return -EFAULT;
2383}
2384
2385static int hardware_enable(void)
2386{
2387	int cpu = raw_smp_processor_id();
2388	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2389	int r;
2390
2391	if (cr4_read_shadow() & X86_CR4_VMXE)
2392		return -EBUSY;
2393
2394	/*
2395	 * This can happen if we hot-added a CPU but failed to allocate
2396	 * VP assist page for it.
2397	 */
2398	if (static_branch_unlikely(&enable_evmcs) &&
2399	    !hv_get_vp_assist_page(cpu))
2400		return -EFAULT;
2401
2402	r = kvm_cpu_vmxon(phys_addr);
2403	if (r)
2404		return r;
2405
2406	if (enable_ept)
2407		ept_sync_global();
2408
2409	return 0;
2410}
2411
2412static void vmclear_local_loaded_vmcss(void)
2413{
2414	int cpu = raw_smp_processor_id();
2415	struct loaded_vmcs *v, *n;
2416
2417	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2418				 loaded_vmcss_on_cpu_link)
2419		__loaded_vmcs_clear(v);
2420}
2421
2422
2423/*
2424 * Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() tricks.
2425 */
2426static void kvm_cpu_vmxoff(void)
2427{
2428	asm volatile (__ex("vmxoff"));
2429
2430	intel_pt_handle_vmx(0);
2431	cr4_clear_bits(X86_CR4_VMXE);
2432}
2433
2434static void hardware_disable(void)
2435{
2436	vmclear_local_loaded_vmcss();
2437	kvm_cpu_vmxoff();
2438}
2439
2440/*
2441 * There is no X86_FEATURE for SGX yet, so we need to query CPUID
2442 * directly instead of going through cpu_has(), to ensure KVM is trapping
2443 * ENCLS whenever it's supported in hardware.  It does not matter whether
2444 * the host OS supports or has enabled SGX.
2445 */
2446static bool cpu_has_sgx(void)
2447{
2448	return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2449}
2450
2451static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2452				      u32 msr, u32 *result)
2453{
2454	u32 vmx_msr_low, vmx_msr_high;
2455	u32 ctl = ctl_min | ctl_opt;
2456
2457	rdmsr(msr, vmx_msr_low, vmx_msr_high);
2458
2459	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2460	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2461
2462	/* Ensure minimum (required) set of control bits are supported. */
2463	if (ctl_min & ~ctl)
2464		return -EIO;
2465
2466	*result = ctl;
2467	return 0;
2468}
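
/*
 * Worked example of the adjustment above (all values are made up): with
 * ctl_min = 0x04, ctl_opt = 0x12, allowed-1 (high word) = 0x1f and
 * allowed-0 (low word) = 0x02:
 *
 *	ctl = ((0x04 | 0x12) & 0x1f) | 0x02 = 0x16
 *
 * No ctl_min bit was cleared by the allowed-1 mask, so the function
 * stores 0x16 in *result and returns 0.
 */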
2469
2470static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2471				    struct vmx_capability *vmx_cap)
2472{
2473	u32 vmx_msr_low, vmx_msr_high;
2474	u32 min, opt, min2, opt2;
2475	u32 _pin_based_exec_control = 0;
2476	u32 _cpu_based_exec_control = 0;
2477	u32 _cpu_based_2nd_exec_control = 0;
2478	u32 _vmexit_control = 0;
2479	u32 _vmentry_control = 0;
2480
2481	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2482	min = CPU_BASED_HLT_EXITING |
2483#ifdef CONFIG_X86_64
2484	      CPU_BASED_CR8_LOAD_EXITING |
2485	      CPU_BASED_CR8_STORE_EXITING |
2486#endif
2487	      CPU_BASED_CR3_LOAD_EXITING |
2488	      CPU_BASED_CR3_STORE_EXITING |
2489	      CPU_BASED_UNCOND_IO_EXITING |
2490	      CPU_BASED_MOV_DR_EXITING |
2491	      CPU_BASED_USE_TSC_OFFSETTING |
2492	      CPU_BASED_MWAIT_EXITING |
2493	      CPU_BASED_MONITOR_EXITING |
2494	      CPU_BASED_INVLPG_EXITING |
2495	      CPU_BASED_RDPMC_EXITING;
2496
2497	opt = CPU_BASED_TPR_SHADOW |
2498	      CPU_BASED_USE_MSR_BITMAPS |
2499	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2500	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2501				&_cpu_based_exec_control) < 0)
2502		return -EIO;
2503#ifdef CONFIG_X86_64
2504	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2505		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2506					   ~CPU_BASED_CR8_STORE_EXITING;
2507#endif
2508	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2509		min2 = 0;
2510		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2511			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2512			SECONDARY_EXEC_WBINVD_EXITING |
2513			SECONDARY_EXEC_ENABLE_VPID |
2514			SECONDARY_EXEC_ENABLE_EPT |
2515			SECONDARY_EXEC_UNRESTRICTED_GUEST |
2516			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2517			SECONDARY_EXEC_DESC |
2518			SECONDARY_EXEC_ENABLE_RDTSCP |
2519			SECONDARY_EXEC_ENABLE_INVPCID |
2520			SECONDARY_EXEC_APIC_REGISTER_VIRT |
2521			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2522			SECONDARY_EXEC_SHADOW_VMCS |
2523			SECONDARY_EXEC_XSAVES |
2524			SECONDARY_EXEC_RDSEED_EXITING |
2525			SECONDARY_EXEC_RDRAND_EXITING |
2526			SECONDARY_EXEC_ENABLE_PML |
2527			SECONDARY_EXEC_TSC_SCALING |
2528			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2529			SECONDARY_EXEC_PT_USE_GPA |
2530			SECONDARY_EXEC_PT_CONCEAL_VMX |
2531			SECONDARY_EXEC_ENABLE_VMFUNC;
2532		if (cpu_has_sgx())
2533			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
2534		if (adjust_vmx_controls(min2, opt2,
2535					MSR_IA32_VMX_PROCBASED_CTLS2,
2536					&_cpu_based_2nd_exec_control) < 0)
2537			return -EIO;
2538	}
2539#ifndef CONFIG_X86_64
2540	if (!(_cpu_based_2nd_exec_control &
2541				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2542		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2543#endif
2544
2545	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2546		_cpu_based_2nd_exec_control &= ~(
2547				SECONDARY_EXEC_APIC_REGISTER_VIRT |
2548				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2549				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2550
2551	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2552		&vmx_cap->ept, &vmx_cap->vpid);
2553
2554	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2555		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2556		   is enabled */
2557		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2558					     CPU_BASED_CR3_STORE_EXITING |
2559					     CPU_BASED_INVLPG_EXITING);
2560	} else if (vmx_cap->ept) {
2561		vmx_cap->ept = 0;
2562		pr_warn_once("EPT CAP should not exist when the 1-setting of the "
2563				"enable EPT VM-execution control is not supported\n");
2564	}
2565	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2566		vmx_cap->vpid) {
2567		vmx_cap->vpid = 0;
2568		pr_warn_once("VPID CAP should not exist when the 1-setting of the "
2569				"enable VPID VM-execution control is not supported\n");
2570	}
2571
2572	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
2573#ifdef CONFIG_X86_64
2574	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2575#endif
2576	opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
2577	      VM_EXIT_LOAD_IA32_PAT |
2578	      VM_EXIT_LOAD_IA32_EFER |
2579	      VM_EXIT_CLEAR_BNDCFGS |
2580	      VM_EXIT_PT_CONCEAL_PIP |
2581	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
2582	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2583				&_vmexit_control) < 0)
2584		return -EIO;
2585
2586	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2587	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
2588		 PIN_BASED_VMX_PREEMPTION_TIMER;
2589	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2590				&_pin_based_exec_control) < 0)
2591		return -EIO;
2592
2593	if (cpu_has_broken_vmx_preemption_timer())
2594		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2595	if (!(_cpu_based_2nd_exec_control &
2596		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2597		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2598
2599	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
2600	opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
2601	      VM_ENTRY_LOAD_IA32_PAT |
2602	      VM_ENTRY_LOAD_IA32_EFER |
2603	      VM_ENTRY_LOAD_BNDCFGS |
2604	      VM_ENTRY_PT_CONCEAL_PIP |
2605	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
2606	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2607				&_vmentry_control) < 0)
2608		return -EIO;
2609
2610	/*
2611	 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2612	 * can't be used due to an erratum where VM Exit may incorrectly clear
2613	 * IA32_PERF_GLOBAL_CTRL[34:32].  Work around the erratum by using the
2614	 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2615	 */
2616	if (boot_cpu_data.x86 == 0x6) {
2617		switch (boot_cpu_data.x86_model) {
2618		case 26: /* AAK155 */
2619		case 30: /* AAP115 */
2620		case 37: /* AAT100 */
2621		case 44: /* BC86,AAY89,BD102 */
2622		case 46: /* BA97 */
2623			_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2624			_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2625			pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2626					"does not work properly. Using workaround\n");
2627			break;
2628		default:
2629			break;
2630		}
2631	}
2632
2633
2634	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2635
2636	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2637	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
2638		return -EIO;
2639
2640#ifdef CONFIG_X86_64
2641	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2642	if (vmx_msr_high & (1u<<16))
2643		return -EIO;
2644#endif
2645
2646	/* Require Write-Back (WB) memory type for VMCS accesses. */
2647	if (((vmx_msr_high >> 18) & 15) != 6)
2648		return -EIO;
2649
2650	vmcs_conf->size = vmx_msr_high & 0x1fff;
2651	vmcs_conf->order = get_order(vmcs_conf->size);
2652	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
2653
2654	vmcs_conf->revision_id = vmx_msr_low;
2655
2656	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2657	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2658	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2659	vmcs_conf->vmexit_ctrl         = _vmexit_control;
2660	vmcs_conf->vmentry_ctrl        = _vmentry_control;
2661
2662#if IS_ENABLED(CONFIG_HYPERV)
2663	if (enlightened_vmcs)
2664		evmcs_sanitize_exec_ctrls(vmcs_conf);
2665#endif
2666
2667	return 0;
2668}
2669
2670struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2671{
2672	int node = cpu_to_node(cpu);
2673	struct page *pages;
2674	struct vmcs *vmcs;
2675
2676	pages = __alloc_pages_node(node, flags, vmcs_config.order);
2677	if (!pages)
2678		return NULL;
2679	vmcs = page_address(pages);
2680	memset(vmcs, 0, vmcs_config.size);
2681
2682	/* KVM supports Enlightened VMCS v1 only */
2683	if (static_branch_unlikely(&enable_evmcs))
2684		vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2685	else
2686		vmcs->hdr.revision_id = vmcs_config.revision_id;
2687
2688	if (shadow)
2689		vmcs->hdr.shadow_vmcs = 1;
2690	return vmcs;
2691}
2692
2693void free_vmcs(struct vmcs *vmcs)
2694{
2695	free_pages((unsigned long)vmcs, vmcs_config.order);
2696}
2697
2698/*
2699 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2700 */
2701void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2702{
2703	if (!loaded_vmcs->vmcs)
2704		return;
2705	loaded_vmcs_clear(loaded_vmcs);
2706	free_vmcs(loaded_vmcs->vmcs);
2707	loaded_vmcs->vmcs = NULL;
2708	if (loaded_vmcs->msr_bitmap)
2709		free_page((unsigned long)loaded_vmcs->msr_bitmap);
2710	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2711}
2712
2713int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2714{
2715	loaded_vmcs->vmcs = alloc_vmcs(false);
2716	if (!loaded_vmcs->vmcs)
2717		return -ENOMEM;
2718
2719	vmcs_clear(loaded_vmcs->vmcs);
2720
2721	loaded_vmcs->shadow_vmcs = NULL;
2722	loaded_vmcs->hv_timer_soft_disabled = false;
2723	loaded_vmcs->cpu = -1;
2724	loaded_vmcs->launched = 0;
2725
2726	if (cpu_has_vmx_msr_bitmap()) {
2727		loaded_vmcs->msr_bitmap = (unsigned long *)
2728				__get_free_page(GFP_KERNEL_ACCOUNT);
2729		if (!loaded_vmcs->msr_bitmap)
2730			goto out_vmcs;
2731		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2732	}
2733
2734	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2735	memset(&loaded_vmcs->controls_shadow, 0,
2736		sizeof(struct vmcs_controls_shadow));
2737
2738	return 0;
2739
2740out_vmcs:
2741	free_loaded_vmcs(loaded_vmcs);
2742	return -ENOMEM;
2743}
2744
2745static void free_kvm_area(void)
2746{
2747	int cpu;
2748
2749	for_each_possible_cpu(cpu) {
2750		free_vmcs(per_cpu(vmxarea, cpu));
2751		per_cpu(vmxarea, cpu) = NULL;
2752	}
2753}
2754
2755static __init int alloc_kvm_area(void)
2756{
2757	int cpu;
2758
2759	for_each_possible_cpu(cpu) {
2760		struct vmcs *vmcs;
2761
2762		vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2763		if (!vmcs) {
2764			free_kvm_area();
2765			return -ENOMEM;
2766		}
2767
2768		/*
2769		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
2770		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2771		 * revision_id reported by MSR_IA32_VMX_BASIC.
2772		 *
2773		 * However, even though it is not explicitly documented by
2774		 * the TLFS, the vmxarea passed as the VMXON argument should
2775		 * still be marked with the revision_id reported by the
2776		 * physical CPU.
2777		 */
2778		if (static_branch_unlikely(&enable_evmcs))
2779			vmcs->hdr.revision_id = vmcs_config.revision_id;
2780
2781		per_cpu(vmxarea, cpu) = vmcs;
2782	}
2783	return 0;
2784}
2785
2786static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2787		struct kvm_segment *save)
2788{
2789	if (!emulate_invalid_guest_state) {
2790		/*
2791		 * CS and SS RPL should be equal during guest entry according
2792		 * to VMX spec, but in reality it is not always so. Since vcpu
2793		 * is in the middle of the transition from real mode to
2794		 * protected mode it is safe to assume that RPL 0 is a good
2795		 * default value.
2796		 */
2797		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2798			save->selector &= ~SEGMENT_RPL_MASK;
2799		save->dpl = save->selector & SEGMENT_RPL_MASK;
2800		save->s = 1;
2801	}
2802	vmx_set_segment(vcpu, save, seg);
2803}
2804
2805static void enter_pmode(struct kvm_vcpu *vcpu)
2806{
2807	unsigned long flags;
2808	struct vcpu_vmx *vmx = to_vmx(vcpu);
2809
2810	/*
2811	 * Update the real mode segment cache. It may not be up-to-date if the
2812	 * segment register was written while the vcpu was in guest mode.
2813	 */
2814	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2815	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2816	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2817	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2818	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2819	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2820
2821	vmx->rmode.vm86_active = 0;
2822
2823	vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2824
2825	flags = vmcs_readl(GUEST_RFLAGS);
2826	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2827	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2828	vmcs_writel(GUEST_RFLAGS, flags);
2829
2830	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2831			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
2832
2833	update_exception_bitmap(vcpu);
2834
2835	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2836	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2837	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2838	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2839	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2840	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2841}
2842
2843static void fix_rmode_seg(int seg, struct kvm_segment *save)
2844{
2845	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2846	struct kvm_segment var = *save;
2847
2848	var.dpl = 0x3;
2849	if (seg == VCPU_SREG_CS)
2850		var.type = 0x3;
2851
2852	if (!emulate_invalid_guest_state) {
2853		var.selector = var.base >> 4;
2854		var.base = var.base & 0xffff0;
2855		var.limit = 0xffff;
2856		var.g = 0;
2857		var.db = 0;
2858		var.present = 1;
2859		var.s = 1;
2860		var.l = 0;
2861		var.unusable = 0;
2862		var.type = 0x3;
2863		var.avl = 0;
2864		if (save->base & 0xf)
2865			printk_once(KERN_WARNING "kvm: segment base is not "
2866					"paragraph aligned when entering "
2867					"protected mode (seg=%d)", seg);
2868	}
2869
2870	vmcs_write16(sf->selector, var.selector);
2871	vmcs_writel(sf->base, var.base);
2872	vmcs_write32(sf->limit, var.limit);
2873	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2874}
2875
2876static void enter_rmode(struct kvm_vcpu *vcpu)
2877{
2878	unsigned long flags;
2879	struct vcpu_vmx *vmx = to_vmx(vcpu);
2880	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
2881
2882	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2883	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2884	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2885	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2886	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2887	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2888	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2889
2890	vmx->rmode.vm86_active = 1;
2891
2892	/*
2893	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2894	 * vcpu. Warn the user that an update is overdue.
2895	 */
2896	if (!kvm_vmx->tss_addr)
2897		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
2898			     "called before entering vcpu\n");
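
	/*
	 * For illustration, the userspace side of the warning above (the
	 * address is only an example):
	 *
	 *	ioctl(vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000ul);
	 *
	 * issued once per VM before the first KVM_RUN, reserving three pages
	 * for the real-mode TSS used when unrestricted guest is unavailable.
	 */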
2899
2900	vmx_segment_cache_clear(vmx);
2901
2902	vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
2903	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2904	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2905
2906	flags = vmcs_readl(GUEST_RFLAGS);
2907	vmx->rmode.save_rflags = flags;
2908
2909	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2910
2911	vmcs_writel(GUEST_RFLAGS, flags);
2912	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2913	update_exception_bitmap(vcpu);
2914
2915	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2916	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2917	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2918	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2919	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2920	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2921
2922	kvm_mmu_reset_context(vcpu);
2923}
2924
2925int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
2926{
2927	struct vcpu_vmx *vmx = to_vmx(vcpu);
2928	struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
2929
2930	/* Nothing to do if hardware doesn't support EFER. */
2931	if (!msr)
2932		return 0;
2933
2934	vcpu->arch.efer = efer;
2935	if (efer & EFER_LMA) {
2936		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2937		msr->data = efer;
2938	} else {
2939		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2940
2941		msr->data = efer & ~EFER_LME;
2942	}
2943	setup_msrs(vmx);
2944	return 0;
2945}
2946
2947#ifdef CONFIG_X86_64
2948
2949static void enter_lmode(struct kvm_vcpu *vcpu)
2950{
2951	u32 guest_tr_ar;
2952
2953	vmx_segment_cache_clear(to_vmx(vcpu));
2954
2955	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2956	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
2957		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2958				     __func__);
2959		vmcs_write32(GUEST_TR_AR_BYTES,
2960			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
2961			     | VMX_AR_TYPE_BUSY_64_TSS);
2962	}
2963	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
2964}
2965
2966static void exit_lmode(struct kvm_vcpu *vcpu)
2967{
2968	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
2969	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
2970}
2971
2972#endif
2973
2974static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
2975{
2976	struct vcpu_vmx *vmx = to_vmx(vcpu);
2977
2978	/*
2979	 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
2980	 * the CPU is not required to invalidate guest-physical mappings on
2981	 * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
2982	 * associated with the root EPT structure and not any particular VPID
2983	 * (INVVPID also isn't required to invalidate guest-physical mappings).
2984	 */
2985	if (enable_ept) {
2986		ept_sync_global();
2987	} else if (enable_vpid) {
2988		if (cpu_has_vmx_invvpid_global()) {
2989			vpid_sync_vcpu_global();
2990		} else {
2991			vpid_sync_vcpu_single(vmx->vpid);
2992			vpid_sync_vcpu_single(vmx->nested.vpid02);
2993		}
2994	}
2995}
2996
2997static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
2998{
2999	if (is_guest_mode(vcpu))
3000		return nested_get_vpid02(vcpu);
3001	return to_vmx(vcpu)->vpid;
3002}
3003
3004static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3005{
3006	struct kvm_mmu *mmu = vcpu->arch.mmu;
3007	u64 root_hpa = mmu->root_hpa;
3008
3009	/* No flush required if the current context is invalid. */
3010	if (!VALID_PAGE(root_hpa))
3011		return;
3012
3013	if (enable_ept)
3014		ept_sync_context(construct_eptp(vcpu, root_hpa,
3015						mmu->shadow_root_level));
3016	else
3017		vpid_sync_context(vmx_get_current_vpid(vcpu));
3018}
3019
3020static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3021{
3022	/*
3023	 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
3024	 * vmx_flush_tlb_guest() for an explanation of why this is ok.
3025	 */
3026	vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
3027}
3028
3029static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
3030{
3031	/*
3032	 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
3033	 * vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit are
3034	 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
3035	 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
3036	 * i.e. no explicit INVVPID is necessary.
3037	 */
3038	vpid_sync_context(vmx_get_current_vpid(vcpu));
3039}
3040
3041void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3042{
3043	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3044
3045	if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3046		return;
3047
3048	if (is_pae_paging(vcpu)) {
3049		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3050		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3051		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3052		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3053	}
3054}
3055
3056void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3057{
3058	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3059
3060	if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3061		return;
3062
3063	mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3064	mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3065	mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3066	mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3067
3068	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
3069}
3070
3071#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
3072			  CPU_BASED_CR3_STORE_EXITING)
3073
3074void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3075{
3076	struct vcpu_vmx *vmx = to_vmx(vcpu);
3077	unsigned long hw_cr0;
3078	u32 tmp;
3079
3080	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3081	if (enable_unrestricted_guest)
3082		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3083	else {
3084		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3085		if (!enable_ept)
3086			hw_cr0 |= X86_CR0_WP;
3087
3088		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3089			enter_pmode(vcpu);
3090
3091		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3092			enter_rmode(vcpu);
3093	}
3094
3095#ifdef CONFIG_X86_64
3096	if (vcpu->arch.efer & EFER_LME) {
3097		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
3098			enter_lmode(vcpu);
3099		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
3100			exit_lmode(vcpu);
3101	}
3102#endif
3103
3104	if (enable_ept && !enable_unrestricted_guest) {
3105		/*
3106		 * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
3107		 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
3108		 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3109		 * KVM's CR3 is installed.
3110		 */
3111		if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3112			vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3113
3114		/*
3115		 * When running with EPT but not unrestricted guest, KVM must
3116		 * intercept CR3 accesses when paging is _disabled_.  This is
3117		 * necessary because restricted guests can't actually run with
3118		 * paging disabled, and so KVM stuffs its own CR3 in order to
3119		 * run the guest with identity mapped page tables.
3120		 *
3121		 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3122		 * update, it may be stale with respect to CR3 interception,
3123		 * e.g. after nested VM-Enter.
3124		 *
3125		 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3126		 * stores to forward them to L1, even if KVM does not need to
3127		 * intercept them to preserve its identity mapped page tables.
3128		 */
3129		if (!(cr0 & X86_CR0_PG)) {
3130			exec_controls_setbit(vmx, CR3_EXITING_BITS);
3131		} else if (!is_guest_mode(vcpu)) {
3132			exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3133		} else {
3134			tmp = exec_controls_get(vmx);
3135			tmp &= ~CR3_EXITING_BITS;
3136			tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3137			exec_controls_set(vmx, tmp);
3138		}
3139
3140		if (!is_paging(vcpu) != !(cr0 & X86_CR0_PG)) {
3141			vcpu->arch.cr0 = cr0;
3142			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3143		}
3144	}
3145
3146	vmcs_writel(CR0_READ_SHADOW, cr0);
3147	vmcs_writel(GUEST_CR0, hw_cr0);
3148	vcpu->arch.cr0 = cr0;
3149	kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
3150
3151	/* depends on vcpu->arch.cr0 to be set to a new value */
3152	vmx->emulation_required = emulation_required(vcpu);
3153}
3154
3155static int vmx_get_max_tdp_level(void)
3156{
3157	if (cpu_has_vmx_ept_5levels())
3158		return 5;
3159	return 4;
3160}
3161
3162u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
3163		   int root_level)
3164{
3165	u64 eptp = VMX_EPTP_MT_WB;
3166
3167	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3168
3169	if (enable_ept_ad_bits &&
3170	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3171		eptp |= VMX_EPTP_AD_ENABLE_BIT;
3172	eptp |= (root_hpa & PAGE_MASK);
3173
3174	return eptp;
3175}
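
/*
 * For reference, the EPTP layout assembled above (per the SDM): bits 2:0
 * select the memory type (6 = write-back), bits 5:3 hold the page-walk
 * length minus one (3 for 4-level, 4 for 5-level paging), bit 6 enables
 * accessed/dirty flags and bits 51:12 carry the physical address of the
 * root table.
 */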
3176
3177static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
3178			     int pgd_level)
3179{
3180	struct kvm *kvm = vcpu->kvm;
3181	bool update_guest_cr3 = true;
3182	unsigned long guest_cr3;
3183	u64 eptp;
3184
3185	if (enable_ept) {
3186		eptp = construct_eptp(vcpu, pgd, pgd_level);
3187		vmcs_write64(EPT_POINTER, eptp);
3188
3189		if (kvm_x86_ops.tlb_remote_flush) {
3190			spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3191			to_vmx(vcpu)->ept_pointer = eptp;
3192			to_kvm_vmx(kvm)->ept_pointers_match
3193				= EPT_POINTERS_CHECK;
3194			spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3195		}
3196
3197		if (!enable_unrestricted_guest && !is_paging(vcpu))
3198			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3199		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3200			guest_cr3 = vcpu->arch.cr3;
3201		else /* vmcs01.GUEST_CR3 is already up-to-date. */
3202			update_guest_cr3 = false;
3203		vmx_ept_load_pdptrs(vcpu);
3204	} else {
3205		guest_cr3 = pgd;
3206	}
3207
3208	if (update_guest_cr3)
3209		vmcs_writel(GUEST_CR3, guest_cr3);
3210}
3211
3212static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3213{
3214	/*
3215	 * We operate under the default treatment of SMM, so VMX cannot be
3216	 * enabled under SMM.  Note, whether or not VMXE is allowed at all is
3217	 * handled by kvm_valid_cr4().
3218	 */
3219	if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3220		return false;
3221
3222	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3223		return false;
3224
3225	return true;
3226}
3227
3228void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3229{
3230	struct vcpu_vmx *vmx = to_vmx(vcpu);
3231	/*
3232	 * Pass through host's Machine Check Enable value to hw_cr4, which
3233	 * is in force while we are in guest mode.  Do not let guests control
3234	 * this bit, even if host CR4.MCE == 0.
3235	 */
3236	unsigned long hw_cr4;
3237
3238	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3239	if (enable_unrestricted_guest)
3240		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3241	else if (vmx->rmode.vm86_active)
3242		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3243	else
3244		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3245
3246	if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3247		if (cr4 & X86_CR4_UMIP) {
3248			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3249			hw_cr4 &= ~X86_CR4_UMIP;
3250		} else if (!is_guest_mode(vcpu) ||
3251			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3252			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3253		}
3254	}
3255
3256	vcpu->arch.cr4 = cr4;
3257	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
3258
3259	if (!enable_unrestricted_guest) {
3260		if (enable_ept) {
3261			if (!is_paging(vcpu)) {
3262				hw_cr4 &= ~X86_CR4_PAE;
3263				hw_cr4 |= X86_CR4_PSE;
3264			} else if (!(cr4 & X86_CR4_PAE)) {
3265				hw_cr4 &= ~X86_CR4_PAE;
3266			}
3267		}
3268
3269		/*
3270		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3271		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
3272		 * to be manually disabled when guest switches to non-paging
3273		 * mode.
3274		 *
3275		 * If !enable_unrestricted_guest, the CPU is always running
3276		 * with CR0.PG=1 and CR4 needs to be modified.
3277		 * If enable_unrestricted_guest, the CPU automatically
3278		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3279		 */
3280		if (!is_paging(vcpu))
3281			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3282	}
3283
3284	vmcs_writel(CR4_READ_SHADOW, cr4);
3285	vmcs_writel(GUEST_CR4, hw_cr4);
3286}
3287
3288void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3289{
3290	struct vcpu_vmx *vmx = to_vmx(vcpu);
3291	u32 ar;
3292
3293	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3294		*var = vmx->rmode.segs[seg];
3295		if (seg == VCPU_SREG_TR
3296		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3297			return;
3298		var->base = vmx_read_guest_seg_base(vmx, seg);
3299		var->selector = vmx_read_guest_seg_selector(vmx, seg);
3300		return;
3301	}
3302	var->base = vmx_read_guest_seg_base(vmx, seg);
3303	var->limit = vmx_read_guest_seg_limit(vmx, seg);
3304	var->selector = vmx_read_guest_seg_selector(vmx, seg);
3305	ar = vmx_read_guest_seg_ar(vmx, seg);
3306	var->unusable = (ar >> 16) & 1;
3307	var->type = ar & 15;
3308	var->s = (ar >> 4) & 1;
3309	var->dpl = (ar >> 5) & 3;
3310	/*
3311	 * Some userspaces do not preserve the unusable property. Since a usable
3312	 * segment has to be present according to the VMX spec, we can use the
3313	 * present property to work around the userspace bug by making an unusable
3314	 * segment always nonpresent. vmx_segment_access_rights() already marks a
3315	 * nonpresent segment as unusable.
3316	 */
3317	var->present = !var->unusable;
3318	var->avl = (ar >> 12) & 1;
3319	var->l = (ar >> 13) & 1;
3320	var->db = (ar >> 14) & 1;
3321	var->g = (ar >> 15) & 1;
3322}
3323
3324static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3325{
3326	struct kvm_segment s;
3327
3328	if (to_vmx(vcpu)->rmode.vm86_active) {
3329		vmx_get_segment(vcpu, &s, seg);
3330		return s.base;
3331	}
3332	return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3333}
3334
3335int vmx_get_cpl(struct kvm_vcpu *vcpu)
3336{
3337	struct vcpu_vmx *vmx = to_vmx(vcpu);
3338
3339	if (unlikely(vmx->rmode.vm86_active))
3340		return 0;
3341	else {
3342		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3343		return VMX_AR_DPL(ar);
3344	}
3345}
3346
3347static u32 vmx_segment_access_rights(struct kvm_segment *var)
3348{
3349	u32 ar;
3350
3351	ar = var->type & 15;
3352	ar |= (var->s & 1) << 4;
3353	ar |= (var->dpl & 3) << 5;
3354	ar |= (var->present & 1) << 7;
3355	ar |= (var->avl & 1) << 12;
3356	ar |= (var->l & 1) << 13;
3357	ar |= (var->db & 1) << 14;
3358	ar |= (var->g & 1) << 15;
3359	ar |= (var->unusable || !var->present) << 16;
3360
3361	return ar;
3362}
3363
3364void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3365{
3366	struct vcpu_vmx *vmx = to_vmx(vcpu);
3367	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3368
3369	vmx_segment_cache_clear(vmx);
3370
3371	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3372		vmx->rmode.segs[seg] = *var;
3373		if (seg == VCPU_SREG_TR)
3374			vmcs_write16(sf->selector, var->selector);
3375		else if (var->s)
3376			fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3377		goto out;
3378	}
3379
3380	vmcs_writel(sf->base, var->base);
3381	vmcs_write32(sf->limit, var->limit);
3382	vmcs_write16(sf->selector, var->selector);
3383
3384	/*
3385	 * Fix the "Accessed" bit in the AR field of segment registers for older
3386	 * qemu binaries.
3387	 * The IA-32 architecture specifies that at the time of processor reset
3388	 * the "Accessed" bit in the AR field of segment registers is 1, but qemu
3389	 * sets it to 0 in its userland code. This causes an invalid guest state
3390	 * vmexit when "unrestricted guest" mode is turned on.
3391	 * A fix for this setup issue in cpu_reset has been pushed to the qemu
3392	 * tree. Newer qemu binaries with that fix do not need this
3393	 * kvm hack.
3394	 */
3395	if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3396		var->type |= 0x1; /* Accessed */
3397
3398	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3399
3400out:
3401	vmx->emulation_required = emulation_required(vcpu);
3402}
3403
3404static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3405{
3406	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3407
3408	*db = (ar >> 14) & 1;
3409	*l = (ar >> 13) & 1;
3410}
3411
3412static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3413{
3414	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3415	dt->address = vmcs_readl(GUEST_IDTR_BASE);
3416}
3417
3418static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3419{
3420	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3421	vmcs_writel(GUEST_IDTR_BASE, dt->address);
3422}
3423
3424static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3425{
3426	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3427	dt->address = vmcs_readl(GUEST_GDTR_BASE);
3428}
3429
3430static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3431{
3432	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3433	vmcs_writel(GUEST_GDTR_BASE, dt->address);
3434}
3435
3436static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3437{
3438	struct kvm_segment var;
3439	u32 ar;
3440
3441	vmx_get_segment(vcpu, &var, seg);
3442	var.dpl = 0x3;
3443	if (seg == VCPU_SREG_CS)
3444		var.type = 0x3;
3445	ar = vmx_segment_access_rights(&var);
3446
3447	if (var.base != (var.selector << 4))
3448		return false;
3449	if (var.limit != 0xffff)
3450		return false;
3451	if (ar != 0xf3)
3452		return false;
3453
3454	return true;
3455}
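
/*
 * Example of a segment that passes the check above (selector is made up):
 * selector 0x1234 requires base 0x12340 (selector << 4), limit 0xffff and
 * access rights 0xf3, i.e. a present, DPL 3, accessed read/write segment.
 */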
3456
3457static bool code_segment_valid(struct kvm_vcpu *vcpu)
3458{
3459	struct kvm_segment cs;
3460	unsigned int cs_rpl;
3461
3462	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3463	cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3464
3465	if (cs.unusable)
3466		return false;
3467	if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3468		return false;
3469	if (!cs.s)
3470		return false;
3471	if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3472		if (cs.dpl > cs_rpl)
3473			return false;
3474	} else {
3475		if (cs.dpl != cs_rpl)
3476			return false;
3477	}
3478	if (!cs.present)
3479		return false;
3480
3481	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3482	return true;
3483}
3484
3485static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3486{
3487	struct kvm_segment ss;
3488	unsigned int ss_rpl;
3489
3490	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3491	ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3492
3493	if (ss.unusable)
3494		return true;
3495	if (ss.type != 3 && ss.type != 7)
3496		return false;
3497	if (!ss.s)
3498		return false;
3499	if (ss.dpl != ss_rpl) /* DPL != RPL */
3500		return false;
3501	if (!ss.present)
3502		return false;
3503
3504	return true;
3505}
3506
3507static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3508{
3509	struct kvm_segment var;
3510	unsigned int rpl;
3511
3512	vmx_get_segment(vcpu, &var, seg);
3513	rpl = var.selector & SEGMENT_RPL_MASK;
3514
3515	if (var.unusable)
3516		return true;
3517	if (!var.s)
3518		return false;
3519	if (!var.present)
3520		return false;
3521	if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3522		if (var.dpl < rpl) /* DPL < RPL */
3523			return false;
3524	}
3525
3526	/* TODO: Add other members to kvm_segment_field to allow checking for other access
3527	 * rights flags
3528	 */
3529	return true;
3530}
3531
3532static bool tr_valid(struct kvm_vcpu *vcpu)
3533{
3534	struct kvm_segment tr;
3535
3536	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3537
3538	if (tr.unusable)
3539		return false;
3540	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
3541		return false;
3542	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3543		return false;
3544	if (!tr.present)
3545		return false;
3546
3547	return true;
3548}
3549
3550static bool ldtr_valid(struct kvm_vcpu *vcpu)
3551{
3552	struct kvm_segment ldtr;
3553
3554	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3555
3556	if (ldtr.unusable)
3557		return true;
3558	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
3559		return false;
3560	if (ldtr.type != 2)
3561		return false;
3562	if (!ldtr.present)
3563		return false;
3564
3565	return true;
3566}
3567
3568static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3569{
3570	struct kvm_segment cs, ss;
3571
3572	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3573	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3574
3575	return ((cs.selector & SEGMENT_RPL_MASK) ==
3576		 (ss.selector & SEGMENT_RPL_MASK));
3577}
3578
3579/*
3580 * Check if guest state is valid. Returns true if valid, false if
3581 * not.
3582 * We assume that registers are always usable.
3583 */
3584bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3585{
3586	/* real mode guest state checks */
3587	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3588		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3589			return false;
3590		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3591			return false;
3592		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3593			return false;
3594		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3595			return false;
3596		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3597			return false;
3598		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3599			return false;
3600	} else {
3601	/* protected mode guest state checks */
3602		if (!cs_ss_rpl_check(vcpu))
3603			return false;
3604		if (!code_segment_valid(vcpu))
3605			return false;
3606		if (!stack_segment_valid(vcpu))
3607			return false;
3608		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3609			return false;
3610		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3611			return false;
3612		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3613			return false;
3614		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3615			return false;
3616		if (!tr_valid(vcpu))
3617			return false;
3618		if (!ldtr_valid(vcpu))
3619			return false;
3620	}
3621	/* TODO:
3622	 * - Add checks on RIP
3623	 * - Add checks on RFLAGS
3624	 */
3625
3626	return true;
3627}
3628
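/*
 * Lay out a minimal TSS for real-mode emulation inside the three-page
 * guest-memory region reserved by vmx_set_tss_addr(): clear the pages,
 * point the I/O-bitmap base just past the interrupt-redirection map, and
 * write the all-ones byte the architecture requires after the I/O
 * permission bitmap.
 */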
3629static int init_rmode_tss(struct kvm *kvm)
3630{
3631	gfn_t fn;
3632	u16 data = 0;
3633	int idx, r;
3634
3635	idx = srcu_read_lock(&kvm->srcu);
3636	fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
3637	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3638	if (r < 0)
3639		goto out;
3640	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3641	r = kvm_write_guest_page(kvm, fn++, &data,
3642			TSS_IOPB_BASE_OFFSET, sizeof(u16));
3643	if (r < 0)
3644		goto out;
3645	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
3646	if (r < 0)
3647		goto out;
3648	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3649	if (r < 0)
3650		goto out;
3651	data = ~0;
3652	r = kvm_write_guest_page(kvm, fn, &data,
3653				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
3654				 sizeof(u8));
3655out:
3656	srcu_read_unlock(&kvm->srcu, idx);
3657	return r;
3658}
3659
3660static int init_rmode_identity_map(struct kvm *kvm)
3661{
3662	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3663	int i, r = 0;
3664	kvm_pfn_t identity_map_pfn;
3665	u32 tmp;
3666
3667	/* Protect kvm_vmx->ept_identity_pagetable_done. */
3668	mutex_lock(&kvm->slots_lock);
3669
3670	if (likely(kvm_vmx->ept_identity_pagetable_done))
3671		goto out;
3672
3673	if (!kvm_vmx->ept_identity_map_addr)
3674		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3675	identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
3676
3677	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3678				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
3679	if (r < 0)
3680		goto out;
3681
3682	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
3683	if (r < 0)
3684		goto out;
3685	/* Set up identity-mapping pagetable for EPT in real mode */
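	/*
	 * Each of the 1024 PSE entries maps a 4 MiB page, so entry i covers
	 * guest-physical addresses [i << 22, (i << 22) + 4 MiB).  The flag
	 * bits OR together to 0xe7; entry 1, for example, is 0x004000e7 and
	 * identity-maps 0x400000-0x7fffff.
	 */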
3686	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
3687		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3688			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3689		r = kvm_write_guest_page(kvm, identity_map_pfn,
3690				&tmp, i * sizeof(tmp), sizeof(tmp));
3691		if (r < 0)
3692			goto out;
3693	}
3694	kvm_vmx->ept_identity_pagetable_done = true;
3695
3696out:
3697	mutex_unlock(&kvm->slots_lock);
3698	return r;
3699}
3700
3701static void seg_setup(int seg)
3702{
3703	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3704	unsigned int ar;
3705
3706	vmcs_write16(sf->selector, 0);
3707	vmcs_writel(sf->base, 0);
3708	vmcs_write32(sf->limit, 0xffff);
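	/*
	 * 0x93 encodes present, S=1, type 3 (read/write data segment,
	 * accessed); OR-ing in 0x08 below yields type 0xb (execute/read
	 * code segment, accessed) for CS.
	 */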
3709	ar = 0x93;
3710	if (seg == VCPU_SREG_CS)
3711		ar |= 0x08; /* code segment */
3712
3713	vmcs_write32(sf->ar_bytes, ar);
3714}
3715
3716static int alloc_apic_access_page(struct kvm *kvm)
3717{
3718	struct page *page;
3719	int r = 0;
3720
3721	mutex_lock(&kvm->slots_lock);
3722	if (kvm->arch.apic_access_page_done)
3723		goto out;
3724	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
3725				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
3726	if (r)
3727		goto out;
3728
3729	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
3730	if (is_error_page(page)) {
3731		r = -EFAULT;
3732		goto out;
3733	}
3734
3735	/*
3736	 * Do not pin the page in memory, so that it can be migrated
3737	 * if the backing memory is hot-unplugged.
3738	 */
3739	put_page(page);
3740	kvm->arch.apic_access_page_done = true;
3741out:
3742	mutex_unlock(&kvm->slots_lock);
3743	return r;
3744}
3745
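/*
 * VPID 0 is used by the host and is never handed out here (bit 0 of
 * vmx_vpid_bitmap is expected to be pre-set during hardware setup), so a
 * return value of 0 means no free VPID was found; in that case
 * SECONDARY_EXEC_ENABLE_VPID is cleared for the vCPU, see
 * vmx_compute_secondary_exec_control().
 */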
3746int allocate_vpid(void)
3747{
3748	int vpid;
3749
3750	if (!enable_vpid)
3751		return 0;
3752	spin_lock(&vmx_vpid_lock);
3753	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3754	if (vpid < VMX_NR_VPIDS)
3755		__set_bit(vpid, vmx_vpid_bitmap);
3756	else
3757		vpid = 0;
3758	spin_unlock(&vmx_vpid_lock);
3759	return vpid;
3760}
3761
3762void free_vpid(int vpid)
3763{
3764	if (!enable_vpid || vpid == 0)
3765		return;
3766	spin_lock(&vmx_vpid_lock);
3767	__clear_bit(vpid, vmx_vpid_bitmap);
3768	spin_unlock(&vmx_vpid_lock);
3769}
3770
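/*
 * The MSR bitmap is a single 4 KiB page split into four 1 KiB regions:
 * reads of low MSRs (0x00000000-0x00001fff) at offset 0x000, reads of
 * high MSRs (0xc0000000-0xc0001fff) at offset 0x400, writes of low MSRs
 * at 0x800 and writes of high MSRs at 0xc00.  A set bit intercepts the
 * access; e.g. MSR_IA32_SPEC_CTRL (0x48) uses bit 0x48 of the first
 * region, while MSR_FS_BASE (0xc0000100) uses bit 0x100 of the 0x400
 * region.
 */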
3771static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
3772{
3773	int f = sizeof(unsigned long);
3774
3775	if (msr <= 0x1fff)
3776		__clear_bit(msr, msr_bitmap + 0x000 / f);
3777	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
3778		__clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
3779}
3780
3781static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
3782{
3783	int f = sizeof(unsigned long);
3784
3785	if (msr <= 0x1fff)
3786		__clear_bit(msr, msr_bitmap + 0x800 / f);
3787	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
3788		__clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
3789}
3790
3791static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
3792{
3793	int f = sizeof(unsigned long);
3794
3795	if (msr <= 0x1fff)
3796		__set_bit(msr, msr_bitmap + 0x000 / f);
3797	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
3798		__set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
3799}
3800
3801static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
3802{
3803	int f = sizeof(unsigned long);
3804
3805	if (msr <= 0x1fff)
3806		__set_bit(msr, msr_bitmap + 0x800 / f);
3807	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
3808		__set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
3809}
3810
3811static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3812{
3813	/*
3814	 * When KVM is a nested hypervisor on top of Hyper-V and uses
3815	 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
3816	 * bitmap has changed.
3817	 */
3818	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
3819		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
3820
3821		if (evmcs->hv_enlightenments_control.msr_bitmap)
3822			evmcs->hv_clean_fields &=
3823				~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
3824	}
3825}
3826
3827static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
3828							  u32 msr, int type)
3829{
3830	struct vcpu_vmx *vmx = to_vmx(vcpu);
3831	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3832
3833	if (!cpu_has_vmx_msr_bitmap())
3834		return;
3835
3836	vmx_msr_bitmap_l01_changed(vmx);
3837
3838	/*
3839	 * Mark the desired intercept state in the shadow bitmap; this is
3840	 * needed to resync when the MSR filters change.
3841	 */
3842	if (is_valid_passthrough_msr(msr)) {
3843		int idx = possible_passthrough_msr_slot(msr);
3844
3845		if (idx != -ENOENT) {
3846			if (type & MSR_TYPE_R)
3847				clear_bit(idx, vmx->shadow_msr_intercept.read);
3848			if (type & MSR_TYPE_W)
3849				clear_bit(idx, vmx->shadow_msr_intercept.write);
3850		}
3851	}
3852
3853	if ((type & MSR_TYPE_R) &&
3854	    !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
3855		vmx_set_msr_bitmap_read(msr_bitmap, msr);
3856		type &= ~MSR_TYPE_R;
3857	}
3858
3859	if ((type & MSR_TYPE_W) &&
3860	    !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
3861		vmx_set_msr_bitmap_write(msr_bitmap, msr);
3862		type &= ~MSR_TYPE_W;
3863	}
3864
3865	if (type & MSR_TYPE_R)
3866		vmx_clear_msr_bitmap_read(msr_bitmap, msr);
3867
3868	if (type & MSR_TYPE_W)
3869		vmx_clear_msr_bitmap_write(msr_bitmap, msr);
3870}
3871
3872static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
3873							 u32 msr, int type)
3874{
3875	struct vcpu_vmx *vmx = to_vmx(vcpu);
3876	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3877
3878	if (!cpu_has_vmx_msr_bitmap())
3879		return;
3880
3881	vmx_msr_bitmap_l01_changed(vmx);
3882
3883	/*
3884	 * Mark the desired intercept state in the shadow bitmap; this is
3885	 * needed to resync when the MSR filter changes.
3886	 */
3887	if (is_valid_passthrough_msr(msr)) {
3888		int idx = possible_passthrough_msr_slot(msr);
3889
3890		if (idx != -ENOENT) {
3891			if (type & MSR_TYPE_R)
3892				set_bit(idx, vmx->shadow_msr_intercept.read);
3893			if (type & MSR_TYPE_W)
3894				set_bit(idx, vmx->shadow_msr_intercept.write);
3895		}
3896	}
3897
3898	if (type & MSR_TYPE_R)
3899		vmx_set_msr_bitmap_read(msr_bitmap, msr);
3900
3901	if (type & MSR_TYPE_W)
3902		vmx_set_msr_bitmap_write(msr_bitmap, msr);
3903}
3904
3905static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
3906						      u32 msr, int type, bool value)
3907{
3908	if (value)
3909		vmx_enable_intercept_for_msr(vcpu, msr, type);
3910	else
3911		vmx_disable_intercept_for_msr(vcpu, msr, type);
3912}
3913
3914static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
3915{
3916	u8 mode = 0;
3917
3918	if (cpu_has_secondary_exec_ctrls() &&
3919	    (secondary_exec_controls_get(to_vmx(vcpu)) &
3920	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
3921		mode |= MSR_BITMAP_MODE_X2APIC;
3922		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
3923			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
3924	}
3925
3926	return mode;
3927}
3928
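/*
 * The x2APIC MSRs (0x800-0x8ff) all fall in the low-MSR half of the
 * bitmap, so whole longs can be updated at once: reads are passed
 * through when APICv virtual-interrupt delivery is active, while writes
 * start out intercepted and are selectively opened up again in
 * vmx_update_msr_bitmap_x2apic().
 */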
3929static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
3930{
3931	unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
3932	unsigned long read_intercept;
3933	int msr;
3934
3935	read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
3936
3937	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
3938		unsigned int read_idx = msr / BITS_PER_LONG;
3939		unsigned int write_idx = read_idx + (0x800 / sizeof(long));
3940
3941		msr_bitmap[read_idx] = read_intercept;
3942		msr_bitmap[write_idx] = ~0ul;
3943	}
3944}
3945
3946static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
3947{
3948	if (!cpu_has_vmx_msr_bitmap())
3949		return;
3950
3951	vmx_reset_x2apic_msrs(vcpu, mode);
3952
3953	/*
3954	 * TPR reads and writes can be virtualized even if virtual interrupt
3955	 * delivery is not in use.
3956	 */
3957	vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
3958				  !(mode & MSR_BITMAP_MODE_X2APIC));
3959
3960	if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
3961		vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
3962		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
3963		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
3964	}
3965}
3966
3967void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
3968{
3969	struct vcpu_vmx *vmx = to_vmx(vcpu);
3970	u8 mode = vmx_msr_bitmap_mode(vcpu);
3971	u8 changed = mode ^ vmx->msr_bitmap_mode;
3972
3973	if (!changed)
3974		return;
3975
3976	if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
3977		vmx_update_msr_bitmap_x2apic(vcpu, mode);
3978
3979	vmx->msr_bitmap_mode = mode;
3980}
3981
3982void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
3983{
3984	struct vcpu_vmx *vmx = to_vmx(vcpu);
3985	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
3986	u32 i;
3987
3988	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
3989	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
3990	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
3991	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
3992	for (i = 0; i < vmx->pt_desc.addr_range; i++) {
3993		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
3994		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
3995	}
3996}
3997
3998static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
3999{
4000	struct vcpu_vmx *vmx = to_vmx(vcpu);
4001	void *vapic_page;
4002	u32 vppr;
4003	int rvi;
4004
4005	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4006		!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4007		WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
4008		return false;
4009
4010	rvi = vmx_get_rvi();
4011
4012	vapic_page = vmx->nested.virtual_apic_map.hva;
4013	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
4014
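	/*
	 * An interrupt is deliverable to L2 only if the priority class of
	 * the highest requested vector (RVI, bits 7:4) exceeds that of the
	 * virtual processor priority (VPPR).
	 */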
4015	return ((rvi & 0xf0) > (vppr & 0xf0));
4016}
4017
4018static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4019{
4020	struct vcpu_vmx *vmx = to_vmx(vcpu);
4021	u32 i;
4022
4023	/*
4024	 * Set intercept permissions for all potentially passed through MSRs
4025	 * again. They will automatically get filtered through the MSR filter,
4026	 * so we are back in sync after this.
4027	 */
4028	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
4029		u32 msr = vmx_possible_passthrough_msrs[i];
4030		bool read = test_bit(i, vmx->shadow_msr_intercept.read);
4031		bool write = test_bit(i, vmx->shadow_msr_intercept.write);
4032
4033		vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
4034		vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
4035	}
4036
4037	pt_update_intercept_for_msr(vcpu);
4038	vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
4039}
4040
4041static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
4042						     bool nested)
4043{
4044#ifdef CONFIG_SMP
4045	int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
4046
4047	if (vcpu->mode == IN_GUEST_MODE) {
4048		/*
4049		 * The vector of the interrupt to be delivered to the vcpu
4050		 * has already been set in the PIR before this function is
4051		 * called.
4052		 *
4053		 * The following cases can be reached in this block, and a
4054		 * notification event is always sent, as explained below.
4055		 *
4056		 * Case 1: the vcpu stays in non-root mode.  Sending a
4057		 * notification event posts the interrupt to the vcpu.
4058		 *
4059		 * Case 2: the vcpu exits to root mode and is still
4060		 * runnable.  The PIR will be synced to the vIRR before the
4061		 * next vcpu entry.  Sending a notification event here has
4062		 * no effect, as the vcpu is no longer in non-root mode.
4063		 *
4064		 * Case 3: the vcpu exits to root mode and is blocked.
4065		 * vcpu_block() has already synced the PIR to the vIRR and
4066		 * never blocks the vcpu if the vIRR is not clear.  A
4067		 * blocked vcpu here is therefore not waiting for any
4068		 * interrupt requested in the PIR, and sending a
4069		 * notification event, which has no effect in that case,
4070		 * is safe here.
4071		 */
4072
4073		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
4074		return true;
4075	}
4076#endif
4077	return false;
4078}
4079
4080static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4081						int vector)
4082{
4083	struct vcpu_vmx *vmx = to_vmx(vcpu);
4084
4085	if (is_guest_mode(vcpu) &&
4086	    vector == vmx->nested.posted_intr_nv) {
4087		/*
4088		 * If a posted intr is not recognized by hardware,
4089		 * we will accomplish it in the next vmentry.
4090		 */
4091		vmx->nested.pi_pending = true;
4092		kvm_make_request(KVM_REQ_EVENT, vcpu);
4093		/* the PIR and ON have been set by L1. */
4094		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
4095			kvm_vcpu_kick(vcpu);
4096		return 0;
4097	}
4098	return -1;
4099}
4100/*
4101 * Send an interrupt to the vcpu via the posted-interrupt mechanism:
4102 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
4103 * notification and the hardware will sync the PIR to the vIRR atomically.
4104 * 2. If the target vcpu isn't running (root mode), kick it so that it picks
4105 * up the interrupt from the PIR on the next VM-entry.
4106 */
4107static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4108{
4109	struct vcpu_vmx *vmx = to_vmx(vcpu);
4110	int r;
4111
4112	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4113	if (!r)
4114		return 0;
4115
4116	if (!vcpu->arch.apicv_active)
4117		return -1;
4118
4119	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4120		return 0;
4121
4122	/* If a previous notification has sent the IPI, nothing to do.  */
4123	if (pi_test_and_set_on(&vmx->pi_desc))
4124		return 0;
4125
4126	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
4127		kvm_vcpu_kick(vcpu);
4128
4129	return 0;
4130}
4131
4132/*
4133 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4134 * will not change in the lifetime of the guest.
4135 * Note that host-state that does change is set elsewhere. E.g., host-state
4136 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4137 */
4138void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4139{
4140	u32 low32, high32;
4141	unsigned long tmpl;
4142	unsigned long cr0, cr3, cr4;
4143
4144	cr0 = read_cr0();
4145	WARN_ON(cr0 & X86_CR0_TS);
4146	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
4147
4148	/*
4149	 * Save the most likely value for this task's CR3 in the VMCS.
4150	 * We can't use __get_current_cr3_fast() because we're not atomic.
4151	 */
4152	cr3 = __read_cr3();
4153	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
4154	vmx->loaded_vmcs->host_state.cr3 = cr3;
4155
4156	/* Save the most likely value for this task's CR4 in the VMCS. */
4157	cr4 = cr4_read_shadow();
4158	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
4159	vmx->loaded_vmcs->host_state.cr4 = cr4;
4160
4161	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
4162#ifdef CONFIG_X86_64
4163	/*
4164	 * Load null selectors, so we can avoid reloading them in
4165	 * vmx_prepare_switch_to_host(), in case userspace uses
4166	 * the null selectors too (the expected case).
4167	 */
4168	vmcs_write16(HOST_DS_SELECTOR, 0);
4169	vmcs_write16(HOST_ES_SELECTOR, 0);
4170#else
4171	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4172	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4173#endif
4174	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4175	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
4176
4177	vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
4178
4179	vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4180
4181	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4182	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4183	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4184	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
4185
4186	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4187		rdmsr(MSR_IA32_CR_PAT, low32, high32);
4188		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4189	}
4190
4191	if (cpu_has_load_ia32_efer())
4192		vmcs_write64(HOST_IA32_EFER, host_efer);
4193}
4194
4195void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4196{
4197	struct kvm_vcpu *vcpu = &vmx->vcpu;
4198
4199	vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4200					  ~vcpu->arch.cr4_guest_rsvd_bits;
4201	if (!enable_ept)
4202		vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
4203	if (is_guest_mode(&vmx->vcpu))
4204		vcpu->arch.cr4_guest_owned_bits &=
4205			~get_vmcs12(vcpu)->cr4_guest_host_mask;
4206	vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4207}
4208
4209u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4210{
4211	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4212
4213	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4214		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4215
4216	if (!enable_vnmi)
4217		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4218
4219	if (!enable_preemption_timer)
4220		pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4221
4222	return pin_based_exec_ctrl;
4223}
4224
4225static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4226{
4227	struct vcpu_vmx *vmx = to_vmx(vcpu);
4228
4229	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4230	if (cpu_has_secondary_exec_ctrls()) {
4231		if (kvm_vcpu_apicv_active(vcpu))
4232			secondary_exec_controls_setbit(vmx,
4233				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
4234				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4235		else
4236			secondary_exec_controls_clearbit(vmx,
4237					SECONDARY_EXEC_APIC_REGISTER_VIRT |
4238					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4239	}
4240
4241	if (cpu_has_vmx_msr_bitmap())
4242		vmx_update_msr_bitmap(vcpu);
4243}
4244
4245u32 vmx_exec_control(struct vcpu_vmx *vmx)
4246{
4247	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4248
4249	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4250		exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4251
4252	if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
4253		exec_control &= ~CPU_BASED_TPR_SHADOW;
4254#ifdef CONFIG_X86_64
4255		exec_control |= CPU_BASED_CR8_STORE_EXITING |
4256				CPU_BASED_CR8_LOAD_EXITING;
4257#endif
4258	}
4259	if (!enable_ept)
4260		exec_control |= CPU_BASED_CR3_STORE_EXITING |
4261				CPU_BASED_CR3_LOAD_EXITING  |
4262				CPU_BASED_INVLPG_EXITING;
4263	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4264		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4265				CPU_BASED_MONITOR_EXITING);
4266	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4267		exec_control &= ~CPU_BASED_HLT_EXITING;
4268	return exec_control;
4269}
4270
4271/*
4272 * Adjust a single secondary execution control bit to intercept/allow an
4273 * instruction in the guest.  This is usually done based on whether or not a
4274 * feature has been exposed to the guest in order to correctly emulate faults.
4275 */
4276static inline void
4277vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4278				  u32 control, bool enabled, bool exiting)
4279{
4280	/*
4281	 * If the control is for an opt-in feature, clear the control if the
4282	 * feature is not exposed to the guest, i.e. not enabled.  If the
4283	 * control is opt-out, i.e. an exiting control, clear the control if
4284	 * the feature _is_ exposed to the guest, i.e. exiting/interception is
4285	 * disabled for the associated instruction.  Note, the caller is
4286	 * responsible for presetting exec_control to set all supported bits.
4287	 */
4288	if (enabled == exiting)
4289		*exec_control &= ~control;
4290
4291	/*
4292	 * Update the nested MSR settings so that a nested VMM can/can't set
4293	 * controls for features that are/aren't exposed to the guest.
4294	 */
4295	if (nested) {
4296		if (enabled)
4297			vmx->nested.msrs.secondary_ctls_high |= control;
4298		else
4299			vmx->nested.msrs.secondary_ctls_high &= ~control;
4300	}
4301}
4302
4303/*
4304 * Wrapper macro for the common case of adjusting a secondary execution control
4305 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
4306 * verifies that the control is actually supported by KVM and hardware.
4307 */
4308#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4309({									 \
4310	bool __enabled;							 \
4311									 \
4312	if (cpu_has_vmx_##name()) {					 \
4313		__enabled = guest_cpuid_has(&(vmx)->vcpu,		 \
4314					    X86_FEATURE_##feat_name);	 \
4315		vmx_adjust_secondary_exec_control(vmx, exec_control,	 \
4316			SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
4317	}								 \
4318})
4319
4320/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4321#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4322	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4323
4324#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4325	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
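/*
 * For example, vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP)
 * expands to a cpu_has_vmx_rdtscp() check, a guest_cpuid_has(vcpu,
 * X86_FEATURE_RDTSCP) lookup and an adjustment of SECONDARY_EXEC_ENABLE_RDTSCP,
 * an opt-in control (exiting == false).
 */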
4326
4327static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
4328{
4329	struct kvm_vcpu *vcpu = &vmx->vcpu;
4330
4331	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4332
4333	if (vmx_pt_mode_is_system())
4334		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4335	if (!cpu_need_virtualize_apic_accesses(vcpu))
4336		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4337	if (vmx->vpid == 0)
4338		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4339	if (!enable_ept) {
4340		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4341		enable_unrestricted_guest = 0;
4342	}
4343	if (!enable_unrestricted_guest)
4344		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4345	if (kvm_pause_in_guest(vmx->vcpu.kvm))
4346		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4347	if (!kvm_vcpu_apicv_active(vcpu))
4348		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4349				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4350	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4351
4352	/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4353	 * in vmx_set_cr4.  */
4354	exec_control &= ~SECONDARY_EXEC_DESC;
4355
4356	/*
4357	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4358	 * (handle_vmptrld).  Shadow VMCS cannot be enabled here because
4359	 * there is no current VMCS12 yet.
4360	 */
4361	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4362
4363	if (!enable_pml)
4364		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4365
4366	if (cpu_has_vmx_xsaves()) {
4367		/* Exposing XSAVES only when XSAVE is exposed */
4368		bool xsaves_enabled =
4369			boot_cpu_has(X86_FEATURE_XSAVE) &&
4370			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4371			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4372
4373		vcpu->arch.xsaves_enabled = xsaves_enabled;
4374
4375		vmx_adjust_secondary_exec_control(vmx, &exec_control,
4376						  SECONDARY_EXEC_XSAVES,
4377						  xsaves_enabled, false);
4378	}
4379
4380	vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
4381
4382	/*
4383	 * Expose INVPCID if and only if PCID is also exposed to the guest.
4384	 * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
4385	 * if CR4.PCIDE=0.  Enumerating CPUID.INVPCID=1 would lead to incorrect
4386	 * behavior from the guest perspective (it would expect #GP or #PF).
4387	 */
4388	if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
4389		guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
4390	vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4391
4392
4393	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4394	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4395
4396	vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4397				    ENABLE_USR_WAIT_PAUSE, false);
4398
4399	vmx->secondary_exec_control = exec_control;
4400}
4401
4402static void ept_set_mmio_spte_mask(void)
4403{
4404	/*
4405	 * EPT Misconfigurations can be generated if the value of bits 2:0
4406	 * of an EPT paging-structure entry is 110b (write/execute).
4407	 */
4408	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
4409}
4410
4411#define VMX_XSS_EXIT_BITMAP 0
4412
4413/*
4414 * Note that the initialization of the guest-state area of the VMCS is
4415 * done in vmx_vcpu_reset().
4416 */
4417static void init_vmcs(struct vcpu_vmx *vmx)
4418{
4419	if (nested)
4420		nested_vmx_set_vmcs_shadowing_bitmap();
4421
4422	if (cpu_has_vmx_msr_bitmap())
4423		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4424
4425	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4426
4427	/* Control */
4428	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4429
4430	exec_controls_set(vmx, vmx_exec_control(vmx));
4431
4432	if (cpu_has_secondary_exec_ctrls()) {
4433		vmx_compute_secondary_exec_control(vmx);
4434		secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
4435	}
4436
4437	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
4438		vmcs_write64(EOI_EXIT_BITMAP0, 0);
4439		vmcs_write64(EOI_EXIT_BITMAP1, 0);
4440		vmcs_write64(EOI_EXIT_BITMAP2, 0);
4441		vmcs_write64(EOI_EXIT_BITMAP3, 0);
4442
4443		vmcs_write16(GUEST_INTR_STATUS, 0);
4444
4445		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4446		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4447	}
4448
4449	if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
4450		vmcs_write32(PLE_GAP, ple_gap);
4451		vmx->ple_window = ple_window;
4452		vmx->ple_window_dirty = true;
4453	}
4454
4455	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4456	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4457	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
4458
4459	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
4460	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
4461	vmx_set_constant_host_state(vmx);
4462	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4463	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4464
4465	if (cpu_has_vmx_vmfunc())
4466		vmcs_write64(VM_FUNCTION_CONTROL, 0);
4467
4468	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4469	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4470	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4471	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4472	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4473
4474	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4475		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
4476
4477	vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
4478
4479	/* 22.2.1, 20.8.1 */
4480	vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
4481
4482	vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4483	vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
4484
4485	set_cr4_guest_host_mask(vmx);
4486
4487	if (vmx->vpid != 0)
4488		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4489
4490	if (cpu_has_vmx_xsaves())
4491		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4492
4493	if (enable_pml) {
4494		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4495		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4496	}
4497
4498	if (cpu_has_vmx_encls_vmexit())
4499		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
4500
4501	if (vmx_pt_mode_is_host_guest()) {
4502		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4503		/* Bits 6:0 are forced to 1; writes to them are ignored. */
4504		vmx->pt_desc.guest.output_mask = 0x7F;
4505		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4506	}
4507}
4508
4509static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4510{
4511	struct vcpu_vmx *vmx = to_vmx(vcpu);
4512	struct msr_data apic_base_msr;
4513	u64 cr0;
4514
4515	vmx->rmode.vm86_active = 0;
4516	vmx->spec_ctrl = 0;
4517
4518	vmx->msr_ia32_umwait_control = 0;
4519
4520	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4521	vmx->hv_deadline_tsc = -1;
4522	kvm_set_cr8(vcpu, 0);
4523
4524	if (!init_event) {
4525		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
4526				     MSR_IA32_APICBASE_ENABLE;
4527		if (kvm_vcpu_is_reset_bsp(vcpu))
4528			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4529		apic_base_msr.host_initiated = true;
4530		kvm_set_apic_base(vcpu, &apic_base_msr);
4531	}
4532
4533	vmx_segment_cache_clear(vmx);
4534
4535	seg_setup(VCPU_SREG_CS);
4536	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4537	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
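	/*
	 * Together with RIP = 0xfff0 (written below), CS.base = 0xffff0000
	 * yields the architectural reset vector 0xfffffff0.
	 */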
4538
4539	seg_setup(VCPU_SREG_DS);
4540	seg_setup(VCPU_SREG_ES);
4541	seg_setup(VCPU_SREG_FS);
4542	seg_setup(VCPU_SREG_GS);
4543	seg_setup(VCPU_SREG_SS);
4544
4545	vmcs_write16(GUEST_TR_SELECTOR, 0);
4546	vmcs_writel(GUEST_TR_BASE, 0);
4547	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4548	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4549
4550	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4551	vmcs_writel(GUEST_LDTR_BASE, 0);
4552	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4553	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4554
4555	if (!init_event) {
4556		vmcs_write32(GUEST_SYSENTER_CS, 0);
4557		vmcs_writel(GUEST_SYSENTER_ESP, 0);
4558		vmcs_writel(GUEST_SYSENTER_EIP, 0);
4559		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4560	}
4561
4562	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
4563	kvm_rip_write(vcpu, 0xfff0);
4564
4565	vmcs_writel(GUEST_GDTR_BASE, 0);
4566	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4567
4568	vmcs_writel(GUEST_IDTR_BASE, 0);
4569	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4570
4571	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4572	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4573	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4574	if (kvm_mpx_supported())
4575		vmcs_write64(GUEST_BNDCFGS, 0);
4576
4577	setup_msrs(vmx);
4578
4579	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
4580
4581	if (cpu_has_vmx_tpr_shadow() && !init_event) {
4582		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4583		if (cpu_need_tpr_shadow(vcpu))
4584			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4585				     __pa(vcpu->arch.apic->regs));
4586		vmcs_write32(TPR_THRESHOLD, 0);
4587	}
4588
4589	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4590
4591	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4592	vmx->vcpu.arch.cr0 = cr0;
4593	vmx_set_cr0(vcpu, cr0); /* enter rmode */
4594	vmx_set_cr4(vcpu, 0);
4595	vmx_set_efer(vcpu, 0);
4596
4597	update_exception_bitmap(vcpu);
4598
4599	vpid_sync_context(vmx->vpid);
4600	if (init_event)
4601		vmx_clear_hlt(vcpu);
4602
4603	vmx_update_fb_clear_dis(vcpu, vmx);
4604}
4605
4606static void enable_irq_window(struct kvm_vcpu *vcpu)
4607{
4608	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4609}
4610
4611static void enable_nmi_window(struct kvm_vcpu *vcpu)
4612{
4613	if (!enable_vnmi ||
4614	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4615		enable_irq_window(vcpu);
4616		return;
4617	}
4618
4619	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4620}
4621
4622static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4623{
4624	struct vcpu_vmx *vmx = to_vmx(vcpu);
4625	uint32_t intr;
4626	int irq = vcpu->arch.interrupt.nr;
4627
4628	trace_kvm_inj_virq(irq);
4629
4630	++vcpu->stat.irq_injections;
4631	if (vmx->rmode.vm86_active) {
4632		int inc_eip = 0;
4633		if (vcpu->arch.interrupt.soft)
4634			inc_eip = vcpu->arch.event_exit_inst_len;
4635		kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4636		return;
4637	}
4638	intr = irq | INTR_INFO_VALID_MASK;
4639	if (vcpu->arch.interrupt.soft) {
4640		intr |= INTR_TYPE_SOFT_INTR;
4641		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4642			     vmx->vcpu.arch.event_exit_inst_len);
4643	} else
4644		intr |= INTR_TYPE_EXT_INTR;
4645	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4646
4647	vmx_clear_hlt(vcpu);
4648}
4649
4650static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4651{
4652	struct vcpu_vmx *vmx = to_vmx(vcpu);
4653
4654	if (!enable_vnmi) {
4655		/*
4656		 * Tracking the NMI-blocked state in software is built upon
4657		 * finding the next open IRQ window. This, in turn, depends on
4658		 * well-behaving guests: They have to keep IRQs disabled at
4659		 * least as long as the NMI handler runs. Otherwise we may
4660		 * cause NMI nesting, maybe breaking the guest. But as this is
4661		 * highly unlikely, we can live with the residual risk.
4662		 */
4663		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4664		vmx->loaded_vmcs->vnmi_blocked_time = 0;
4665	}
4666
4667	++vcpu->stat.nmi_injections;
4668	vmx->loaded_vmcs->nmi_known_unmasked = false;
4669
4670	if (vmx->rmode.vm86_active) {
4671		kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
4672		return;
4673	}
4674
4675	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4676			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4677
4678	vmx_clear_hlt(vcpu);
4679}
4680
4681bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4682{
4683	struct vcpu_vmx *vmx = to_vmx(vcpu);
4684	bool masked;
4685
4686	if (!enable_vnmi)
4687		return vmx->loaded_vmcs->soft_vnmi_blocked;
4688	if (vmx->loaded_vmcs->nmi_known_unmasked)
4689		return false;
4690	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4691	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4692	return masked;
4693}
4694
4695void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4696{
4697	struct vcpu_vmx *vmx = to_vmx(vcpu);
4698
4699	if (!enable_vnmi) {
4700		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
4701			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
4702			vmx->loaded_vmcs->vnmi_blocked_time = 0;
4703		}
4704	} else {
4705		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4706		if (masked)
4707			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4708				      GUEST_INTR_STATE_NMI);
4709		else
4710			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4711					GUEST_INTR_STATE_NMI);
4712	}
4713}
4714
4715bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
4716{
4717	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
4718		return false;
4719
4720	if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
4721		return true;
4722
4723	return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4724		(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
4725		 GUEST_INTR_STATE_NMI));
4726}
4727
4728static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4729{
4730	if (to_vmx(vcpu)->nested.nested_run_pending)
4731		return -EBUSY;
4732
4733	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
4734	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
4735		return -EBUSY;
4736
4737	return !vmx_nmi_blocked(vcpu);
4738}
4739
4740bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
4741{
4742	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4743		return false;
4744
4745	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
4746	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4747		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4748}
4749
4750static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4751{
4752	if (to_vmx(vcpu)->nested.nested_run_pending)
4753		return -EBUSY;
4754
4755	/*
4756	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
4757	 * e.g. if the IRQ arrived asynchronously after checking nested events.
4758	 */
4759	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4760		return -EBUSY;
4761
4762	return !vmx_interrupt_blocked(vcpu);
4763}
4764
4765static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4766{
4767	int ret;
4768
4769	if (enable_unrestricted_guest)
4770		return 0;
4771
4772	mutex_lock(&kvm->slots_lock);
4773	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
4774				      PAGE_SIZE * 3);
4775	mutex_unlock(&kvm->slots_lock);
4776
4777	if (ret)
4778		return ret;
4779	to_kvm_vmx(kvm)->tss_addr = addr;
4780	return init_rmode_tss(kvm);
4781}
4782
4783static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
4784{
4785	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
4786	return 0;
4787}
4788
4789static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4790{
4791	switch (vec) {
4792	case BP_VECTOR:
4793		/*
4794		 * Update instruction length as we may reinject the exception
4795		 * from user space while in guest debugging mode.
4796		 */
4797		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4798			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4799		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4800			return false;
4801		fallthrough;
4802	case DB_VECTOR:
4803		return !(vcpu->guest_debug &
4804			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
4805	case DE_VECTOR:
4806	case OF_VECTOR:
4807	case BR_VECTOR:
4808	case UD_VECTOR:
4809	case DF_VECTOR:
4810	case SS_VECTOR:
4811	case GP_VECTOR:
4812	case MF_VECTOR:
4813		return true;
4814	}
4815	return false;
4816}
4817
4818static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4819				  int vec, u32 err_code)
4820{
4821	/*
4822	 * An instruction with the address-size override prefix (opcode 0x67)
4823	 * causes a #SS fault with a zero error code in VM86 mode.
4824	 */
4825	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4826		if (kvm_emulate_instruction(vcpu, 0)) {
4827			if (vcpu->arch.halt_request) {
4828				vcpu->arch.halt_request = 0;
4829				return kvm_vcpu_halt(vcpu);
4830			}
4831			return 1;
4832		}
4833		return 0;
4834	}
4835
4836	/*
4837	 * Forward all other exceptions that are valid in real mode.
4838	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4839	 *        the required debugging infrastructure rework.
4840	 */
4841	kvm_queue_exception(vcpu, vec);
4842	return 1;
4843}
4844
4845/*
4846 * Trigger machine check on the host. We assume all the MSRs are already set up
4847 * by the CPU and that we still run on the same CPU as the MCE occurred on.
4848 * We pass a fake environment to the machine check handler because we want
4849 * the guest to always be treated like user space, no matter what context
4850 * it used internally.
4851 */
4852static void kvm_machine_check(void)
4853{
4854#if defined(CONFIG_X86_MCE)
4855	struct pt_regs regs = {
4856		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
4857		.flags = X86_EFLAGS_IF,
4858	};
4859
4860	do_machine_check(&regs);
4861#endif
4862}
4863
4864static int handle_machine_check(struct kvm_vcpu *vcpu)
4865{
4866	/* handled by vmx_vcpu_run() */
4867	return 1;
4868}
4869
4870/*
4871 * If the host has split lock detection disabled, then #AC is
4872 * unconditionally injected into the guest, which is the behaviour from
4873 * before split lock detection existed.
4874 *
4875 * If the host has split lock detection enabled, then #AC is
4876 * only injected into the guest when:
4877 *  - Guest CPL == 3 (user mode)
4878 *  - Guest has #AC detection enabled in CR0
4879 *  - Guest EFLAGS has AC bit set
4880 */
4881bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
4882{
4883	if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
4884		return true;
4885
4886	return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
4887	       (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
4888}
4889
4890static int handle_exception_nmi(struct kvm_vcpu *vcpu)
4891{
4892	struct vcpu_vmx *vmx = to_vmx(vcpu);
4893	struct kvm_run *kvm_run = vcpu->run;
4894	u32 intr_info, ex_no, error_code;
4895	unsigned long cr2, rip, dr6;
4896	u32 vect_info;
4897
4898	vect_info = vmx->idt_vectoring_info;
4899	intr_info = vmx_get_intr_info(vcpu);
4900
4901	if (is_machine_check(intr_info) || is_nmi(intr_info))
4902		return 1; /* handled by handle_exception_nmi_irqoff() */
4903
4904	if (is_invalid_opcode(intr_info))
4905		return handle_ud(vcpu);
4906
4907	error_code = 0;
4908	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4909		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4910
4911	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
4912		WARN_ON_ONCE(!enable_vmware_backdoor);
4913
4914		/*
4915		 * VMware backdoor emulation on #GP interception only handles
4916		 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
4917		 * error code on #GP.
4918		 */
4919		if (error_code) {
4920			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
4921			return 1;
4922		}
4923		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
4924	}
4925
4926	/*
4927	 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing MMIO;
4928	 * in that case it is better to report an internal error.
4929	 * See the comments in vmx_handle_exit.
4930	 */
4931	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4932	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4933		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4934		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4935		vcpu->run->internal.ndata = 4;
4936		vcpu->run->internal.data[0] = vect_info;
4937		vcpu->run->internal.data[1] = intr_info;
4938		vcpu->run->internal.data[2] = error_code;
4939		vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
4940		return 0;
4941	}
4942
4943	if (is_page_fault(intr_info)) {
4944		cr2 = vmx_get_exit_qual(vcpu);
4945		if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
4946			/*
4947			 * With EPT enabled, a #PF VM-exit occurs only when
4948			 * KVM needs to detect illegal GPAs.
4949			 */
4950			WARN_ON_ONCE(!allow_smaller_maxphyaddr);
4951			kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
4952			return 1;
4953		} else
4954			return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
4955	}
4956
4957	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4958
4959	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4960		return handle_rmode_exception(vcpu, ex_no, error_code);
4961
4962	switch (ex_no) {
4963	case DB_VECTOR:
4964		dr6 = vmx_get_exit_qual(vcpu);
4965		if (!(vcpu->guest_debug &
4966		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
4967			/*
4968			 * If the #DB was due to ICEBP, a.k.a. INT1, skip the
4969			 * instruction.  ICEBP generates a trap-like #DB, but
4970			 * despite its interception control being tied to #DB,
4971			 * is an instruction intercept, i.e. the VM-Exit occurs
4972			 * on the ICEBP itself.  Note, skipping ICEBP also
4973			 * clears STI and MOVSS blocking.
4974			 *
4975			 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
4976			 * if single-step is enabled in RFLAGS and STI or MOVSS
4977			 * blocking is active, as the CPU doesn't set the bit
4978			 * on VM-Exit due to #DB interception.  VM-Entry has a
4979			 * consistency check that a single-step #DB is pending
4980			 * in this scenario as the previous instruction cannot
4981			 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
4982			 * don't modify RFLAGS), therefore the one instruction
4983			 * delay when activating single-step breakpoints must
4984			 * have already expired.  Note, the CPU sets/clears BS
4985			 * as appropriate for all other VM-Exits types.
4986			 */
4987			if (is_icebp(intr_info))
4988				WARN_ON(!skip_emulated_instruction(vcpu));
4989			else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
4990				 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4991				  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
4992				vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
4993					    vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
4994
4995			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
4996			return 1;
4997		}
4998		kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
4999		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5000		fallthrough;
5001	case BP_VECTOR:
5002		/*
5003		 * Update instruction length as we may reinject #BP from
5004		 * user space while in guest debugging mode. Reading it for
5005		 * #DB as well causes no harm, it is not used in that case.
5006		 */
5007		vmx->vcpu.arch.event_exit_inst_len =
5008			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5009		kvm_run->exit_reason = KVM_EXIT_DEBUG;
5010		rip = kvm_rip_read(vcpu);
5011		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
5012		kvm_run->debug.arch.exception = ex_no;
5013		break;
5014	case AC_VECTOR:
5015		if (vmx_guest_inject_ac(vcpu)) {
5016			kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5017			return 1;
5018		}
5019
5020		/*
5021		 * Handle split lock. Depending on detection mode this will
5022		 * either warn and disable split lock detection for this
5023		 * task or force SIGBUS on it.
5024		 */
5025		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
5026			return 1;
5027		fallthrough;
5028	default:
5029		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5030		kvm_run->ex.exception = ex_no;
5031		kvm_run->ex.error_code = error_code;
5032		break;
5033	}
5034	return 0;
5035}
5036
5037static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5038{
5039	++vcpu->stat.irq_exits;
5040	return 1;
5041}
5042
5043static int handle_triple_fault(struct kvm_vcpu *vcpu)
5044{
5045	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5046	vcpu->mmio_needed = 0;
5047	return 0;
5048}
5049
5050static int handle_io(struct kvm_vcpu *vcpu)
5051{
5052	unsigned long exit_qualification;
5053	int size, in, string;
5054	unsigned port;
5055
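	/*
	 * Decode the I/O-instruction exit qualification: bits 2:0 hold the
	 * access size minus one, bit 3 the direction (1 = IN), bit 4 whether
	 * this is a string instruction, and bits 31:16 the port number.
	 */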
5056	exit_qualification = vmx_get_exit_qual(vcpu);
5057	string = (exit_qualification & 16) != 0;
5058
5059	++vcpu->stat.io_exits;
5060
5061	if (string)
5062		return kvm_emulate_instruction(vcpu, 0);
5063
5064	port = exit_qualification >> 16;
5065	size = (exit_qualification & 7) + 1;
5066	in = (exit_qualification & 8) != 0;
5067
5068	return kvm_fast_pio(vcpu, size, port, in);
5069}
5070
5071static void
5072vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5073{
5074	/*
5075	 * Patch in the VMCALL instruction:
5076	 */
5077	hypercall[0] = 0x0f;
5078	hypercall[1] = 0x01;
5079	hypercall[2] = 0xc1;
5080}
5081
5082/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5083static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5084{
5085	if (is_guest_mode(vcpu)) {
5086		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5087		unsigned long orig_val = val;
5088
5089		/*
5090		 * We get here when L2 changed cr0 in a way that did not change
5091		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5092		 * but did change L0 shadowed bits. So we first calculate the
5093		 * effective cr0 value that L1 would like to write into the
5094		 * hardware. It consists of the L2-owned bits from the new
5095		 * value combined with the L1-owned bits from L1's guest_cr0.
5096		 */
5097		val = (val & ~vmcs12->cr0_guest_host_mask) |
5098			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5099
5100		if (!nested_guest_cr0_valid(vcpu, val))
5101			return 1;
5102
5103		if (kvm_set_cr0(vcpu, val))
5104			return 1;
5105		vmcs_writel(CR0_READ_SHADOW, orig_val);
5106		return 0;
5107	} else {
5108		if (to_vmx(vcpu)->nested.vmxon &&
5109		    !nested_host_cr0_valid(vcpu, val))
5110			return 1;
5111
5112		return kvm_set_cr0(vcpu, val);
5113	}
5114}
5115
5116static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5117{
5118	if (is_guest_mode(vcpu)) {
5119		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5120		unsigned long orig_val = val;
5121
5122		/* analogously to handle_set_cr0 */
5123		val = (val & ~vmcs12->cr4_guest_host_mask) |
5124			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5125		if (kvm_set_cr4(vcpu, val))
5126			return 1;
5127		vmcs_writel(CR4_READ_SHADOW, orig_val);
5128		return 0;
5129	} else
5130		return kvm_set_cr4(vcpu, val);
5131}
5132
5133static int handle_desc(struct kvm_vcpu *vcpu)
5134{
5135	WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
5136	return kvm_emulate_instruction(vcpu, 0);
5137}
5138
5139static int handle_cr(struct kvm_vcpu *vcpu)
5140{
5141	unsigned long exit_qualification, val;
5142	int cr;
5143	int reg;
5144	int err;
5145	int ret;
5146
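	/*
	 * Decode the CR-access exit qualification: bits 3:0 hold the CR
	 * number, bits 5:4 the access type (0 = MOV to CR, 1 = MOV from CR,
	 * 2 = CLTS, 3 = LMSW) and bits 11:8 the GPR involved.
	 */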
5147	exit_qualification = vmx_get_exit_qual(vcpu);
5148	cr = exit_qualification & 15;
5149	reg = (exit_qualification >> 8) & 15;
5150	switch ((exit_qualification >> 4) & 3) {
5151	case 0: /* mov to cr */
5152		val = kvm_register_readl(vcpu, reg);
5153		trace_kvm_cr_write(cr, val);
5154		switch (cr) {
5155		case 0:
5156			err = handle_set_cr0(vcpu, val);
5157			return kvm_complete_insn_gp(vcpu, err);
5158		case 3:
5159			WARN_ON_ONCE(enable_unrestricted_guest);
5160			err = kvm_set_cr3(vcpu, val);
5161			return kvm_complete_insn_gp(vcpu, err);
5162		case 4:
5163			err = handle_set_cr4(vcpu, val);
5164			return kvm_complete_insn_gp(vcpu, err);
5165		case 8: {
5166				u8 cr8_prev = kvm_get_cr8(vcpu);
5167				u8 cr8 = (u8)val;
5168				err = kvm_set_cr8(vcpu, cr8);
5169				ret = kvm_complete_insn_gp(vcpu, err);
5170				if (lapic_in_kernel(vcpu))
5171					return ret;
5172				if (cr8_prev <= cr8)
5173					return ret;
5174				/*
5175				 * TODO: we might be squashing a
5176				 * KVM_GUESTDBG_SINGLESTEP-triggered
5177				 * KVM_EXIT_DEBUG here.
5178				 */
5179				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5180				return 0;
5181			}
5182		}
5183		break;
5184	case 2: /* clts */
5185		WARN_ONCE(1, "Guest should always own CR0.TS");
5186		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5187		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
5188		return kvm_skip_emulated_instruction(vcpu);
5189	case 1: /*mov from cr*/
5190		switch (cr) {
5191		case 3:
5192			WARN_ON_ONCE(enable_unrestricted_guest);
5193			val = kvm_read_cr3(vcpu);
5194			kvm_register_write(vcpu, reg, val);
5195			trace_kvm_cr_read(cr, val);
5196			return kvm_skip_emulated_instruction(vcpu);
5197		case 8:
5198			val = kvm_get_cr8(vcpu);
5199			kvm_register_write(vcpu, reg, val);
5200			trace_kvm_cr_read(cr, val);
5201			return kvm_skip_emulated_instruction(vcpu);
5202		}
5203		break;
5204	case 3: /* lmsw */
5205		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5206		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5207		kvm_lmsw(vcpu, val);
5208
5209		return kvm_skip_emulated_instruction(vcpu);
5210	default:
5211		break;
5212	}
5213	vcpu->run->exit_reason = 0;
5214	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5215	       (int)(exit_qualification >> 4) & 3, cr);
5216	return 0;
5217}
5218
5219static int handle_dr(struct kvm_vcpu *vcpu)
5220{
5221	unsigned long exit_qualification;
5222	int dr, dr7, reg;
5223
5224	exit_qualification = vmx_get_exit_qual(vcpu);
5225	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5226
5227	/* First, if DR does not exist, trigger UD */
5228	if (!kvm_require_dr(vcpu, dr))
5229		return 1;
5230
5231	/* Do not handle if the CPL > 0, will trigger GP on re-entry */
5232	if (!kvm_require_cpl(vcpu, 0))
5233		return 1;
5234	dr7 = vmcs_readl(GUEST_DR7);
5235	if (dr7 & DR7_GD) {
5236		/*
5237		 * As the vm-exit takes precedence over the debug trap, we
5238		 * need to emulate the latter, either for the host or the
5239		 * guest debugging itself.
5240		 */
5241		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5242			vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
5243			vcpu->run->debug.arch.dr7 = dr7;
5244			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5245			vcpu->run->debug.arch.exception = DB_VECTOR;
5246			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5247			return 0;
5248		} else {
5249			kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5250			return 1;
5251		}
5252	}
5253
5254	if (vcpu->guest_debug == 0) {
5255		exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5256
5257		/*
5258		 * No more DR vmexits; force a reload of the debug registers
5259		 * and reenter on this instruction.  The next vmexit will
5260		 * retrieve the full state of the debug registers.
5261		 */
5262		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5263		return 1;
5264	}
5265
5266	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5267	if (exit_qualification & TYPE_MOV_FROM_DR) {
5268		unsigned long val;
5269
5270		if (kvm_get_dr(vcpu, dr, &val))
5271			return 1;
5272		kvm_register_write(vcpu, reg, val);
5273	} else
5274		if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
5275			return 1;
5276
5277	return kvm_skip_emulated_instruction(vcpu);
5278}
5279
5280static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5281{
5282	get_debugreg(vcpu->arch.db[0], 0);
5283	get_debugreg(vcpu->arch.db[1], 1);
5284	get_debugreg(vcpu->arch.db[2], 2);
5285	get_debugreg(vcpu->arch.db[3], 3);
5286	get_debugreg(vcpu->arch.dr6, 6);
5287	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5288
5289	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5290	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5291}
5292
5293static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5294{
5295	vmcs_writel(GUEST_DR7, val);
5296}
5297
5298static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5299{
5300	kvm_apic_update_ppr(vcpu);
5301	return 1;
5302}
5303
5304static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5305{
5306	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5307
5308	kvm_make_request(KVM_REQ_EVENT, vcpu);
5309
5310	++vcpu->stat.irq_window_exits;
5311	return 1;
5312}
5313
5314static int handle_vmcall(struct kvm_vcpu *vcpu)
5315{
5316	return kvm_emulate_hypercall(vcpu);
5317}
5318
5319static int handle_invd(struct kvm_vcpu *vcpu)
5320{
5321	/* Treat an INVD instruction as a NOP and just skip it. */
5322	return kvm_skip_emulated_instruction(vcpu);
5323}
5324
5325static int handle_invlpg(struct kvm_vcpu *vcpu)
5326{
5327	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5328
5329	kvm_mmu_invlpg(vcpu, exit_qualification);
5330	return kvm_skip_emulated_instruction(vcpu);
5331}
5332
5333static int handle_rdpmc(struct kvm_vcpu *vcpu)
5334{
5335	int err;
5336
5337	err = kvm_rdpmc(vcpu);
5338	return kvm_complete_insn_gp(vcpu, err);
5339}
5340
5341static int handle_wbinvd(struct kvm_vcpu *vcpu)
5342{
5343	return kvm_emulate_wbinvd(vcpu);
5344}
5345
5346static int handle_xsetbv(struct kvm_vcpu *vcpu)
5347{
5348	u64 new_bv = kvm_read_edx_eax(vcpu);
5349	u32 index = kvm_rcx_read(vcpu);
5350
5351	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
5352		return kvm_skip_emulated_instruction(vcpu);
5353	return 1;
5354}
5355
5356static int handle_apic_access(struct kvm_vcpu *vcpu)
5357{
5358	if (likely(fasteoi)) {
5359		unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5360		int access_type, offset;
5361
5362		access_type = exit_qualification & APIC_ACCESS_TYPE;
5363		offset = exit_qualification & APIC_ACCESS_OFFSET;
5364		/*
5365		 * A sane guest uses MOV to write the EOI register, and the
5366		 * written value does not matter.  Short-circuit that case
5367		 * here to avoid heavyweight instruction emulation.
5368		 */
5369		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5370		    (offset == APIC_EOI)) {
5371			kvm_lapic_set_eoi(vcpu);
5372			return kvm_skip_emulated_instruction(vcpu);
5373		}
5374	}
5375	return kvm_emulate_instruction(vcpu, 0);
5376}
5377
5378static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5379{
5380	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5381	int vector = exit_qualification & 0xff;
5382
5383	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5384	kvm_apic_set_eoi_accelerated(vcpu, vector);
5385	return 1;
5386}
5387
5388static int handle_apic_write(struct kvm_vcpu *vcpu)
5389{
5390	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5391	u32 offset = exit_qualification & 0xfff;
5392
5393	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
5394	kvm_apic_write_nodecode(vcpu, offset);
5395	return 1;
5396}
5397
5398static int handle_task_switch(struct kvm_vcpu *vcpu)
5399{
5400	struct vcpu_vmx *vmx = to_vmx(vcpu);
5401	unsigned long exit_qualification;
5402	bool has_error_code = false;
5403	u32 error_code = 0;
5404	u16 tss_selector;
5405	int reason, type, idt_v, idt_index;
5406
5407	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5408	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5409	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5410
5411	exit_qualification = vmx_get_exit_qual(vcpu);
5412
5413	reason = (u32)exit_qualification >> 30;
5414	if (reason == TASK_SWITCH_GATE && idt_v) {
5415		switch (type) {
5416		case INTR_TYPE_NMI_INTR:
5417			vcpu->arch.nmi_injected = false;
5418			vmx_set_nmi_mask(vcpu, true);
5419			break;
5420		case INTR_TYPE_EXT_INTR:
5421		case INTR_TYPE_SOFT_INTR:
5422			kvm_clear_interrupt_queue(vcpu);
5423			break;
5424		case INTR_TYPE_HARD_EXCEPTION:
5425			if (vmx->idt_vectoring_info &
5426			    VECTORING_INFO_DELIVER_CODE_MASK) {
5427				has_error_code = true;
5428				error_code =
5429					vmcs_read32(IDT_VECTORING_ERROR_CODE);
5430			}
5431			fallthrough;
5432		case INTR_TYPE_SOFT_EXCEPTION:
5433			kvm_clear_exception_queue(vcpu);
5434			break;
5435		default:
5436			break;
5437		}
5438	}
5439	tss_selector = exit_qualification;
5440
5441	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5442		       type != INTR_TYPE_EXT_INTR &&
5443		       type != INTR_TYPE_NMI_INTR))
5444		WARN_ON(!skip_emulated_instruction(vcpu));
5445
5446	/*
5447	 * TODO: What about debug traps on tss switch?
5448	 *       Are we supposed to inject them and update dr6?
5449	 */
5450	return kvm_task_switch(vcpu, tss_selector,
5451			       type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5452			       reason, has_error_code, error_code);
5453}
5454
5455static int handle_ept_violation(struct kvm_vcpu *vcpu)
5456{
5457	unsigned long exit_qualification;
5458	gpa_t gpa;
5459	u64 error_code;
5460
5461	exit_qualification = vmx_get_exit_qual(vcpu);
5462
5463	/*
5464	 * If the EPT violation happened while executing IRET from NMI, the
5465	 * "blocked by NMI" bit has to be set before the next VM entry.
5466	 * There are errata that may cause this bit to not be set:
5467	 * AAK134, BY25.
5468	 */
5469	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5470			enable_vnmi &&
5471			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
5472		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5473
5474	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5475	trace_kvm_page_fault(gpa, exit_qualification);
5476
5477	/* Is it a read fault? */
5478	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5479		     ? PFERR_USER_MASK : 0;
5480	/* Is it a write fault? */
5481	error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5482		      ? PFERR_WRITE_MASK : 0;
5483	/* Is it a fetch fault? */
5484	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5485		      ? PFERR_FETCH_MASK : 0;
5486	/* Is the EPT page-table entry present? */
5487	error_code |= (exit_qualification &
5488		       (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
5489			EPT_VIOLATION_EXECUTABLE))
5490		      ? PFERR_PRESENT_MASK : 0;
5491
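	/*
	 * Bit 8 of the exit qualification (valid when a guest linear address
	 * was in use) indicates whether the access was to the final
	 * guest-physical translation (PFERR_GUEST_FINAL) or to one of the
	 * guest's own paging-structure entries (PFERR_GUEST_PAGE).
	 */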
5492	error_code |= (exit_qualification & 0x100) != 0 ?
5493	       PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5494
5495	vcpu->arch.exit_qualification = exit_qualification;
5496
5497	/*
5498	 * Check that the GPA doesn't exceed physical memory limits, as that is
5499	 * a guest page fault.  We have to emulate the instruction here, because
5500	 * if the illegal address is that of a paging structure, then the
5501	 * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported, we
5502	 * would also use advanced VM-exit information for EPT violations to
5503	 * reconstruct the page fault error code.
5504	 */
5505	if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
5506		return kvm_emulate_instruction(vcpu, 0);
5507
5508	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5509}
5510
5511static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5512{
5513	gpa_t gpa;
5514
5515	/*
5516	 * A nested guest cannot optimize MMIO vmexits, because we have an
5517	 * nGPA here instead of the required GPA.
5518	 */
5519	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5520	if (!is_guest_mode(vcpu) &&
5521	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5522		trace_kvm_fast_mmio(gpa);
5523		return kvm_skip_emulated_instruction(vcpu);
5524	}
5525
5526	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5527}
5528
5529static int handle_nmi_window(struct kvm_vcpu *vcpu)
5530{
5531	WARN_ON_ONCE(!enable_vnmi);
5532	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5533	++vcpu->stat.nmi_window_exits;
5534	kvm_make_request(KVM_REQ_EVENT, vcpu);
5535
5536	return 1;
5537}
5538
5539static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5540{
5541	struct vcpu_vmx *vmx = to_vmx(vcpu);
5542	bool intr_window_requested;
5543	unsigned count = 130;
5544
5545	intr_window_requested = exec_controls_get(vmx) &
5546				CPU_BASED_INTR_WINDOW_EXITING;
5547
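	/*
	 * Emulate in a bounded loop: bail out early if an interrupt window
	 * opens, an event is pending, emulation fails, or work for the vCPU
	 * thread is pending, and otherwise return after 'count' instructions
	 * so the main run loop regains control periodically.
	 */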
5548	while (vmx->emulation_required && count-- != 0) {
5549		if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5550			return handle_interrupt_window(&vmx->vcpu);
5551
5552		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5553			return 1;
5554
5555		if (!kvm_emulate_instruction(vcpu, 0))
5556			return 0;
5557
5558		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
5559		    vcpu->arch.exception.pending) {
5560			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5561			vcpu->run->internal.suberror =
5562						KVM_INTERNAL_ERROR_EMULATION;
5563			vcpu->run->internal.ndata = 0;
5564			return 0;
5565		}
5566
5567		if (vcpu->arch.halt_request) {
5568			vcpu->arch.halt_request = 0;
5569			return kvm_vcpu_halt(vcpu);
5570		}
5571
5572		/*
5573		 * Note, return 1 and not 0; vcpu_run() will invoke
5574		 * xfer_to_guest_mode(), which will create a proper return
5575		 * code.
5576		 */
5577		if (__xfer_to_guest_mode_work_pending())
5578			return 1;
5579	}
5580
5581	return 1;
5582}
5583
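/*
 * The PLE (PAUSE-loop exiting) window is adapted per vCPU: grow_ple_window()
 * backs it off up to ple_window_max, while shrink_ple_window() brings it back
 * down toward the ple_window default.  The new value is only recorded here;
 * the dirty flag causes it to be written to PLE_WINDOW before the next
 * VM-entry (see vmx_vcpu_run()).
 */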
5584static void grow_ple_window(struct kvm_vcpu *vcpu)
5585{
5586	struct vcpu_vmx *vmx = to_vmx(vcpu);
5587	unsigned int old = vmx->ple_window;
5588
5589	vmx->ple_window = __grow_ple_window(old, ple_window,
5590					    ple_window_grow,
5591					    ple_window_max);
5592
5593	if (vmx->ple_window != old) {
5594		vmx->ple_window_dirty = true;
5595		trace_kvm_ple_window_update(vcpu->vcpu_id,
5596					    vmx->ple_window, old);
5597	}
5598}
5599
5600static void shrink_ple_window(struct kvm_vcpu *vcpu)
5601{
5602	struct vcpu_vmx *vmx = to_vmx(vcpu);
5603	unsigned int old = vmx->ple_window;
5604
5605	vmx->ple_window = __shrink_ple_window(old, ple_window,
5606					      ple_window_shrink,
5607					      ple_window);
5608
5609	if (vmx->ple_window != old) {
5610		vmx->ple_window_dirty = true;
5611		trace_kvm_ple_window_update(vcpu->vcpu_id,
5612					    vmx->ple_window, old);
5613	}
5614}
5615
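/*
 * Tell the common MMU code which bits to use in EPT page-table entries:
 * accessed/dirty bits only when EPT A/D support is enabled, and a zero
 * "present" mask when the CPU supports execute-only EPT entries (an entry may
 * then be executable without being readable).
 */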
5616static void vmx_enable_tdp(void)
5617{
5618	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
5619		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
5620		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
5621		0ull, VMX_EPT_EXECUTABLE_MASK,
5622		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
5623		VMX_EPT_RWX_MASK, 0ull);
5624
5625	ept_set_mmio_spte_mask();
5626}
5627
5628/*
5629 * Indicate a busy-waiting vcpu spinning on a spinlock. We do not enable
5630 * PAUSE exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
5631 */
5632static int handle_pause(struct kvm_vcpu *vcpu)
5633{
5634	if (!kvm_pause_in_guest(vcpu->kvm))
5635		grow_ple_window(vcpu);
5636
5637	/*
5638	 * Intel SDM vol3 ch. 25.1.3 says: The "PAUSE-loop exiting"
5639	 * VM-execution control is ignored if CPL > 0. OTOH, KVM
5640	 * never sets PAUSE_EXITING and only sets PLE if supported,
5641	 * so the vcpu must be at CPL=0 if it gets a PAUSE exit.
5642	 */
5643	kvm_vcpu_on_spin(vcpu, true);
5644	return kvm_skip_emulated_instruction(vcpu);
5645}
5646
5647static int handle_nop(struct kvm_vcpu *vcpu)
5648{
5649	return kvm_skip_emulated_instruction(vcpu);
5650}
5651
5652static int handle_mwait(struct kvm_vcpu *vcpu)
5653{
5654	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
5655	return handle_nop(vcpu);
5656}
5657
5658static int handle_invalid_op(struct kvm_vcpu *vcpu)
5659{
5660	kvm_queue_exception(vcpu, UD_VECTOR);
5661	return 1;
5662}
5663
5664static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5665{
5666	return 1;
5667}
5668
5669static int handle_monitor(struct kvm_vcpu *vcpu)
5670{
5671	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
5672	return handle_nop(vcpu);
5673}
5674
5675static int handle_invpcid(struct kvm_vcpu *vcpu)
5676{
5677	u32 vmx_instruction_info;
5678	unsigned long type;
5679	gva_t gva;
5680	struct {
5681		u64 pcid;
5682		u64 gla;
5683	} operand;
5684
5685	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5686		kvm_queue_exception(vcpu, UD_VECTOR);
5687		return 1;
5688	}
5689
5690	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5691	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5692
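	/*
	 * INVPCID defines four types: 0 = individual-address, 1 =
	 * single-context, 2 = all-context including globals, 3 = all-context
	 * retaining globals; anything else is reserved and must #GP.
	 */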
5693	if (type > 3) {
5694		kvm_inject_gp(vcpu, 0);
5695		return 1;
5696	}
5697
5698	/* According to the Intel instruction reference, the memory operand
5699	 * is read even if it isn't needed (e.g., for type==all)
5700	 */
5701	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5702				vmx_instruction_info, false,
5703				sizeof(operand), &gva))
5704		return 1;
5705
5706	return kvm_handle_invpcid(vcpu, type, gva);
5707}
5708
5709static int handle_pml_full(struct kvm_vcpu *vcpu)
5710{
5711	unsigned long exit_qualification;
5712
5713	trace_kvm_pml_full(vcpu->vcpu_id);
5714
5715	exit_qualification = vmx_get_exit_qual(vcpu);
5716
5717	/*
5718	 * If the PML buffer filled up while executing IRET from NMI, the
5719	 * "blocked by NMI" bit has to be set before the next VM entry.
5720	 */
5721	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5722			enable_vnmi &&
5723			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
5724		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5725				GUEST_INTR_STATE_NMI);
5726
5727	/*
5728	 * The PML buffer was already flushed at the beginning of the VMEXIT.
5729	 * Nothing to do here, and there's no userspace involvement needed for PML.
5730	 */
5731	return 1;
5732}
5733
5734static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
5735{
5736	struct vcpu_vmx *vmx = to_vmx(vcpu);
5737
5738	if (!vmx->req_immediate_exit &&
5739	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
5740		kvm_lapic_expired_hv_timer(vcpu);
5741		return EXIT_FASTPATH_REENTER_GUEST;
5742	}
5743
5744	return EXIT_FASTPATH_NONE;
5745}
5746
5747static int handle_preemption_timer(struct kvm_vcpu *vcpu)
5748{
5749	handle_fastpath_preemption_timer(vcpu);
5750	return 1;
5751}
5752
5753/*
5754 * When nested=0, all VMX instruction VM Exits land here.  The handlers
5755 * are overwritten by nested_vmx_setup() when nested=1.
5756 */
5757static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
5758{
5759	kvm_queue_exception(vcpu, UD_VECTOR);
5760	return 1;
5761}
5762
5763static int handle_encls(struct kvm_vcpu *vcpu)
5764{
5765	/*
5766	 * SGX virtualization is not yet supported.  There is no software
5767	 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
5768	 * to prevent the guest from executing ENCLS.
5769	 */
5770	kvm_queue_exception(vcpu, UD_VECTOR);
5771	return 1;
5772}
5773
5774/*
5775 * The exit handlers return 1 if the exit was handled fully and guest execution
5776 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
5777 * to be done to userspace and return 0.
5778 */
5779static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5780	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
5781	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
5782	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
5783	[EXIT_REASON_NMI_WINDOW]	      = handle_nmi_window,
5784	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
5785	[EXIT_REASON_CR_ACCESS]               = handle_cr,
5786	[EXIT_REASON_DR_ACCESS]               = handle_dr,
5787	[EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
5788	[EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
5789	[EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
5790	[EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
5791	[EXIT_REASON_HLT]                     = kvm_emulate_halt,
5792	[EXIT_REASON_INVD]		      = handle_invd,
5793	[EXIT_REASON_INVLPG]		      = handle_invlpg,
5794	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
5795	[EXIT_REASON_VMCALL]                  = handle_vmcall,
5796	[EXIT_REASON_VMCLEAR]		      = handle_vmx_instruction,
5797	[EXIT_REASON_VMLAUNCH]		      = handle_vmx_instruction,
5798	[EXIT_REASON_VMPTRLD]		      = handle_vmx_instruction,
5799	[EXIT_REASON_VMPTRST]		      = handle_vmx_instruction,
5800	[EXIT_REASON_VMREAD]		      = handle_vmx_instruction,
5801	[EXIT_REASON_VMRESUME]		      = handle_vmx_instruction,
5802	[EXIT_REASON_VMWRITE]		      = handle_vmx_instruction,
5803	[EXIT_REASON_VMOFF]		      = handle_vmx_instruction,
5804	[EXIT_REASON_VMON]		      = handle_vmx_instruction,
5805	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
5806	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
5807	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
5808	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
5809	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
5810	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
5811	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
5812	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
5813	[EXIT_REASON_GDTR_IDTR]		      = handle_desc,
5814	[EXIT_REASON_LDTR_TR]		      = handle_desc,
5815	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
5816	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
5817	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
5818	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
5819	[EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
5820	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
5821	[EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
5822	[EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
5823	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
5824	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
5825	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
5826	[EXIT_REASON_INVPCID]                 = handle_invpcid,
5827	[EXIT_REASON_VMFUNC]		      = handle_vmx_instruction,
5828	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
5829	[EXIT_REASON_ENCLS]		      = handle_encls,
5830};
5831
5832static const int kvm_vmx_max_exit_handlers =
5833	ARRAY_SIZE(kvm_vmx_exit_handlers);
5834
5835static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
5836			      u32 *intr_info, u32 *error_code)
5837{
5838	struct vcpu_vmx *vmx = to_vmx(vcpu);
5839
5840	*info1 = vmx_get_exit_qual(vcpu);
5841	if (!(vmx->exit_reason.failed_vmentry)) {
5842		*info2 = vmx->idt_vectoring_info;
5843		*intr_info = vmx_get_intr_info(vcpu);
5844		if (is_exception_with_error_code(*intr_info))
5845			*error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5846		else
5847			*error_code = 0;
5848	} else {
5849		*info2 = 0;
5850		*intr_info = 0;
5851		*error_code = 0;
5852	}
5853}
5854
5855static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
5856{
5857	if (vmx->pml_pg) {
5858		__free_page(vmx->pml_pg);
5859		vmx->pml_pg = NULL;
5860	}
5861}
5862
5863static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
5864{
5865	struct vcpu_vmx *vmx = to_vmx(vcpu);
5866	u64 *pml_buf;
5867	u16 pml_idx;
5868
5869	pml_idx = vmcs_read16(GUEST_PML_INDEX);
5870
5871	/* Do nothing if PML buffer is empty */
5872	if (pml_idx == (PML_ENTITY_NUM - 1))
5873		return;
5874
5875	/* PML index always points to next available PML buffer entity */
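	/*
	 * The CPU decrements the index as it logs entries, starting from
	 * PML_ENTITY_NUM - 1, so the valid entries are the ones *after* the
	 * current index, wrapping to 0 once the buffer has filled up.
	 */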
5876	if (pml_idx >= PML_ENTITY_NUM)
5877		pml_idx = 0;
5878	else
5879		pml_idx++;
5880
5881	pml_buf = page_address(vmx->pml_pg);
5882	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
5883		u64 gpa;
5884
5885		gpa = pml_buf[pml_idx];
5886		WARN_ON(gpa & (PAGE_SIZE - 1));
5887		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
5888	}
5889
5890	/* reset PML index */
5891	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5892}
5893
5894/*
5895 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
5896 * Called before reporting dirty_bitmap to userspace.
5897 */
5898static void kvm_flush_pml_buffers(struct kvm *kvm)
5899{
5900	int i;
5901	struct kvm_vcpu *vcpu;
5902	/*
5903	 * We only need to kick vcpus out of guest mode here, as the PML buffer
5904	 * is flushed at the beginning of every VMEXIT, so only vcpus currently
5905	 * running in guest mode can have unflushed GPAs in their PML
5906	 * buffers.
5907	 */
5908	kvm_for_each_vcpu(i, vcpu, kvm)
5909		kvm_vcpu_kick(vcpu);
5910}
5911
5912static void vmx_dump_sel(char *name, uint32_t sel)
5913{
5914	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
5915	       name, vmcs_read16(sel),
5916	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
5917	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
5918	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
5919}
5920
5921static void vmx_dump_dtsel(char *name, uint32_t limit)
5922{
5923	pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
5924	       name, vmcs_read32(limit),
5925	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
5926}
5927
5928void dump_vmcs(void)
5929{
5930	u32 vmentry_ctl, vmexit_ctl;
5931	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
5932	unsigned long cr4;
5933
5934	if (!dump_invalid_vmcs) {
5935		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
5936		return;
5937	}
5938
5939	vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
5940	vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
5941	cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5942	pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
5943	cr4 = vmcs_readl(GUEST_CR4);
5944	secondary_exec_control = 0;
5945	if (cpu_has_secondary_exec_ctrls())
5946		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5947
5948	pr_err("*** Guest State ***\n");
5949	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5950	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
5951	       vmcs_readl(CR0_GUEST_HOST_MASK));
5952	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5953	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
5954	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
5955	if (cpu_has_vmx_ept()) {
5956		pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
5957		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
5958		pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
5959		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
5960	}
5961	pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
5962	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
5963	pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
5964	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
5965	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5966	       vmcs_readl(GUEST_SYSENTER_ESP),
5967	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
5968	vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
5969	vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
5970	vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
5971	vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
5972	vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
5973	vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
5974	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
5975	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
5976	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
5977	vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
5978	if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
5979	    (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
5980		pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
5981		       vmcs_read64(GUEST_IA32_EFER),
5982		       vmcs_read64(GUEST_IA32_PAT));
5983	pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
5984	       vmcs_read64(GUEST_IA32_DEBUGCTL),
5985	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
5986	if (cpu_has_load_perf_global_ctrl() &&
5987	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
5988		pr_err("PerfGlobCtl = 0x%016llx\n",
5989		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
5990	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
5991		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
5992	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
5993	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
5994	       vmcs_read32(GUEST_ACTIVITY_STATE));
5995	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
5996		pr_err("InterruptStatus = %04x\n",
5997		       vmcs_read16(GUEST_INTR_STATUS));
5998
5999	pr_err("*** Host State ***\n");
6000	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
6001	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6002	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6003	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6004	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6005	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6006	       vmcs_read16(HOST_TR_SELECTOR));
6007	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6008	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6009	       vmcs_readl(HOST_TR_BASE));
6010	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6011	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6012	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6013	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6014	       vmcs_readl(HOST_CR4));
6015	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6016	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
6017	       vmcs_read32(HOST_IA32_SYSENTER_CS),
6018	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
6019	if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
6020		pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
6021		       vmcs_read64(HOST_IA32_EFER),
6022		       vmcs_read64(HOST_IA32_PAT));
6023	if (cpu_has_load_perf_global_ctrl() &&
6024	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6025		pr_err("PerfGlobCtl = 0x%016llx\n",
6026		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6027
6028	pr_err("*** Control State ***\n");
6029	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
6030	       pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
6031	pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
6032	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6033	       vmcs_read32(EXCEPTION_BITMAP),
6034	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6035	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6036	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6037	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6038	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6039	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6040	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6041	       vmcs_read32(VM_EXIT_INTR_INFO),
6042	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6043	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6044	pr_err("        reason=%08x qualification=%016lx\n",
6045	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6046	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6047	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
6048	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
6049	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6050	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6051		pr_err("TSC Multiplier = 0x%016llx\n",
6052		       vmcs_read64(TSC_MULTIPLIER));
6053	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6054		if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6055			u16 status = vmcs_read16(GUEST_INTR_STATUS);
6056			pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6057		}
6058		pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6059		if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6060			pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6061		pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6062	}
6063	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6064		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6065	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6066		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6067	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6068		pr_err("PLE Gap=%08x Window=%08x\n",
6069		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6070	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6071		pr_err("Virtual processor ID = 0x%04x\n",
6072		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
6073}
6074
6075/*
6076 * The guest has exited.  See if we can fix it or if we need userspace
6077 * assistance.
6078 */
6079static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6080{
6081	struct vcpu_vmx *vmx = to_vmx(vcpu);
6082	union vmx_exit_reason exit_reason = vmx->exit_reason;
6083	u32 vectoring_info = vmx->idt_vectoring_info;
6084	u16 exit_handler_index;
6085
6086	/*
6087	 * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap more
6088	 * up to date. Another benefit is that, in kvm_vm_ioctl_get_dirty_log,
6089	 * before querying dirty_bitmap, we only need to kick all vcpus out of
6090	 * guest mode, since once a vcpu is back in root mode its PML buffer
6091	 * must already have been flushed.
6092	 */
6093	if (enable_pml)
6094		vmx_flush_pml_buffer(vcpu);
6095
6096	/*
6097	 * We should never reach this point with a pending nested VM-Enter, and
6098	 * more specifically emulation of L2 due to invalid guest state (see
6099	 * below) should never happen as that means we incorrectly allowed a
6100	 * nested VM-Enter with an invalid vmcs12.
6101	 */
6102	WARN_ON_ONCE(vmx->nested.nested_run_pending);
6103
6104	/* If guest state is invalid, start emulating */
6105	if (vmx->emulation_required)
6106		return handle_invalid_guest_state(vcpu);
6107
6108	if (is_guest_mode(vcpu)) {
6109		/*
6110		 * The host physical addresses of some pages of guest memory
6111		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6112		 * Page). The CPU may write to these pages via their host
6113		 * physical address while L2 is running, bypassing any
6114		 * address-translation-based dirty tracking (e.g. EPT write
6115		 * protection).
6116		 *
6117		 * Mark them dirty on every exit from L2 to prevent them from
6118		 * getting out of sync with dirty tracking.
6119		 */
6120		nested_mark_vmcs12_pages_dirty(vcpu);
6121
6122		if (nested_vmx_reflect_vmexit(vcpu))
6123			return 1;
6124	}
6125
6126	if (exit_reason.failed_vmentry) {
6127		dump_vmcs();
6128		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6129		vcpu->run->fail_entry.hardware_entry_failure_reason
6130			= exit_reason.full;
6131		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6132		return 0;
6133	}
6134
6135	if (unlikely(vmx->fail)) {
6136		dump_vmcs();
6137		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6138		vcpu->run->fail_entry.hardware_entry_failure_reason
6139			= vmcs_read32(VM_INSTRUCTION_ERROR);
6140		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6141		return 0;
6142	}
6143
6144	/*
6145	 * Note:
6146	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a
6147	 * delivery event, since that indicates the guest is accessing MMIO.
6148	 * The VM-exit can be triggered again after returning to the guest,
6149	 * which would cause an infinite loop.
6150	 */
6151	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6152	    (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6153	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6154	     exit_reason.basic != EXIT_REASON_PML_FULL &&
6155	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6156	     exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
6157		int ndata = 3;
6158
6159		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6160		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6161		vcpu->run->internal.data[0] = vectoring_info;
6162		vcpu->run->internal.data[1] = exit_reason.full;
6163		vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6164		if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6165			vcpu->run->internal.data[ndata++] =
6166				vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6167		}
6168		vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6169		vcpu->run->internal.ndata = ndata;
6170		return 0;
6171	}
6172
6173	if (unlikely(!enable_vnmi &&
6174		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
6175		if (!vmx_interrupt_blocked(vcpu)) {
6176			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6177		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6178			   vcpu->arch.nmi_pending) {
6179			/*
6180			 * This CPU doesn't help us find the end of an
6181			 * NMI-blocked window if the guest runs with IRQs
6182			 * disabled. So we pull the trigger after 1 s of
6183			 * futile waiting, but inform the user about this.
6184			 */
6185			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6186			       "state on VCPU %d after 1 s timeout\n",
6187			       __func__, vcpu->vcpu_id);
6188			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6189		}
6190	}
6191
6192	if (exit_fastpath != EXIT_FASTPATH_NONE)
6193		return 1;
6194
6195	if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6196		goto unexpected_vmexit;
6197#ifdef CONFIG_RETPOLINE
6198	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6199		return kvm_emulate_wrmsr(vcpu);
6200	else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6201		return handle_preemption_timer(vcpu);
6202	else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6203		return handle_interrupt_window(vcpu);
6204	else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6205		return handle_external_interrupt(vcpu);
6206	else if (exit_reason.basic == EXIT_REASON_HLT)
6207		return kvm_emulate_halt(vcpu);
6208	else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6209		return handle_ept_misconfig(vcpu);
6210#endif
6211
6212	exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6213						kvm_vmx_max_exit_handlers);
6214	if (!kvm_vmx_exit_handlers[exit_handler_index])
6215		goto unexpected_vmexit;
6216
6217	return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6218
6219unexpected_vmexit:
6220	vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6221		    exit_reason.full);
6222	dump_vmcs();
6223	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6224	vcpu->run->internal.suberror =
6225			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6226	vcpu->run->internal.ndata = 2;
6227	vcpu->run->internal.data[0] = exit_reason.full;
6228	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6229	return 0;
6230}
6231
6232/*
6233 * Software based L1D cache flush which is used when microcode providing
6234 * the cache control MSR is not loaded.
6235 *
6236 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
6237 * flushing it requires reading in 64 KiB because the replacement algorithm
6238 * is not exactly LRU. This could be sized at runtime via topology
6239 * information, but as all relevant affected CPUs have a 32 KiB L1D cache
6240 * there is no point in doing so.
6241 */
6242static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6243{
6244	int size = PAGE_SIZE << L1D_CACHE_ORDER;
6245
6246	/*
6247	 * This code is only executed when the flush mode is 'cond' or
6248	 * 'always'.
6249	 */
6250	if (static_branch_likely(&vmx_l1d_flush_cond)) {
6251		bool flush_l1d;
6252
6253		/*
6254		 * Clear the per-vcpu flush bit, it gets set again
6255		 * either from vcpu_run() or from one of the unsafe
6256		 * VMEXIT handlers.
6257		 */
6258		flush_l1d = vcpu->arch.l1tf_flush_l1d;
6259		vcpu->arch.l1tf_flush_l1d = false;
6260
6261		/*
6262		 * Clear the per-cpu flush bit, it gets set again from
6263		 * the interrupt handlers.
6264		 */
6265		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6266		kvm_clear_cpu_l1tf_flush_l1d();
6267
6268		if (!flush_l1d)
6269			return;
6270	}
6271
6272	vcpu->stat.l1d_flush++;
6273
6274	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6275		native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6276		return;
6277	}
6278
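	/*
	 * Software fallback: first touch one byte per 4K page of the flush
	 * buffer so its translations are in the TLB, serialize with CPUID,
	 * then read every 64-byte cache line of the buffer to displace the
	 * existing L1D contents.
	 */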
6279	asm volatile(
6280		/* First ensure the pages are in the TLB */
6281		"xorl	%%eax, %%eax\n"
6282		".Lpopulate_tlb:\n\t"
6283		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6284		"addl	$4096, %%eax\n\t"
6285		"cmpl	%%eax, %[size]\n\t"
6286		"jne	.Lpopulate_tlb\n\t"
6287		"xorl	%%eax, %%eax\n\t"
6288		"cpuid\n\t"
6289		/* Now fill the cache */
6290		"xorl	%%eax, %%eax\n"
6291		".Lfill_cache:\n"
6292		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6293		"addl	$64, %%eax\n\t"
6294		"cmpl	%%eax, %[size]\n\t"
6295		"jne	.Lfill_cache\n\t"
6296		"lfence\n"
6297		:: [flush_pages] "r" (vmx_l1d_flush_pages),
6298		    [size] "r" (size)
6299		: "eax", "ebx", "ecx", "edx");
6300}
6301
6302static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6303{
6304	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6305	int tpr_threshold;
6306
6307	if (is_guest_mode(vcpu) &&
6308		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6309		return;
6310
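	/*
	 * If no interrupt is pending, or the highest pending interrupt's
	 * priority already exceeds the TPR, there is nothing to wait for and
	 * the threshold is 0; otherwise program the threshold so that lowering
	 * the TPR below that priority triggers a TPR-below-threshold exit.
	 */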
6311	tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6312	if (is_guest_mode(vcpu))
6313		to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6314	else
6315		vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6316}
6317
6318void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6319{
6320	struct vcpu_vmx *vmx = to_vmx(vcpu);
6321	u32 sec_exec_control;
6322
6323	if (!lapic_in_kernel(vcpu))
6324		return;
6325
6326	if (!flexpriority_enabled &&
6327	    !cpu_has_vmx_virtualize_x2apic_mode())
6328		return;
6329
6330	/* Postpone execution until vmcs01 is the current VMCS. */
6331	if (is_guest_mode(vcpu)) {
6332		vmx->nested.change_vmcs01_virtual_apic_mode = true;
6333		return;
6334	}
6335
6336	sec_exec_control = secondary_exec_controls_get(vmx);
6337	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6338			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6339
6340	switch (kvm_get_apic_mode(vcpu)) {
6341	case LAPIC_MODE_INVALID:
6342		WARN_ONCE(true, "Invalid local APIC state");
		fallthrough;
6343	case LAPIC_MODE_DISABLED:
6344		break;
6345	case LAPIC_MODE_XAPIC:
6346		if (flexpriority_enabled) {
6347			sec_exec_control |=
6348				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6349			kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6350
6351			/*
6352			 * Flush the TLB. Reloading the APIC access page will
6353			 * only do so if its physical address has changed, but
6354			 * the guest may have inserted a non-APIC mapping into
6355			 * the TLB while the APIC access page was disabled.
6356			 */
6357			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6358		}
6359		break;
6360	case LAPIC_MODE_X2APIC:
6361		if (cpu_has_vmx_virtualize_x2apic_mode())
6362			sec_exec_control |=
6363				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6364		break;
6365	}
6366	secondary_exec_controls_set(vmx, sec_exec_control);
6367
6368	vmx_update_msr_bitmap(vcpu);
6369}
6370
6371static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6372{
6373	struct page *page;
6374
6375	/* Defer reload until vmcs01 is the current VMCS. */
6376	if (is_guest_mode(vcpu)) {
6377		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6378		return;
6379	}
6380
6381	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6382	    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6383		return;
6384
6385	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6386	if (is_error_page(page))
6387		return;
6388
6389	vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
6390	vmx_flush_tlb_current(vcpu);
6391
6392	/*
6393	 * Do not pin the APIC access page in memory; the MMU notifier
6394	 * will call us again if it is migrated or swapped out.
6395	 */
6396	put_page(page);
6397}
6398
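/*
 * The guest interrupt status VMCS field packs SVI (the highest in-service
 * vector) in the high byte and RVI (the highest requesting vector) in the low
 * byte; the two helpers below each update one byte without disturbing the
 * other.
 */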
6399static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
6400{
6401	u16 status;
6402	u8 old;
6403
6404	if (max_isr == -1)
6405		max_isr = 0;
6406
6407	status = vmcs_read16(GUEST_INTR_STATUS);
6408	old = status >> 8;
6409	if (max_isr != old) {
6410		status &= 0xff;
6411		status |= max_isr << 8;
6412		vmcs_write16(GUEST_INTR_STATUS, status);
6413	}
6414}
6415
6416static void vmx_set_rvi(int vector)
6417{
6418	u16 status;
6419	u8 old;
6420
6421	if (vector == -1)
6422		vector = 0;
6423
6424	status = vmcs_read16(GUEST_INTR_STATUS);
6425	old = (u8)status & 0xff;
6426	if ((u8)vector != old) {
6427		status &= ~0xff;
6428		status |= (u8)vector;
6429		vmcs_write16(GUEST_INTR_STATUS, status);
6430	}
6431}
6432
6433static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6434{
6435	/*
6436	 * When running L2, updating RVI is only relevant when
6437	 * vmcs12 has virtual-interrupt-delivery enabled.
6438	 * However, that can be enabled only when L1 also
6439	 * intercepts external interrupts, and in that case
6440	 * we should not update vmcs02's RVI but instead intercept
6441	 * the interrupt. Therefore, do nothing when running L2.
6442	 */
6443	if (!is_guest_mode(vcpu))
6444		vmx_set_rvi(max_irr);
6445}
6446
6447static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6448{
6449	struct vcpu_vmx *vmx = to_vmx(vcpu);
6450	int max_irr;
6451	bool max_irr_updated;
6452
6453	WARN_ON(!vcpu->arch.apicv_active);
6454	if (pi_test_on(&vmx->pi_desc)) {
6455		pi_clear_on(&vmx->pi_desc);
6456		/*
6457		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
6458		 * But on x86 this is just a compiler barrier anyway.
6459		 */
6460		smp_mb__after_atomic();
6461		max_irr_updated =
6462			kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6463
6464		/*
6465		 * If we are running L2 and L1 has a new pending interrupt
6466		 * which can be injected, this may cause a vmexit or it may
6467		 * be injected into L2.  Either way, this interrupt will be
6468		 * processed via KVM_REQ_EVENT, not RVI, because we do not use
6469		 * virtual interrupt delivery to inject L1 interrupts into L2.
6470		 */
6471		if (is_guest_mode(vcpu) && max_irr_updated)
6472			kvm_make_request(KVM_REQ_EVENT, vcpu);
6473	} else {
6474		max_irr = kvm_lapic_find_highest_irr(vcpu);
6475	}
6476	vmx_hwapic_irr_update(vcpu, max_irr);
6477	return max_irr;
6478}
6479
6480static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6481{
6482	if (!kvm_vcpu_apicv_active(vcpu))
6483		return;
6484
6485	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6486	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6487	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6488	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6489}
6490
6491static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6492{
6493	struct vcpu_vmx *vmx = to_vmx(vcpu);
6494
6495	pi_clear_on(&vmx->pi_desc);
6496	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6497}
6498
6499void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
6500
6501static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
6502					unsigned long entry)
6503{
6504	kvm_before_interrupt(vcpu);
6505	vmx_do_interrupt_nmi_irqoff(entry);
6506	kvm_after_interrupt(vcpu);
6507}
6508
6509static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
6510{
6511	const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
6512	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
6513
6514	/* if exit due to PF check for async PF */
6515	if (is_page_fault(intr_info))
6516		vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6517	/* Handle machine checks before interrupts are enabled */
6518	else if (is_machine_check(intr_info))
6519		kvm_machine_check();
6520	/* We need to handle NMIs before interrupts are enabled */
6521	else if (is_nmi(intr_info))
6522		handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
6523}
6524
6525static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6526{
6527	u32 intr_info = vmx_get_intr_info(vcpu);
6528	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6529	gate_desc *desc = (gate_desc *)host_idt_base + vector;
6530
6531	if (WARN_ONCE(!is_external_intr(intr_info),
6532	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
6533		return;
6534
6535	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
6536	vcpu->arch.at_instruction_boundary = true;
6537}
6538
6539static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6540{
6541	struct vcpu_vmx *vmx = to_vmx(vcpu);
6542
6543	if (vmx->emulation_required)
6544		return;
6545
6546	if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6547		handle_external_interrupt_irqoff(vcpu);
6548	else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
6549		handle_exception_nmi_irqoff(vmx);
6550}
6551
6552static bool vmx_has_emulated_msr(u32 index)
6553{
6554	switch (index) {
6555	case MSR_IA32_SMBASE:
6556		/*
6557		 * We cannot do SMM unless we can run the guest in big
6558		 * real mode.
6559		 */
6560		return enable_unrestricted_guest || emulate_invalid_guest_state;
6561	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6562		return nested;
6563	case MSR_AMD64_VIRT_SPEC_CTRL:
6564		/* This is AMD only.  */
6565		return false;
6566	default:
6567		return true;
6568	}
6569}
6570
6571static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6572{
6573	u32 exit_intr_info;
6574	bool unblock_nmi;
6575	u8 vector;
6576	bool idtv_info_valid;
6577
6578	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6579
6580	if (enable_vnmi) {
6581		if (vmx->loaded_vmcs->nmi_known_unmasked)
6582			return;
6583
6584		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
6585		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6586		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6587		/*
6588		 * SDM 3: 27.7.1.2 (September 2008)
6589		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
6590		 * a guest IRET fault.
6591		 * SDM 3: 23.2.2 (September 2008)
6592		 * Bit 12 is undefined in any of the following cases:
6593		 *  If the VM exit sets the valid bit in the IDT-vectoring
6594		 *   information field.
6595		 *  If the VM exit is due to a double fault.
6596		 */
6597		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6598		    vector != DF_VECTOR && !idtv_info_valid)
6599			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6600				      GUEST_INTR_STATE_NMI);
6601		else
6602			vmx->loaded_vmcs->nmi_known_unmasked =
6603				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
6604				  & GUEST_INTR_STATE_NMI);
6605	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
6606		vmx->loaded_vmcs->vnmi_blocked_time +=
6607			ktime_to_ns(ktime_sub(ktime_get(),
6608					      vmx->loaded_vmcs->entry_time));
6609}
6610
6611static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
6612				      u32 idt_vectoring_info,
6613				      int instr_len_field,
6614				      int error_code_field)
6615{
6616	u8 vector;
6617	int type;
6618	bool idtv_info_valid;
6619
6620	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6621
6622	vcpu->arch.nmi_injected = false;
6623	kvm_clear_exception_queue(vcpu);
6624	kvm_clear_interrupt_queue(vcpu);
6625
6626	if (!idtv_info_valid)
6627		return;
6628
6629	kvm_make_request(KVM_REQ_EVENT, vcpu);
6630
6631	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
6632	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
6633
6634	switch (type) {
6635	case INTR_TYPE_NMI_INTR:
6636		vcpu->arch.nmi_injected = true;
6637		/*
6638		 * SDM 3: 27.7.1.2 (September 2008)
6639		 * Clear bit "block by NMI" before VM entry if an NMI
6640		 * delivery faulted.
6641		 */
6642		vmx_set_nmi_mask(vcpu, false);
6643		break;
6644	case INTR_TYPE_SOFT_EXCEPTION:
6645		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6646		fallthrough;
6647	case INTR_TYPE_HARD_EXCEPTION:
6648		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
6649			u32 err = vmcs_read32(error_code_field);
6650			kvm_requeue_exception_e(vcpu, vector, err);
6651		} else
6652			kvm_requeue_exception(vcpu, vector);
6653		break;
6654	case INTR_TYPE_SOFT_INTR:
6655		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6656		fallthrough;
6657	case INTR_TYPE_EXT_INTR:
6658		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
6659		break;
6660	default:
6661		break;
6662	}
6663}
6664
6665static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
6666{
6667	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
6668				  VM_EXIT_INSTRUCTION_LEN,
6669				  IDT_VECTORING_ERROR_CODE);
6670}
6671
6672static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
6673{
6674	__vmx_complete_interrupts(vcpu,
6675				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6676				  VM_ENTRY_INSTRUCTION_LEN,
6677				  VM_ENTRY_EXCEPTION_ERROR_CODE);
6678
6679	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
6680}
6681
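/*
 * Keep the VMCS MSR-switch lists in sync with perf: an MSR whose host and
 * guest values match needs no switching and is dropped from the lists, while
 * any MSR that differs is loaded/restored automatically by the CPU on
 * VM-entry/VM-exit.
 */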
6682static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
6683{
6684	int i, nr_msrs;
6685	struct perf_guest_switch_msr *msrs;
6686
6687	msrs = perf_guest_get_msrs(&nr_msrs);
6688
6689	if (!msrs)
6690		return;
6691
6692	for (i = 0; i < nr_msrs; i++)
6693		if (msrs[i].host == msrs[i].guest)
6694			clear_atomic_switch_msr(vmx, msrs[i].msr);
6695		else
6696			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
6697					msrs[i].host, false);
6698}
6699
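/*
 * Program the VMX preemption timer for this entry: zero forces an immediate
 * exit, a pending deadline is converted to a TSC delta scaled by the
 * preemption-timer rate, and otherwise the timer is parked at its maximum
 * value, which is remembered as "soft disabled" so the field isn't rewritten
 * on every entry.
 */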
6700static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
6701{
6702	struct vcpu_vmx *vmx = to_vmx(vcpu);
6703	u64 tscl;
6704	u32 delta_tsc;
6705
6706	if (vmx->req_immediate_exit) {
6707		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
6708		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
6709	} else if (vmx->hv_deadline_tsc != -1) {
6710		tscl = rdtsc();
6711		if (vmx->hv_deadline_tsc > tscl)
6712			/* set_hv_timer ensures the delta fits in 32-bits */
6713			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
6714				cpu_preemption_timer_multi);
6715		else
6716			delta_tsc = 0;
6717
6718		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
6719		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
6720	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
6721		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
6722		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
6723	}
6724}
6725
6726void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
6727{
6728	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
6729		vmx->loaded_vmcs->host_state.rsp = host_rsp;
6730		vmcs_writel(HOST_RSP, host_rsp);
6731	}
6732}
6733
6734void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
6735					unsigned int flags)
6736{
6737	u64 hostval = this_cpu_read(x86_spec_ctrl_current);
6738
6739	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
6740		return;
6741
6742	if (flags & VMX_RUN_SAVE_SPEC_CTRL)
6743		vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
6744
6745	/*
6746	 * If the guest/host SPEC_CTRL values differ, restore the host value.
6747	 *
6748	 * For legacy IBRS, the IBRS bit always needs to be written after
6749	 * transitioning from a less privileged predictor mode, regardless of
6750	 * whether the guest/host values differ.
6751	 */
6752	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
6753	    vmx->spec_ctrl != hostval)
6754		native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
6755
6756	barrier_nospec();
6757}
6758
6759static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
6760{
6761	switch (to_vmx(vcpu)->exit_reason.basic) {
6762	case EXIT_REASON_MSR_WRITE:
6763		return handle_fastpath_set_msr_irqoff(vcpu);
6764	case EXIT_REASON_PREEMPTION_TIMER:
6765		return handle_fastpath_preemption_timer(vcpu);
6766	default:
6767		return EXIT_FASTPATH_NONE;
6768	}
6769}
6770
6771static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
6772					struct vcpu_vmx *vmx,
6773					unsigned long flags)
6774{
6775	/*
6776	 * VMENTER enables interrupts (host state), but the kernel state has
6777	 * interrupts disabled when this is invoked. Also tell RCU about
6778	 * it. This is the same logic as for exit_to_user_mode().
6779	 *
6780	 * This ensures that e.g. latency analysis on the host observes
6781	 * guest mode as interrupt enabled.
6782	 *
6783	 * guest_enter_irqoff() informs context tracking about the
6784	 * transition to guest mode and if enabled adjusts RCU state
6785	 * accordingly.
6786	 */
6787	instrumentation_begin();
6788	trace_hardirqs_on_prepare();
6789	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
6790	instrumentation_end();
6791
6792	guest_enter_irqoff();
6793	lockdep_hardirqs_on(CALLER_ADDR0);
6794
6795	/* L1D Flush includes CPU buffer clear to mitigate MDS */
6796	if (static_branch_unlikely(&vmx_l1d_should_flush))
6797		vmx_l1d_flush(vcpu);
6798	else if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF))
6799		mds_clear_cpu_buffers();
6800	else if (static_branch_unlikely(&mmio_stale_data_clear) &&
6801		 kvm_arch_has_assigned_device(vcpu->kvm))
6802		mds_clear_cpu_buffers();
6803
6804	vmx_disable_fb_clear(vmx);
6805
6806	if (vcpu->arch.cr2 != native_read_cr2())
6807		native_write_cr2(vcpu->arch.cr2);
6808
6809	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
6810				   flags);
6811
6812	vcpu->arch.cr2 = native_read_cr2();
6813
6814	vmx_enable_fb_clear(vmx);
6815
6816	/*
6817	 * VMEXIT disables interrupts (host state), but tracing and lockdep
6818	 * have them in state 'on' as recorded before entering guest mode.
6819	 * Same as enter_from_user_mode().
6820	 *
6821	 * context_tracking_guest_exit() restores host context and reinstates
6822	 * RCU if enabled and required.
6823	 *
6824	 * This needs to be done before the below as native_read_msr()
6825	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
6826	 * further into instrumentable code.
6827	 */
6828	lockdep_hardirqs_off(CALLER_ADDR0);
6829	context_tracking_guest_exit();
6830
6831	instrumentation_begin();
6832	trace_hardirqs_off_finish();
6833	instrumentation_end();
6834}
6835
6836static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
6837{
6838	fastpath_t exit_fastpath;
6839	struct vcpu_vmx *vmx = to_vmx(vcpu);
6840	unsigned long cr3, cr4;
6841
6842reenter_guest:
6843	/* Record the guest's net vcpu time for enforced NMI injections. */
6844	if (unlikely(!enable_vnmi &&
6845		     vmx->loaded_vmcs->soft_vnmi_blocked))
6846		vmx->loaded_vmcs->entry_time = ktime_get();
6847
6848	/* Don't enter VMX if guest state is invalid; let the exit handler
6849	   start emulation until we arrive back at a valid state */
6850	if (vmx->emulation_required)
6851		return EXIT_FASTPATH_NONE;
6852
6853	if (vmx->ple_window_dirty) {
6854		vmx->ple_window_dirty = false;
6855		vmcs_write32(PLE_WINDOW, vmx->ple_window);
6856	}
6857
6858	/*
6859	 * We did this in prepare_switch_to_guest, because it needs to
6860	 * be within srcu_read_lock.
6861	 */
6862	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
6863
6864	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
6865		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
6866	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
6867		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
6868
6869	cr3 = __get_current_cr3_fast();
6870	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
6871		vmcs_writel(HOST_CR3, cr3);
6872		vmx->loaded_vmcs->host_state.cr3 = cr3;
6873	}
6874
6875	cr4 = cr4_read_shadow();
6876	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
6877		vmcs_writel(HOST_CR4, cr4);
6878		vmx->loaded_vmcs->host_state.cr4 = cr4;
6879	}
6880
6881	/* When single-stepping over STI and MOV SS, we must clear the
6882	 * corresponding interruptibility bits in the guest state. Otherwise
6883	 * vmentry fails as it then expects bit 14 (BS) in pending debug
6884	 * exceptions to be set, but that's not correct for the guest debugging
6885	 * case. */
6886	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6887		vmx_set_interrupt_shadow(vcpu, 0);
6888
6889	kvm_load_guest_xsave_state(vcpu);
6890
6891	pt_guest_enter(vmx);
6892
6893	atomic_switch_perf_msrs(vmx);
6894
6895	if (enable_preemption_timer)
6896		vmx_update_hv_timer(vcpu);
6897
6898	kvm_wait_lapic_expire(vcpu);
6899
6900	/*
6901	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
6902	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
6903	 * is no need to worry about the conditional branch over the wrmsr
6904	 * being speculatively taken.
6905	 */
6906	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
6907
6908	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
6909	vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx));
6910
6911	/* All fields are clean at this point */
6912	if (static_branch_unlikely(&enable_evmcs))
6913		current_evmcs->hv_clean_fields |=
6914			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
6915
6916	if (static_branch_unlikely(&enable_evmcs))
6917		current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
6918
6919	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
6920	if (vmx->host_debugctlmsr)
6921		update_debugctlmsr(vmx->host_debugctlmsr);
6922
6923#ifndef CONFIG_X86_64
6924	/*
6925	 * The sysexit path does not restore ds/es, so we must set them to
6926	 * a reasonable value ourselves.
6927	 *
6928	 * We can't defer this to vmx_prepare_switch_to_host() since that
6929	 * function may be executed in interrupt context, which saves and
6930	 * restores segments around it, nullifying its effect.
6931	 */
6932	loadsegment(ds, __USER_DS);
6933	loadsegment(es, __USER_DS);
6934#endif
6935
6936	vmx_register_cache_reset(vcpu);
6937
6938	pt_guest_exit(vmx);
6939
6940	kvm_load_host_xsave_state(vcpu);
6941
6942	vmx->nested.nested_run_pending = 0;
6943	vmx->idt_vectoring_info = 0;
6944
6945	if (unlikely(vmx->fail)) {
6946		vmx->exit_reason.full = 0xdead;
6947		return EXIT_FASTPATH_NONE;
6948	}
6949
6950	vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
6951	if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
6952		kvm_machine_check();
6953
6954	trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
6955
6956	if (unlikely(vmx->exit_reason.failed_vmentry))
6957		return EXIT_FASTPATH_NONE;
6958
6959	vmx->loaded_vmcs->launched = 1;
6960	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
6961
6962	vmx_recover_nmi_blocking(vmx);
6963	vmx_complete_interrupts(vmx);
6964
6965	if (is_guest_mode(vcpu))
6966		return EXIT_FASTPATH_NONE;
6967
6968	exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
6969	if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
6970		if (!kvm_vcpu_exit_request(vcpu)) {
6971			/*
6972			 * FIXME: this goto should be a loop in vcpu_enter_guest,
6973			 * but it would incur the cost of a retpoline for now.
6974			 * Revisit once static calls are available.
6975			 */
6976			if (vcpu->arch.apicv_active)
6977				vmx_sync_pir_to_irr(vcpu);
6978			goto reenter_guest;
6979		}
6980		exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
6981	}
6982
6983	return exit_fastpath;
6984}
6985
6986static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6987{
6988	struct vcpu_vmx *vmx = to_vmx(vcpu);
6989
6990	if (enable_pml)
6991		vmx_destroy_pml_buffer(vmx);
6992	free_vpid(vmx->vpid);
6993	nested_vmx_free_vcpu(vcpu);
6994	free_loaded_vmcs(vmx->loaded_vmcs);
6995}
6996
6997static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
6998{
6999	struct vcpu_vmx *vmx;
7000	int i, cpu, err;
7001
7002	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7003	vmx = to_vmx(vcpu);
7004
7005	err = -ENOMEM;
7006
7007	vmx->vpid = allocate_vpid();
7008
7009	/*
7010	 * If PML is turned on, a failure to enable PML simply results in failure
7011	 * to create the vcpu, so we can simplify the PML logic (by
7012	 * avoiding cases such as PML being enabled on only some of the
7013	 * guest's vcpus), etc.
7014	 */
7015	if (enable_pml) {
7016		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7017		if (!vmx->pml_pg)
7018			goto free_vpid;
7019	}
7020
7021	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
7022
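	/*
	 * Populate guest_uret_msrs with only the user-return MSRs this host
	 * actually supports (kvm_probe_user_return_msr() filters out the
	 * rest), recording each MSR's position in vmx_uret_msrs_list as its
	 * slot.
	 */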
7023	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
7024		u32 index = vmx_uret_msrs_list[i];
7025		int j = vmx->nr_uret_msrs;
7026
7027		if (kvm_probe_user_return_msr(index))
7028			continue;
7029
7030		vmx->guest_uret_msrs[j].slot = i;
7031		vmx->guest_uret_msrs[j].data = 0;
7032		switch (index) {
7033		case MSR_IA32_TSX_CTRL:
7034			/*
7035			 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
7036			 * interception.  Keep the host value unchanged to avoid
7037			 * changing CPUID bits under the host kernel's feet.
7038			 *
7039			 * hle=0, rtm=0, tsx_ctrl=1 can be found with some
7040			 * combinations of new kernel and old userspace.  If
7041			 * those guests run on a tsx=off host, do allow guests
7042			 * to use TSX_CTRL, but do not change the value on the
7043			 * host so that TSX remains always disabled.
7044			 */
7045			if (boot_cpu_has(X86_FEATURE_RTM))
7046				vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7047			else
7048				vmx->guest_uret_msrs[j].mask = 0;
7049			break;
7050		default:
7051			vmx->guest_uret_msrs[j].mask = -1ull;
7052			break;
7053		}
7054		++vmx->nr_uret_msrs;
7055	}
7056
7057	err = alloc_loaded_vmcs(&vmx->vmcs01);
7058	if (err < 0)
7059		goto free_pml;
7060
7061	/*
7062	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7063	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7064	 * feature only for vmcs01; KVM currently isn't equipped to realize any
7065	 * performance benefits from enabling it for vmcs02.
7066	 */
7067	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
7068	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7069		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7070
7071		evmcs->hv_enlightenments_control.msr_bitmap = 1;
7072	}
7073
7074	/* The MSR bitmap starts with all ones */
7075	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7076	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7077
7078	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
7079#ifdef CONFIG_X86_64
7080	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7081	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7082	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7083#endif
7084	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7085	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7086	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7087	if (kvm_cstate_in_guest(vcpu->kvm)) {
7088		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7089		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7090		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7091		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7092	}
7093	vmx->msr_bitmap_mode = 0;
7094
7095	vmx->loaded_vmcs = &vmx->vmcs01;
7096	cpu = get_cpu();
7097	vmx_vcpu_load(vcpu, cpu);
7098	vcpu->cpu = cpu;
7099	init_vmcs(vmx);
7100	vmx_vcpu_put(vcpu);
7101	put_cpu();
7102	if (cpu_need_virtualize_apic_accesses(vcpu)) {
7103		err = alloc_apic_access_page(vcpu->kvm);
7104		if (err)
7105			goto free_vmcs;
7106	}
7107
7108	if (enable_ept && !enable_unrestricted_guest) {
7109		err = init_rmode_identity_map(vcpu->kvm);
7110		if (err)
7111			goto free_vmcs;
7112	}
7113
7114	if (nested)
7115		memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
7116	else
7117		memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
7118
7119	vmx->nested.posted_intr_nv = -1;
7120	vmx->nested.current_vmptr = -1ull;
7121
7122	vcpu->arch.microcode_version = 0x100000000ULL;
7123	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
7124
7125	/*
7126	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
7127	 * or POSTED_INTR_WAKEUP_VECTOR.
7128	 */
7129	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
7130	vmx->pi_desc.sn = 1;
7131
7132	vmx->ept_pointer = INVALID_PAGE;
7133
7134	return 0;
7135
7136free_vmcs:
7137	free_loaded_vmcs(vmx->loaded_vmcs);
7138free_pml:
7139	vmx_destroy_pml_buffer(vmx);
7140free_vpid:
7141	free_vpid(vmx->vpid);
7142	return err;
7143}
7144
7145#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7146#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7147
7148static int vmx_vm_init(struct kvm *kvm)
7149{
7150	spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
7151
7152	if (!ple_gap)
7153		kvm->arch.pause_in_guest = true;
7154
7155	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7156		switch (l1tf_mitigation) {
7157		case L1TF_MITIGATION_OFF:
7158		case L1TF_MITIGATION_FLUSH_NOWARN:
7159			/* 'I explicitly don't care' is set */
7160			break;
7161		case L1TF_MITIGATION_FLUSH:
7162		case L1TF_MITIGATION_FLUSH_NOSMT:
7163		case L1TF_MITIGATION_FULL:
7164			/*
7165			 * Warn upon starting the first VM in a potentially
7166			 * insecure environment.
7167			 */
7168			if (sched_smt_active())
7169				pr_warn_once(L1TF_MSG_SMT);
7170			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7171				pr_warn_once(L1TF_MSG_L1D);
7172			break;
7173		case L1TF_MITIGATION_FULL_FORCE:
7174			/* Flush is enforced */
7175			break;
7176		}
7177	}
7178	kvm_apicv_init(kvm, enable_apicv);
7179	return 0;
7180}
7181
7182static int __init vmx_check_processor_compat(void)
7183{
7184	struct vmcs_config vmcs_conf;
7185	struct vmx_capability vmx_cap;
7186
7187	if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
7188	    !this_cpu_has(X86_FEATURE_VMX)) {
7189		pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
7190		return -EIO;
7191	}
7192
7193	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
7194		return -EIO;
7195	if (nested)
7196		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
7197	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
7198		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
7199				smp_processor_id());
7200		return -EIO;
7201	}
7202	return 0;
7203}
7204
7205static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7206{
7207	u8 cache;
7208	u64 ipat = 0;
7209
7210	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
7211	 * memory aliases with conflicting memory types and sometimes MCEs.
7212	 * We have to be careful as to what is honored and when.
7213	 *
7214	 * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
7215	 * UC.  The effective memory type is UC or WC depending on guest PAT.
7216	 * This was historically the source of MCEs and we want to be
7217	 * conservative.
7218	 *
7219	 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
7220	 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
7221	 * EPT memory type is set to WB.  The effective memory type is forced
7222	 * WB.
7223	 *
7224	 * Otherwise, we trust the guest.  Guest CD/MTRR/PAT are all honored.  The
7225	 * EPT memory type is used to emulate guest CD/MTRR.
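	 *
	 * For example, the "no noncoherent DMA" case below returns
	 * (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT,
	 * i.e. (6 << 3) | (1 << 6) = 0x70, whereas the MMIO case returns 0
	 * (UC memory type with IPAT clear), leaving guest PAT to choose UC
	 * or WC.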
7226	 */
7227
7228	if (is_mmio) {
7229		cache = MTRR_TYPE_UNCACHABLE;
7230		goto exit;
7231	}
7232
7233	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
7234		ipat = VMX_EPT_IPAT_BIT;
7235		cache = MTRR_TYPE_WRBACK;
7236		goto exit;
7237	}
7238
7239	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
7240		ipat = VMX_EPT_IPAT_BIT;
7241		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7242			cache = MTRR_TYPE_WRBACK;
7243		else
7244			cache = MTRR_TYPE_UNCACHABLE;
7245		goto exit;
7246	}
7247
7248	cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
7249
7250exit:
7251	return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
7252}
7253
7254static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
7255{
7256	/*
7257	 * These bits in the secondary execution controls field
7258	 * are dynamic; the others are mostly based on the hypervisor
7259	 * architecture and the guest's CPUID.  Do not touch the
7260	 * dynamic bits.
7261	 */
7262	u32 mask =
7263		SECONDARY_EXEC_SHADOW_VMCS |
7264		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7265		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7266		SECONDARY_EXEC_DESC;
7267
7268	u32 new_ctl = vmx->secondary_exec_control;
7269	u32 cur_ctl = secondary_exec_controls_get(vmx);
7270
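	/*
	 * Keep the current values of the dynamic bits and take everything
	 * else from the freshly computed secondary_exec_control.
	 */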
7271	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7272}
7273
7274/*
7275 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7276 * (indicating "allowed-1") if they are supported in the guest's CPUID.
7277 */
7278static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7279{
7280	struct vcpu_vmx *vmx = to_vmx(vcpu);
7281	struct kvm_cpuid_entry2 *entry;
7282
7283	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7284	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7285
7286#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
7287	if (entry && (entry->_reg & (_cpuid_mask)))			\
7288		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);	\
7289} while (0)
7290
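	/*
	 * For example, cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME))
	 * marks CR4.VME as allowed-1 for the nested guest only when the
	 * guest's CPUID.01H:EDX exposes VME.
	 */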
7291	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
7292	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
7293	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
7294	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
7295	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
7296	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
7297	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
7298	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
7299	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
7300	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
7301	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7302	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
7303	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
7304	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
7305	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
7306
7307	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
7308	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
7309	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
7310	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
7311	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
7312	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
7313	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
7314
7315#undef cr4_fixed1_update
7316}
7317
7318static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
7319{
7320	struct vcpu_vmx *vmx = to_vmx(vcpu);
7321
7322	if (kvm_mpx_supported()) {
7323		bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
7324
7325		if (mpx_enabled) {
7326			vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
7327			vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
7328		} else {
7329			vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
7330			vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
7331		}
7332	}
7333}
7334
7335static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7336{
7337	struct vcpu_vmx *vmx = to_vmx(vcpu);
7338	struct kvm_cpuid_entry2 *best = NULL;
7339	int i;
7340
7341	for (i = 0; i < PT_CPUID_LEAVES; i++) {
7342		best = kvm_find_cpuid_entry(vcpu, 0x14, i);
7343		if (!best)
7344			return;
7345		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7346		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7347		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7348		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7349	}
7350
7351	/* Get the number of configurable Address Ranges for filtering */
7352	vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
7353						PT_CAP_num_address_ranges);
7354
7355	/* Initialize the bitmask and clear the bits that have no CPUID dependency */
7356	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7357			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
7358
7359	/*
7360	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set; otherwise
7361	 * setting it will inject a #GP.
7362	 */
7363	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7364		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7365
7366	/*
7367	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7368	 * PSBFreq can be set
7369	 */
7370	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7371		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7372				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7373
7374	/*
7375	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
7376	 * MTCFreq can be set
7377	 */
7378	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7379		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7380				RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
7381
7382	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7383	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7384		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7385							RTIT_CTL_PTW_EN);
7386
7387	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7388	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7389		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7390
7391	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7392	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7393		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7394
7395	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7396	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7397		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7398
7399	/* Unmask the ADDRn_CFG fields (4 bits each, starting at bit 32) for each supported address range */
7400	for (i = 0; i < vmx->pt_desc.addr_range; i++)
7401		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7402}
7403
7404static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7405{
7406	struct vcpu_vmx *vmx = to_vmx(vcpu);
7407
7408	/* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7409	vcpu->arch.xsaves_enabled = false;
7410
7411	if (cpu_has_secondary_exec_ctrls()) {
7412		vmx_compute_secondary_exec_control(vmx);
7413		vmcs_set_secondary_exec_control(vmx);
7414	}
7415
7416	if (nested_vmx_allowed(vcpu))
7417		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7418			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7419			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7420	else
7421		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7422			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7423			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7424
7425	if (nested_vmx_allowed(vcpu)) {
7426		nested_vmx_cr_fixed1_bits_update(vcpu);
7427		nested_vmx_entry_exit_ctls_update(vcpu);
7428	}
7429
7430	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7431			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7432		update_intel_pt_cfg(vcpu);
7433
7434	if (boot_cpu_has(X86_FEATURE_RTM)) {
7435		struct vmx_uret_msr *msr;
7436		msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7437		if (msr) {
7438			bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7439			vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7440		}
7441	}
7442
7443	set_cr4_guest_host_mask(vmx);
7444
7445	/* Refresh #PF interception to account for MAXPHYADDR changes. */
7446	update_exception_bitmap(vcpu);
7447}
7448
7449static __init void vmx_set_cpu_caps(void)
7450{
7451	kvm_set_cpu_caps();
7452
7453	/* CPUID 0x1 */
7454	if (nested)
7455		kvm_cpu_cap_set(X86_FEATURE_VMX);
7456
7457	/* CPUID 0x7 */
7458	if (kvm_mpx_supported())
7459		kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7460	if (cpu_has_vmx_invpcid())
7461		kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
7462	if (vmx_pt_mode_is_host_guest())
7463		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
7464
7465	if (vmx_umip_emulated())
7466		kvm_cpu_cap_set(X86_FEATURE_UMIP);
7467
7468	/* CPUID 0xD.1 */
7469	supported_xss = 0;
7470	if (!cpu_has_vmx_xsaves())
7471		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7472
7473	/* CPUID 0x80000001 and 0x7 (RDPID) */
7474	if (!cpu_has_vmx_rdtscp()) {
7475		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7476		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7477	}
7478
7479	if (cpu_has_vmx_waitpkg())
7480		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
7481}
7482
7483static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7484{
7485	to_vmx(vcpu)->req_immediate_exit = true;
7486}
7487
7488static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7489				  struct x86_instruction_info *info)
7490{
7491	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7492	unsigned short port;
7493	bool intercept;
7494	int size;
7495
7496	if (info->intercept == x86_intercept_in ||
7497	    info->intercept == x86_intercept_ins) {
7498		port = info->src_val;
7499		size = info->dst_bytes;
7500	} else {
7501		port = info->dst_val;
7502		size = info->src_bytes;
7503	}
7504
7505	/*
7506	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7507	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
7508	 * control.
7509	 *
7510	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7511	 */
7512	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7513		intercept = nested_cpu_has(vmcs12,
7514					   CPU_BASED_UNCOND_IO_EXITING);
7515	else
7516		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7517
7518	/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7519	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7520}
7521
7522static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7523			       struct x86_instruction_info *info,
7524			       enum x86_intercept_stage stage,
7525			       struct x86_exception *exception)
7526{
7527	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7528
7529	switch (info->intercept) {
7530	/*
7531	 * RDPID causes #UD if disabled through secondary execution controls.
7532	 * Because it is marked as EmulateOnUD, we need to intercept it here.
7533	 * Note, RDPID is hidden behind ENABLE_RDTSCP.
7534	 */
7535	case x86_intercept_rdpid:
7536		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
7537			exception->vector = UD_VECTOR;
7538			exception->error_code_valid = false;
7539			return X86EMUL_PROPAGATE_FAULT;
7540		}
7541		break;
7542
7543	case x86_intercept_in:
7544	case x86_intercept_ins:
7545	case x86_intercept_out:
7546	case x86_intercept_outs:
7547		return vmx_check_intercept_io(vcpu, info);
7548
7549	case x86_intercept_lgdt:
7550	case x86_intercept_lidt:
7551	case x86_intercept_lldt:
7552	case x86_intercept_ltr:
7553	case x86_intercept_sgdt:
7554	case x86_intercept_sidt:
7555	case x86_intercept_sldt:
7556	case x86_intercept_str:
7557		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
7558			return X86EMUL_CONTINUE;
7559
7560		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
7561		break;
7562
7563	case x86_intercept_pause:
7564		/*
7565		 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
7566		 * with vanilla NOPs in the emulator.  Apply the interception
7567		 * check only to actual PAUSE instructions.  Don't check
7568		 * PAUSE-loop-exiting, software can't expect a given PAUSE to
7569		 * exit, i.e. KVM is within its rights to allow L2 to execute
7570		 * the PAUSE.
7571		 */
7572		if ((info->rep_prefix != REPE_PREFIX) ||
7573		    !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
7574			return X86EMUL_CONTINUE;
7575
7576		break;
7577
7578	/* TODO: check more intercepts... */
7579	default:
7580		break;
7581	}
7582
7583	return X86EMUL_UNHANDLEABLE;
7584}
7585
7586#ifdef CONFIG_X86_64
7587/* (a << shift) / divisor; returns 1 on overflow, otherwise 0 */
7588static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7589				  u64 divisor, u64 *result)
7590{
7591	u64 low = a << shift, high = a >> (64 - shift);
7592
7593	/* To avoid the overflow on divq */
7594	if (high >= divisor)
7595		return 1;
7596
7597	/* low holds the result, high holds the remainder, which is discarded */
7598	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7599	    "rm" (divisor), "0" (low), "1" (high));
7600	*result = low;
7601
7602	return 0;
7603}
7604
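/*
 * Converting a guest TSC delta to a host TSC delta in vmx_set_hv_timer()
 * amounts to host_delta = guest_delta * 2^frac_bits / tsc_scaling_ratio,
 * with frac_bits = 48 on VMX (see hardware_setup()).  u64_shl_div_u64()
 * keeps the shifted 128-bit intermediate from overflowing the divide.
 */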
7605static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7606			    bool *expired)
7607{
7608	struct vcpu_vmx *vmx;
7609	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
7610	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
7611
7612	vmx = to_vmx(vcpu);
7613	tscl = rdtsc();
7614	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7615	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
7616	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
7617						    ktimer->timer_advance_ns);
7618
7619	if (delta_tsc > lapic_timer_advance_cycles)
7620		delta_tsc -= lapic_timer_advance_cycles;
7621	else
7622		delta_tsc = 0;
7623
7624	/* Convert to host delta tsc if tsc scaling is enabled */
7625	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
7626	    delta_tsc && u64_shl_div_u64(delta_tsc,
7627				kvm_tsc_scaling_ratio_frac_bits,
7628				vcpu->arch.tsc_scaling_ratio, &delta_tsc))
7629		return -ERANGE;
7630
7631	/*
7632	 * If the delta TSC doesn't fit in 32 bits after the multiplier shift,
7633	 * we can't use the VMX preemption timer.
7634	 * It's possible that it would fit on later vmentries, but checking
7635	 * on every vmentry is costly, so we just fall back to an hrtimer.
7636	 */
7637	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
7638		return -ERANGE;
7639
7640	vmx->hv_deadline_tsc = tscl + delta_tsc;
7641	*expired = !delta_tsc;
7642	return 0;
7643}
7644
7645static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
7646{
7647	to_vmx(vcpu)->hv_deadline_tsc = -1;
7648}
7649#endif
7650
7651static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
7652{
7653	if (!kvm_pause_in_guest(vcpu->kvm))
7654		shrink_ple_window(vcpu);
7655}
7656
7657static void vmx_slot_enable_log_dirty(struct kvm *kvm,
7658				     struct kvm_memory_slot *slot)
7659{
7660	if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
7661		kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
7662	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
7663}
7664
7665static void vmx_slot_disable_log_dirty(struct kvm *kvm,
7666				       struct kvm_memory_slot *slot)
7667{
7668	kvm_mmu_slot_set_dirty(kvm, slot);
7669}
7670
7671static void vmx_flush_log_dirty(struct kvm *kvm)
7672{
7673	kvm_flush_pml_buffers(kvm);
7674}
7675
7676static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
7677					   struct kvm_memory_slot *memslot,
7678					   gfn_t offset, unsigned long mask)
7679{
7680	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
7681}
7682
7683static int vmx_pre_block(struct kvm_vcpu *vcpu)
7684{
7685	if (pi_pre_block(vcpu))
7686		return 1;
7687
7688	if (kvm_lapic_hv_timer_in_use(vcpu))
7689		kvm_lapic_switch_to_sw_timer(vcpu);
7690
7691	return 0;
7692}
7693
7694static void vmx_post_block(struct kvm_vcpu *vcpu)
7695{
7696	if (kvm_x86_ops.set_hv_timer)
7697		kvm_lapic_switch_to_hv_timer(vcpu);
7698
7699	pi_post_block(vcpu);
7700}
7701
7702static void vmx_setup_mce(struct kvm_vcpu *vcpu)
7703{
7704	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
7705		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7706			FEAT_CTL_LMCE_ENABLED;
7707	else
7708		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7709			~FEAT_CTL_LMCE_ENABLED;
7710}
7711
7712static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
7713{
7714	/* we need a nested vmexit to enter SMM, postpone if run is pending */
7715	if (to_vmx(vcpu)->nested.nested_run_pending)
7716		return -EBUSY;
7717	return !is_smm(vcpu);
7718}
7719
7720static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
7721{
7722	struct vcpu_vmx *vmx = to_vmx(vcpu);
7723
7724	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
7725	if (vmx->nested.smm.guest_mode)
7726		nested_vmx_vmexit(vcpu, -1, 0, 0);
7727
7728	vmx->nested.smm.vmxon = vmx->nested.vmxon;
7729	vmx->nested.vmxon = false;
7730	vmx_clear_hlt(vcpu);
7731	return 0;
7732}
7733
7734static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
7735{
7736	struct vcpu_vmx *vmx = to_vmx(vcpu);
7737	int ret;
7738
7739	if (vmx->nested.smm.vmxon) {
7740		vmx->nested.vmxon = true;
7741		vmx->nested.smm.vmxon = false;
7742	}
7743
7744	if (vmx->nested.smm.guest_mode) {
7745		ret = nested_vmx_enter_non_root_mode(vcpu, false);
7746		if (ret)
7747			return ret;
7748
7749		vmx->nested.smm.guest_mode = false;
7750	}
7751	return 0;
7752}
7753
7754static void enable_smi_window(struct kvm_vcpu *vcpu)
7755{
7756	/* RSM will cause a vmexit anyway.  */
7757}
7758
7759static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
7760{
7761	return to_vmx(vcpu)->nested.vmxon;
7762}
7763
7764static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
7765{
7766	if (is_guest_mode(vcpu)) {
7767		struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
7768
7769		if (hrtimer_try_to_cancel(timer) == 1)
7770			hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
7771	}
7772}
7773
7774static void hardware_unsetup(void)
7775{
7776	kvm_set_posted_intr_wakeup_handler(NULL);
7777
7778	if (nested)
7779		nested_vmx_hardware_unsetup();
7780
7781	free_kvm_area();
7782}
7783
7784static bool vmx_check_apicv_inhibit_reasons(ulong bit)
7785{
7786	ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
7787			  BIT(APICV_INHIBIT_REASON_HYPERV);
7788
7789	return supported & BIT(bit);
7790}
7791
7792static struct kvm_x86_ops vmx_x86_ops __initdata = {
7793	.hardware_unsetup = hardware_unsetup,
7794
7795	.hardware_enable = hardware_enable,
7796	.hardware_disable = hardware_disable,
7797	.cpu_has_accelerated_tpr = report_flexpriority,
7798	.has_emulated_msr = vmx_has_emulated_msr,
7799
7800	.vm_size = sizeof(struct kvm_vmx),
7801	.vm_init = vmx_vm_init,
7802
7803	.vcpu_create = vmx_create_vcpu,
7804	.vcpu_free = vmx_free_vcpu,
7805	.vcpu_reset = vmx_vcpu_reset,
7806
7807	.prepare_guest_switch = vmx_prepare_switch_to_guest,
7808	.vcpu_load = vmx_vcpu_load,
7809	.vcpu_put = vmx_vcpu_put,
7810
7811	.update_exception_bitmap = update_exception_bitmap,
7812	.get_msr_feature = vmx_get_msr_feature,
7813	.get_msr = vmx_get_msr,
7814	.set_msr = vmx_set_msr,
7815	.get_segment_base = vmx_get_segment_base,
7816	.get_segment = vmx_get_segment,
7817	.set_segment = vmx_set_segment,
7818	.get_cpl = vmx_get_cpl,
7819	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
7820	.set_cr0 = vmx_set_cr0,
7821	.is_valid_cr4 = vmx_is_valid_cr4,
7822	.set_cr4 = vmx_set_cr4,
7823	.set_efer = vmx_set_efer,
7824	.get_idt = vmx_get_idt,
7825	.set_idt = vmx_set_idt,
7826	.get_gdt = vmx_get_gdt,
7827	.set_gdt = vmx_set_gdt,
7828	.set_dr7 = vmx_set_dr7,
7829	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
7830	.cache_reg = vmx_cache_reg,
7831	.get_rflags = vmx_get_rflags,
7832	.set_rflags = vmx_set_rflags,
7833
7834	.tlb_flush_all = vmx_flush_tlb_all,
7835	.tlb_flush_current = vmx_flush_tlb_current,
7836	.tlb_flush_gva = vmx_flush_tlb_gva,
7837	.tlb_flush_guest = vmx_flush_tlb_guest,
7838
7839	.run = vmx_vcpu_run,
7840	.handle_exit = vmx_handle_exit,
7841	.skip_emulated_instruction = vmx_skip_emulated_instruction,
7842	.update_emulated_instruction = vmx_update_emulated_instruction,
7843	.set_interrupt_shadow = vmx_set_interrupt_shadow,
7844	.get_interrupt_shadow = vmx_get_interrupt_shadow,
7845	.patch_hypercall = vmx_patch_hypercall,
7846	.set_irq = vmx_inject_irq,
7847	.set_nmi = vmx_inject_nmi,
7848	.queue_exception = vmx_queue_exception,
7849	.cancel_injection = vmx_cancel_injection,
7850	.interrupt_allowed = vmx_interrupt_allowed,
7851	.nmi_allowed = vmx_nmi_allowed,
7852	.get_nmi_mask = vmx_get_nmi_mask,
7853	.set_nmi_mask = vmx_set_nmi_mask,
7854	.enable_nmi_window = enable_nmi_window,
7855	.enable_irq_window = enable_irq_window,
7856	.update_cr8_intercept = update_cr8_intercept,
7857	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
7858	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
7859	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
7860	.load_eoi_exitmap = vmx_load_eoi_exitmap,
7861	.apicv_post_state_restore = vmx_apicv_post_state_restore,
7862	.check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
7863	.hwapic_irr_update = vmx_hwapic_irr_update,
7864	.hwapic_isr_update = vmx_hwapic_isr_update,
7865	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
7866	.sync_pir_to_irr = vmx_sync_pir_to_irr,
7867	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
7868	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
7869
7870	.set_tss_addr = vmx_set_tss_addr,
7871	.set_identity_map_addr = vmx_set_identity_map_addr,
7872	.get_mt_mask = vmx_get_mt_mask,
7873
7874	.get_exit_info = vmx_get_exit_info,
7875
7876	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
7877
7878	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
7879
7880	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
7881
7882	.load_mmu_pgd = vmx_load_mmu_pgd,
7883
7884	.check_intercept = vmx_check_intercept,
7885	.handle_exit_irqoff = vmx_handle_exit_irqoff,
7886
7887	.request_immediate_exit = vmx_request_immediate_exit,
7888
7889	.sched_in = vmx_sched_in,
7890
7891	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
7892	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
7893	.flush_log_dirty = vmx_flush_log_dirty,
7894	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
7895
7896	.pre_block = vmx_pre_block,
7897	.post_block = vmx_post_block,
7898
7899	.pmu_ops = &intel_pmu_ops,
7900	.nested_ops = &vmx_nested_ops,
7901
7902	.update_pi_irte = pi_update_irte,
7903
7904#ifdef CONFIG_X86_64
7905	.set_hv_timer = vmx_set_hv_timer,
7906	.cancel_hv_timer = vmx_cancel_hv_timer,
7907#endif
7908
7909	.setup_mce = vmx_setup_mce,
7910
7911	.smi_allowed = vmx_smi_allowed,
7912	.pre_enter_smm = vmx_pre_enter_smm,
7913	.pre_leave_smm = vmx_pre_leave_smm,
7914	.enable_smi_window = enable_smi_window,
7915
7916	.can_emulate_instruction = vmx_can_emulate_instruction,
7917	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
7918	.migrate_timers = vmx_migrate_timers,
7919
7920	.msr_filter_changed = vmx_msr_filter_changed,
7921};
7922
7923static __init int hardware_setup(void)
7924{
7925	unsigned long host_bndcfgs;
7926	struct desc_ptr dt;
7927	int r, i, ept_lpage_level;
7928
7929	store_idt(&dt);
7930	host_idt_base = dt.address;
7931
7932	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
7933		kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
7934
7935	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
7936		return -EIO;
7937
7938	if (boot_cpu_has(X86_FEATURE_NX))
7939		kvm_enable_efer_bits(EFER_NX);
7940
7941	if (boot_cpu_has(X86_FEATURE_MPX)) {
7942		rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7943		WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7944	}
7945
7946	if (!cpu_has_vmx_mpx())
7947		supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
7948				    XFEATURE_MASK_BNDCSR);
7949
7950	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7951	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
7952		enable_vpid = 0;
7953
7954	if (!cpu_has_vmx_ept() ||
7955	    !cpu_has_vmx_ept_4levels() ||
7956	    !cpu_has_vmx_ept_mt_wb() ||
7957	    !cpu_has_vmx_invept_global())
7958		enable_ept = 0;
7959
7960	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
7961		enable_ept_ad_bits = 0;
7962
7963	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
7964		enable_unrestricted_guest = 0;
7965
7966	if (!cpu_has_vmx_flexpriority())
7967		flexpriority_enabled = 0;
7968
7969	if (!cpu_has_virtual_nmis())
7970		enable_vnmi = 0;
7971
7972	/*
7973	 * set_apic_access_page_addr() is used to reload apic access
7974	 * page upon invalidation.  No need to do anything if not
7975	 * using the APIC_ACCESS_ADDR VMCS field.
7976	 */
7977	if (!flexpriority_enabled)
7978		vmx_x86_ops.set_apic_access_page_addr = NULL;
7979
7980	if (!cpu_has_vmx_tpr_shadow())
7981		vmx_x86_ops.update_cr8_intercept = NULL;
7982
7983#if IS_ENABLED(CONFIG_HYPERV)
7984	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
7985	    && enable_ept) {
7986		vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
7987		vmx_x86_ops.tlb_remote_flush_with_range =
7988				hv_remote_flush_tlb_with_range;
7989	}
7990#endif
7991
7992	if (!cpu_has_vmx_ple()) {
7993		ple_gap = 0;
7994		ple_window = 0;
7995		ple_window_grow = 0;
7996		ple_window_max = 0;
7997		ple_window_shrink = 0;
7998	}
7999
8000	if (!cpu_has_vmx_apicv()) {
8001		enable_apicv = 0;
8002		vmx_x86_ops.sync_pir_to_irr = NULL;
8003	}
8004
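	/*
	 * The VMX TSC multiplier is a fixed-point value with 48 fractional
	 * bits, i.e. the guest TSC is scaled as (host_tsc * ratio) >> 48
	 * before the TSC offset is added.
	 */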
8005	if (cpu_has_vmx_tsc_scaling()) {
8006		kvm_has_tsc_control = true;
8007		kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8008		kvm_tsc_scaling_ratio_frac_bits = 48;
8009	}
8010
8011	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8012
8013	if (enable_ept)
8014		vmx_enable_tdp();
8015
8016	if (!enable_ept)
8017		ept_lpage_level = 0;
8018	else if (cpu_has_vmx_ept_1g_page())
8019		ept_lpage_level = PG_LEVEL_1G;
8020	else if (cpu_has_vmx_ept_2m_page())
8021		ept_lpage_level = PG_LEVEL_2M;
8022	else
8023		ept_lpage_level = PG_LEVEL_4K;
8024	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
8025
8026	/*
8027	 * Only enable PML when hardware supports PML feature, and both EPT
8028	 * and EPT A/D bit features are enabled -- PML depends on them to work.
8029	 */
8030	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8031		enable_pml = 0;
8032
8033	if (!enable_pml) {
8034		vmx_x86_ops.slot_enable_log_dirty = NULL;
8035		vmx_x86_ops.slot_disable_log_dirty = NULL;
8036		vmx_x86_ops.flush_log_dirty = NULL;
8037		vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
8038	}
8039
8040	if (!cpu_has_vmx_preemption_timer())
8041		enable_preemption_timer = false;
8042
8043	if (enable_preemption_timer) {
8044		u64 use_timer_freq = 5000ULL * 1000 * 1000;
8045		u64 vmx_msr;
8046
8047		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
8048		cpu_preemption_timer_multi =
8049			vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8050
8051		if (tsc_khz)
8052			use_timer_freq = (u64)tsc_khz * 1000;
8053		use_timer_freq >>= cpu_preemption_timer_multi;
8054
8055		/*
8056		 * KVM "disables" the preemption timer by setting it to its max
8057		 * value.  Don't use the timer if it might cause spurious exits
8058		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
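		 *
		 * E.g. with a 2 GHz TSC and a rate multiplier of 5 (the timer
		 * ticks once per 32 TSC cycles), use_timer_freq is
		 * 2e9 >> 5 = 62.5 MHz, well under the ~429 MHz
		 * (0xffffffff / 10) cutoff, so the timer stays usable.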
8059		 */
8060		if (use_timer_freq > 0xffffffffu / 10)
8061			enable_preemption_timer = false;
8062	}
8063
8064	if (!enable_preemption_timer) {
8065		vmx_x86_ops.set_hv_timer = NULL;
8066		vmx_x86_ops.cancel_hv_timer = NULL;
8067		vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
8068	}
8069
8070	kvm_mce_cap_supported |= MCG_LMCE_P;
8071
8072	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8073		return -EINVAL;
8074	if (!enable_ept || !cpu_has_vmx_intel_pt())
8075		pt_mode = PT_MODE_SYSTEM;
8076
8077	if (nested) {
8078		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
8079					   vmx_capability.ept);
8080
8081		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8082		if (r)
8083			return r;
8084	}
8085
8086	vmx_set_cpu_caps();
8087
8088	r = alloc_kvm_area();
8089	if (r)
8090		nested_vmx_hardware_unsetup();
8091
8092	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8093
8094	return r;
8095}
8096
8097static struct kvm_x86_init_ops vmx_init_ops __initdata = {
8098	.cpu_has_kvm_support = cpu_has_kvm_support,
8099	.disabled_by_bios = vmx_disabled_by_bios,
8100	.check_processor_compatibility = vmx_check_processor_compat,
8101	.hardware_setup = hardware_setup,
8102	.intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest,
8103
8104	.runtime_ops = &vmx_x86_ops,
8105};
8106
8107static void vmx_cleanup_l1d_flush(void)
8108{
8109	if (vmx_l1d_flush_pages) {
8110		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8111		vmx_l1d_flush_pages = NULL;
8112	}
8113	/* Restore state so sysfs ignores VMX */
8114	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8115}
8116
8117static void vmx_exit(void)
8118{
8119#ifdef CONFIG_KEXEC_CORE
8120	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8121	synchronize_rcu();
8122#endif
8123
8124	kvm_exit();
8125
8126#if IS_ENABLED(CONFIG_HYPERV)
8127	if (static_branch_unlikely(&enable_evmcs)) {
8128		int cpu;
8129		struct hv_vp_assist_page *vp_ap;
8130		/*
8131		 * Reset everything to support using non-enlightened VMCS
8132		 * access later (e.g. when we reload the module with
8133		 * enlightened_vmcs=0)
8134		 */
8135		for_each_online_cpu(cpu) {
8136			vp_ap =	hv_get_vp_assist_page(cpu);
8137
8138			if (!vp_ap)
8139				continue;
8140
8141			vp_ap->nested_control.features.directhypercall = 0;
8142			vp_ap->current_nested_vmcs = 0;
8143			vp_ap->enlighten_vmentry = 0;
8144		}
8145
8146		static_branch_disable(&enable_evmcs);
8147	}
8148#endif
8149	vmx_cleanup_l1d_flush();
8150}
8151module_exit(vmx_exit);
8152
8153static int __init vmx_init(void)
8154{
8155	int r, cpu;
8156
8157#if IS_ENABLED(CONFIG_HYPERV)
8158	/*
8159	 * Enlightened VMCS usage must be recommended by Hyper-V and the host
8160	 * needs to support eVMCS v1 or above.  eVMCS support can also be
8161	 * disabled via the 'enlightened_vmcs' module parameter.
8162	 */
8163	if (enlightened_vmcs &&
8164	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
8165	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
8166	    KVM_EVMCS_VERSION) {
8167		int cpu;
8168
8169		/* Check that we have assist pages on all online CPUs */
8170		for_each_online_cpu(cpu) {
8171			if (!hv_get_vp_assist_page(cpu)) {
8172				enlightened_vmcs = false;
8173				break;
8174			}
8175		}
8176
8177		if (enlightened_vmcs) {
8178			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
8179			static_branch_enable(&enable_evmcs);
8180		}
8181
8182		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
8183			vmx_x86_ops.enable_direct_tlbflush
8184				= hv_enable_direct_tlbflush;
8185
8186	} else {
8187		enlightened_vmcs = false;
8188	}
8189#endif
8190
8191	r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
8192		     __alignof__(struct vcpu_vmx), THIS_MODULE);
8193	if (r)
8194		return r;
8195
8196	/*
8197	 * Must be called after kvm_init() so enable_ept is properly set
8198	 * up.  Pass in the mitigation value that was stored by the
8199	 * pre-module-init parameter parser.  If no parameter was given, it
8200	 * will contain 'auto', which will be turned into the default 'cond'
8201	 * mitigation mode.
8202	 */
8203	r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8204	if (r) {
8205		vmx_exit();
8206		return r;
8207	}
8208
8209	vmx_setup_fb_clear_ctrl();
8210
8211	for_each_possible_cpu(cpu) {
8212		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8213
8214		pi_init_cpu(cpu);
8215	}
8216
8217#ifdef CONFIG_KEXEC_CORE
8218	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8219			   crash_vmclear_local_loaded_vmcss);
8220#endif
8221	vmx_check_vmcs12_offsets();
8222
8223	/*
8224	 * Shadow paging doesn't have a (further) performance penalty
8225	 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it
8226	 * by default.
8227	 */
8228	if (!enable_ept)
8229		allow_smaller_maxphyaddr = true;
8230
8231	return 0;
8232}
8233module_init(vmx_init);
8234