xref: /kernel/linux/linux-5.10/arch/x86/kvm/x86.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * derived from drivers/kvm/kvm_main.c
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright (C) 2008 Qumranet, Inc.
9 * Copyright IBM Corporation, 2008
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 *
12 * Authors:
13 *   Avi Kivity   <avi@qumranet.com>
14 *   Yaniv Kamay  <yaniv@qumranet.com>
15 *   Amit Shah    <amit.shah@qumranet.com>
16 *   Ben-Ami Yassour <benami@il.ibm.com>
17 */
18
19#include <linux/kvm_host.h>
20#include "irq.h"
21#include "ioapic.h"
22#include "mmu.h"
23#include "i8254.h"
24#include "tss.h"
25#include "kvm_cache_regs.h"
26#include "kvm_emulate.h"
27#include "x86.h"
28#include "cpuid.h"
29#include "pmu.h"
30#include "hyperv.h"
31#include "lapic.h"
32
33#include <linux/clocksource.h>
34#include <linux/interrupt.h>
35#include <linux/kvm.h>
36#include <linux/fs.h>
37#include <linux/vmalloc.h>
38#include <linux/export.h>
39#include <linux/moduleparam.h>
40#include <linux/mman.h>
41#include <linux/highmem.h>
42#include <linux/iommu.h>
43#include <linux/intel-iommu.h>
44#include <linux/cpufreq.h>
45#include <linux/user-return-notifier.h>
46#include <linux/srcu.h>
47#include <linux/slab.h>
48#include <linux/perf_event.h>
49#include <linux/uaccess.h>
50#include <linux/hash.h>
51#include <linux/pci.h>
52#include <linux/timekeeper_internal.h>
53#include <linux/pvclock_gtod.h>
54#include <linux/kvm_irqfd.h>
55#include <linux/irqbypass.h>
56#include <linux/sched/stat.h>
57#include <linux/sched/isolation.h>
58#include <linux/mem_encrypt.h>
59#include <linux/entry-kvm.h>
60
61#include <trace/events/kvm.h>
62
63#include <asm/debugreg.h>
64#include <asm/msr.h>
65#include <asm/desc.h>
66#include <asm/mce.h>
67#include <linux/kernel_stat.h>
68#include <asm/fpu/internal.h> /* Ugh! */
69#include <asm/pvclock.h>
70#include <asm/div64.h>
71#include <asm/irq_remapping.h>
72#include <asm/mshyperv.h>
73#include <asm/hypervisor.h>
74#include <asm/tlbflush.h>
75#include <asm/intel_pt.h>
76#include <asm/emulate_prefix.h>
77#include <clocksource/hyperv_timer.h>
78
79#define CREATE_TRACE_POINTS
80#include "trace.h"
81
82#define MAX_IO_MSRS 256
83#define KVM_MAX_MCE_BANKS 32
84u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
85EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
86
87#define emul_to_vcpu(ctxt) \
88	((struct kvm_vcpu *)(ctxt)->vcpu)
89
90/* EFER defaults:
91 * - enable syscall per default because it's emulated by KVM
92 * - enable LME and LMA per default on 64 bit KVM
93 */
94#ifdef CONFIG_X86_64
95static
96u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
97#else
98static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
99#endif
100
101static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
102
103#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
104                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
105
106static void update_cr8_intercept(struct kvm_vcpu *vcpu);
107static void process_nmi(struct kvm_vcpu *vcpu);
108static void process_smi(struct kvm_vcpu *vcpu);
109static void enter_smm(struct kvm_vcpu *vcpu);
110static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
111static void store_regs(struct kvm_vcpu *vcpu);
112static int sync_regs(struct kvm_vcpu *vcpu);
113
114struct kvm_x86_ops kvm_x86_ops __read_mostly;
115EXPORT_SYMBOL_GPL(kvm_x86_ops);
116
117static bool __read_mostly ignore_msrs = 0;
118module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
119
120static bool __read_mostly report_ignored_msrs = true;
121module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
122
123unsigned int min_timer_period_us = 200;
124module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
125
126static bool __read_mostly kvmclock_periodic_sync = true;
127module_param(kvmclock_periodic_sync, bool, S_IRUGO);
128
129bool __read_mostly kvm_has_tsc_control;
130EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
131u32  __read_mostly kvm_max_guest_tsc_khz;
132EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
133u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
134EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
135u64  __read_mostly kvm_max_tsc_scaling_ratio;
136EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
137u64 __read_mostly kvm_default_tsc_scaling_ratio;
138EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
139
140/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
141static u32 __read_mostly tsc_tolerance_ppm = 250;
142module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
143
144/*
145 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
146 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
147 * advancement entirely.  Any other value is used as-is and disables adaptive
148 * tuning, i.e. allows privileged userspace to set an exact advancement time.
149 */
150static int __read_mostly lapic_timer_advance_ns = -1;
151module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
152
153static bool __read_mostly vector_hashing = true;
154module_param(vector_hashing, bool, S_IRUGO);
155
156bool __read_mostly enable_vmware_backdoor = false;
157module_param(enable_vmware_backdoor, bool, S_IRUGO);
158EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
159
160static bool __read_mostly force_emulation_prefix = false;
161module_param(force_emulation_prefix, bool, S_IRUGO);
162
163int __read_mostly pi_inject_timer = -1;
164module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
165
166/*
167 * Restoring the host value for MSRs that are only consumed when running in
168 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
169 * returns to userspace, i.e. the kernel can run with the guest's value.
170 */
171#define KVM_MAX_NR_USER_RETURN_MSRS 16
172
173struct kvm_user_return_msrs_global {
174	int nr;
175	u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
176};
177
178struct kvm_user_return_msrs {
179	struct user_return_notifier urn;
180	bool registered;
181	struct kvm_user_return_msr_values {
182		u64 host;
183		u64 curr;
184	} values[KVM_MAX_NR_USER_RETURN_MSRS];
185};
186
187static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
188static struct kvm_user_return_msrs __percpu *user_return_msrs;
189
190#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
191				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
192				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
193				| XFEATURE_MASK_PKRU)
194
195u64 __read_mostly host_efer;
196EXPORT_SYMBOL_GPL(host_efer);
197
198bool __read_mostly allow_smaller_maxphyaddr = 0;
199EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
200
201static u64 __read_mostly host_xss;
202u64 __read_mostly supported_xss;
203EXPORT_SYMBOL_GPL(supported_xss);
204
205struct kvm_stats_debugfs_item debugfs_entries[] = {
206	VCPU_STAT("pf_fixed", pf_fixed),
207	VCPU_STAT("pf_guest", pf_guest),
208	VCPU_STAT("tlb_flush", tlb_flush),
209	VCPU_STAT("invlpg", invlpg),
210	VCPU_STAT("exits", exits),
211	VCPU_STAT("io_exits", io_exits),
212	VCPU_STAT("mmio_exits", mmio_exits),
213	VCPU_STAT("signal_exits", signal_exits),
214	VCPU_STAT("irq_window", irq_window_exits),
215	VCPU_STAT("nmi_window", nmi_window_exits),
216	VCPU_STAT("halt_exits", halt_exits),
217	VCPU_STAT("halt_successful_poll", halt_successful_poll),
218	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
219	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
220	VCPU_STAT("halt_wakeup", halt_wakeup),
221	VCPU_STAT("hypercalls", hypercalls),
222	VCPU_STAT("request_irq", request_irq_exits),
223	VCPU_STAT("irq_exits", irq_exits),
224	VCPU_STAT("host_state_reload", host_state_reload),
225	VCPU_STAT("fpu_reload", fpu_reload),
226	VCPU_STAT("insn_emulation", insn_emulation),
227	VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
228	VCPU_STAT("irq_injections", irq_injections),
229	VCPU_STAT("nmi_injections", nmi_injections),
230	VCPU_STAT("req_event", req_event),
231	VCPU_STAT("l1d_flush", l1d_flush),
232	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
233	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
234	VCPU_STAT("preemption_reported", preemption_reported),
235	VCPU_STAT("preemption_other", preemption_other),
236	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
237	VM_STAT("mmu_pte_write", mmu_pte_write),
238	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
239	VM_STAT("mmu_flooded", mmu_flooded),
240	VM_STAT("mmu_recycled", mmu_recycled),
241	VM_STAT("mmu_cache_miss", mmu_cache_miss),
242	VM_STAT("mmu_unsync", mmu_unsync),
243	VM_STAT("remote_tlb_flush", remote_tlb_flush),
244	VM_STAT("largepages", lpages, .mode = 0444),
245	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
246	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
247	{ NULL }
248};
249
250u64 __read_mostly host_xcr0;
251u64 __read_mostly supported_xcr0;
252EXPORT_SYMBOL_GPL(supported_xcr0);
253
254static struct kmem_cache *x86_fpu_cache;
255
256static struct kmem_cache *x86_emulator_cache;
257
258/*
259 * Called when the previous get/set msr reached an invalid (unhandled) msr.
260 * Return true if we want to ignore/silence this failed msr access.
261 */
262static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
263				  u64 data, bool write)
264{
265	const char *op = write ? "wrmsr" : "rdmsr";
266
267	if (ignore_msrs) {
268		if (report_ignored_msrs)
269			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
270				      op, msr, data);
271		/* Mask the error */
272		return true;
273	} else {
274		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
275				      op, msr, data);
276		return false;
277	}
278}
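
/*
 * Illustrative example of the policy above (values hypothetical): with
 * ignore_msrs=1, a guest RDMSR of an MSR that KVM does not implement reads
 * back as 0 instead of raising #GP, and the access is logged via
 * kvm_pr_unimpl() unless report_ignored_msrs=0.  With ignore_msrs=0 (the
 * default), the access is rate-limit logged and the guest takes a #GP,
 * unless userspace has asked to handle such accesses itself.
 */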
279
280static struct kmem_cache *kvm_alloc_emulator_cache(void)
281{
282	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
283	unsigned int size = sizeof(struct x86_emulate_ctxt);
284
285	return kmem_cache_create_usercopy("x86_emulator", size,
286					  __alignof__(struct x86_emulate_ctxt),
287					  SLAB_ACCOUNT, useroffset,
288					  size - useroffset, NULL);
289}
290
291static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
292
293static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
294{
295	int i;
296	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
297		vcpu->arch.apf.gfns[i] = ~0;
298}
299
300static void kvm_on_user_return(struct user_return_notifier *urn)
301{
302	unsigned slot;
303	struct kvm_user_return_msrs *msrs
304		= container_of(urn, struct kvm_user_return_msrs, urn);
305	struct kvm_user_return_msr_values *values;
306	unsigned long flags;
307
308	/*
309	 * Disabling irqs at this point since the following code could be
310	 * interrupted and executed through kvm_arch_hardware_disable()
311	 */
312	local_irq_save(flags);
313	if (msrs->registered) {
314		msrs->registered = false;
315		user_return_notifier_unregister(urn);
316	}
317	local_irq_restore(flags);
318	for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
319		values = &msrs->values[slot];
320		if (values->host != values->curr) {
321			wrmsrl(user_return_msrs_global.msrs[slot], values->host);
322			values->curr = values->host;
323		}
324	}
325}
326
327int kvm_probe_user_return_msr(u32 msr)
328{
329	u64 val;
330	int ret;
331
332	preempt_disable();
333	ret = rdmsrl_safe(msr, &val);
334	if (ret)
335		goto out;
336	ret = wrmsrl_safe(msr, val);
337out:
338	preempt_enable();
339	return ret;
340}
341EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);
342
343void kvm_define_user_return_msr(unsigned slot, u32 msr)
344{
345	BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
346	user_return_msrs_global.msrs[slot] = msr;
347	if (slot >= user_return_msrs_global.nr)
348		user_return_msrs_global.nr = slot + 1;
349}
350EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
351
352static void kvm_user_return_msr_cpu_online(void)
353{
354	unsigned int cpu = smp_processor_id();
355	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
356	u64 value;
357	int i;
358
359	for (i = 0; i < user_return_msrs_global.nr; ++i) {
360		rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
361		msrs->values[i].host = value;
362		msrs->values[i].curr = value;
363	}
364}
365
366int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
367{
368	unsigned int cpu = smp_processor_id();
369	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
370	int err;
371
372	value = (value & mask) | (msrs->values[slot].host & ~mask);
373	if (value == msrs->values[slot].curr)
374		return 0;
375	err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
376	if (err)
377		return 1;
378
379	msrs->values[slot].curr = value;
380	if (!msrs->registered) {
381		msrs->urn.on_user_return = kvm_on_user_return;
382		user_return_notifier_register(&msrs->urn);
383		msrs->registered = true;
384	}
385	return 0;
386}
387EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
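
/*
 * Illustrative sketch (not part of the original file): how vendor code such
 * as vmx.c/svm.c typically uses the user-return MSR machinery above.  The
 * slot number and the choice of MSR_TSC_AUX here are hypothetical.
 */
static void __maybe_unused example_user_return_msr_usage(u64 guest_val)
{
	/* Hardware-setup time: claim slot 0 and record the MSR it tracks. */
	kvm_define_user_return_msr(0, MSR_TSC_AUX);

	/*
	 * vCPU-run time: load the guest value.  kvm_on_user_return() lazily
	 * restores the host value when this CPU next returns to userspace.
	 */
	kvm_set_user_return_msr(0, guest_val, -1ull);
}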
388
389static void drop_user_return_notifiers(void)
390{
391	unsigned int cpu = smp_processor_id();
392	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
393
394	if (msrs->registered)
395		kvm_on_user_return(&msrs->urn);
396}
397
398u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
399{
400	return vcpu->arch.apic_base;
401}
402EXPORT_SYMBOL_GPL(kvm_get_apic_base);
403
404enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
405{
406	return kvm_apic_mode(kvm_get_apic_base(vcpu));
407}
408EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
409
410int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
411{
412	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
413	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
414	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
415		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
416
417	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
418		return 1;
419	if (!msr_info->host_initiated) {
420		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
421			return 1;
422		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
423			return 1;
424	}
425
426	kvm_lapic_set_base(vcpu, msr_info->data);
427	kvm_recalculate_apic_map(vcpu->kvm);
428	return 0;
429}
430EXPORT_SYMBOL_GPL(kvm_set_apic_base);
431
432asmlinkage __visible noinstr void kvm_spurious_fault(void)
433{
434	/* Fault while not rebooting.  We want the trace. */
435	BUG_ON(!kvm_rebooting);
436}
437EXPORT_SYMBOL_GPL(kvm_spurious_fault);
438
439#define EXCPT_BENIGN		0
440#define EXCPT_CONTRIBUTORY	1
441#define EXCPT_PF		2
442
443static int exception_class(int vector)
444{
445	switch (vector) {
446	case PF_VECTOR:
447		return EXCPT_PF;
448	case DE_VECTOR:
449	case TS_VECTOR:
450	case NP_VECTOR:
451	case SS_VECTOR:
452	case GP_VECTOR:
453		return EXCPT_CONTRIBUTORY;
454	default:
455		break;
456	}
457	return EXCPT_BENIGN;
458}
459
460#define EXCPT_FAULT		0
461#define EXCPT_TRAP		1
462#define EXCPT_ABORT		2
463#define EXCPT_INTERRUPT		3
464#define EXCPT_DB		4
465
466static int exception_type(int vector)
467{
468	unsigned int mask;
469
470	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
471		return EXCPT_INTERRUPT;
472
473	mask = 1 << vector;
474
475	/*
476	 * #DBs can be trap-like or fault-like, the caller must check other CPU
477	 * state, e.g. DR6, to determine whether a #DB is a trap or fault.
478	 */
479	if (mask & (1 << DB_VECTOR))
480		return EXCPT_DB;
481
482	if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
483		return EXCPT_TRAP;
484
485	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
486		return EXCPT_ABORT;
487
488	/* Reserved exceptions will result in fault */
489	return EXCPT_FAULT;
490}
491
492void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
493{
494	unsigned nr = vcpu->arch.exception.nr;
495	bool has_payload = vcpu->arch.exception.has_payload;
496	unsigned long payload = vcpu->arch.exception.payload;
497
498	if (!has_payload)
499		return;
500
501	switch (nr) {
502	case DB_VECTOR:
503		/*
504		 * "Certain debug exceptions may clear bits 0-3.  The
505		 * remaining contents of the DR6 register are never
506		 * cleared by the processor".
507		 */
508		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
509		/*
510		 * DR6.RTM is set by all #DB exceptions that don't clear it.
511		 */
512		vcpu->arch.dr6 |= DR6_RTM;
513		vcpu->arch.dr6 |= payload;
514		/*
515		 * Bit 16 should be set in the payload whenever the #DB
516		 * exception should clear DR6.RTM. This makes the payload
517		 * compatible with the pending debug exceptions under VMX.
518		 * Though not currently documented in the SDM, this also
519		 * makes the payload compatible with the exit qualification
520		 * for #DB exceptions under VMX.
521		 */
522		vcpu->arch.dr6 ^= payload & DR6_RTM;
523
524		/*
525		 * The #DB payload is defined as compatible with the 'pending
526		 * debug exceptions' field under VMX, not DR6. While bit 12 is
527		 * defined in the 'pending debug exceptions' field (enabled
528		 * breakpoint), it is reserved and must be zero in DR6.
529		 */
530		vcpu->arch.dr6 &= ~BIT(12);
531		break;
532	case PF_VECTOR:
533		vcpu->arch.cr2 = payload;
534		break;
535	}
536
537	vcpu->arch.exception.has_payload = false;
538	vcpu->arch.exception.payload = 0;
539}
540EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
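
/*
 * Worked example (illustrative): a single-step #DB queues a payload of
 * DR6_BS.  The code above then folds that into the guest's DR6 as
 * (DR6 & ~DR_TRAP_BITS) | DR6_RTM | DR6_BS, i.e. the trap bits are replaced
 * by the payload while DR6.RTM stays set because the payload does not have
 * bit 16 set.
 */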
541
542static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
543		unsigned nr, bool has_error, u32 error_code,
544	        bool has_payload, unsigned long payload, bool reinject)
545{
546	u32 prev_nr;
547	int class1, class2;
548
549	kvm_make_request(KVM_REQ_EVENT, vcpu);
550
551	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
552	queue:
553		if (reinject) {
554			/*
555			 * On vmentry, vcpu->arch.exception.pending is only
556			 * true if an event injection was blocked by
557			 * nested_run_pending.  In that case, however,
558			 * vcpu_enter_guest requests an immediate exit,
559			 * and the guest shouldn't proceed far enough to
560			 * need reinjection.
561			 */
562			WARN_ON_ONCE(vcpu->arch.exception.pending);
563			vcpu->arch.exception.injected = true;
564			if (WARN_ON_ONCE(has_payload)) {
565				/*
566				 * A reinjected event has already
567				 * delivered its payload.
568				 */
569				has_payload = false;
570				payload = 0;
571			}
572		} else {
573			vcpu->arch.exception.pending = true;
574			vcpu->arch.exception.injected = false;
575		}
576		vcpu->arch.exception.has_error_code = has_error;
577		vcpu->arch.exception.nr = nr;
578		vcpu->arch.exception.error_code = error_code;
579		vcpu->arch.exception.has_payload = has_payload;
580		vcpu->arch.exception.payload = payload;
581		if (!is_guest_mode(vcpu))
582			kvm_deliver_exception_payload(vcpu);
583		return;
584	}
585
586	/* An exception is already pending or injected; check for escalation. */
587	prev_nr = vcpu->arch.exception.nr;
588	if (prev_nr == DF_VECTOR) {
589		/* triple fault -> shutdown */
590		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
591		return;
592	}
593	class1 = exception_class(prev_nr);
594	class2 = exception_class(nr);
595	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
596		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
597		/*
598		 * Generate double fault per SDM Table 5-5.  Set
599		 * exception.pending = true so that the double fault
600		 * can trigger a nested vmexit.
601		 */
602		vcpu->arch.exception.pending = true;
603		vcpu->arch.exception.injected = false;
604		vcpu->arch.exception.has_error_code = true;
605		vcpu->arch.exception.nr = DF_VECTOR;
606		vcpu->arch.exception.error_code = 0;
607		vcpu->arch.exception.has_payload = false;
608		vcpu->arch.exception.payload = 0;
609	} else
610		/* replace previous exception with a new one in the hope
611		   that instruction re-execution will regenerate the lost
612		   exception */
613		goto queue;
614}
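
/*
 * Worked example of the logic above (illustrative): if a #PF is pending and
 * delivering it raises a #GP, the two combine into a #DF per SDM Table 5-5
 * (class PF + contributory).  If the pending exception is benign, e.g. a #DB,
 * the new exception simply replaces it via the "queue" label.
 */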
615
616void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
617{
618	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
619}
620EXPORT_SYMBOL_GPL(kvm_queue_exception);
621
622void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
623{
624	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
625}
626EXPORT_SYMBOL_GPL(kvm_requeue_exception);
627
628void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
629			   unsigned long payload)
630{
631	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
632}
633EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
634
635static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
636				    u32 error_code, unsigned long payload)
637{
638	kvm_multiple_exception(vcpu, nr, true, error_code,
639			       true, payload, false);
640}
641
642int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
643{
644	if (err)
645		kvm_inject_gp(vcpu, 0);
646	else
647		return kvm_skip_emulated_instruction(vcpu);
648
649	return 1;
650}
651EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
652
653void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
654{
655	++vcpu->stat.pf_guest;
656	vcpu->arch.exception.nested_apf =
657		is_guest_mode(vcpu) && fault->async_page_fault;
658	if (vcpu->arch.exception.nested_apf) {
659		vcpu->arch.apf.nested_apf_token = fault->address;
660		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
661	} else {
662		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
663					fault->address);
664	}
665}
666EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
667
668bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
669				    struct x86_exception *fault)
670{
671	struct kvm_mmu *fault_mmu;
672	WARN_ON_ONCE(fault->vector != PF_VECTOR);
673
674	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
675					       vcpu->arch.walk_mmu;
676
677	/*
678	 * Invalidate the TLB entry for the faulting address, if it exists,
679	 * else the access will fault indefinitely (and to match hardware behavior).
680	 */
681	if ((fault->error_code & PFERR_PRESENT_MASK) &&
682	    !(fault->error_code & PFERR_RSVD_MASK))
683		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
684				       fault_mmu->root_hpa);
685
686	fault_mmu->inject_page_fault(vcpu, fault);
687	return fault->nested_page_fault;
688}
689EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
690
691void kvm_inject_nmi(struct kvm_vcpu *vcpu)
692{
693	atomic_inc(&vcpu->arch.nmi_queued);
694	kvm_make_request(KVM_REQ_NMI, vcpu);
695}
696EXPORT_SYMBOL_GPL(kvm_inject_nmi);
697
698void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
699{
700	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
701}
702EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
703
704void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
705{
706	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
707}
708EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
709
710/*
711 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
712 * a #GP and return false.
713 */
714bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
715{
716	if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
717		return true;
718	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
719	return false;
720}
721EXPORT_SYMBOL_GPL(kvm_require_cpl);
722
723bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
724{
725	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
726		return true;
727
728	kvm_queue_exception(vcpu, UD_VECTOR);
729	return false;
730}
731EXPORT_SYMBOL_GPL(kvm_require_dr);
732
733/*
734 * This function is used to read from the physical memory of the currently
735 * running guest. Unlike kvm_vcpu_read_guest_page, this function can read
736 * from guest physical memory or from the guest's (nested) guest physical memory.
737 */
738int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
739			    gfn_t ngfn, void *data, int offset, int len,
740			    u32 access)
741{
742	struct x86_exception exception;
743	gfn_t real_gfn;
744	gpa_t ngpa;
745
746	ngpa     = gfn_to_gpa(ngfn);
747	real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
748	if (real_gfn == UNMAPPED_GVA)
749		return -EFAULT;
750
751	real_gfn = gpa_to_gfn(real_gfn);
752
753	return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
754}
755EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
756
757static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
758			       void *data, int offset, int len, u32 access)
759{
760	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
761				       data, offset, len, access);
762}
763
764static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
765{
766	return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
767	       rsvd_bits(1, 2);
768}
769
770/*
771 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
772 */
773int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
774{
775	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
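	/*
	 * In PAE mode, CR3 bits 11:5 locate the 32-byte-aligned PDPT within
	 * its page.  "(cr3 & (PAGE_SIZE-1)) >> 5 << 2" turns that byte offset
	 * into an index in 64-bit entries, which is scaled back to bytes
	 * ("offset * sizeof(u64)") for the guest read below.
	 */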
776	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
777	int i;
778	int ret;
779	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
780
781	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
782				      offset * sizeof(u64), sizeof(pdpte),
783				      PFERR_USER_MASK|PFERR_WRITE_MASK);
784	if (ret < 0) {
785		ret = 0;
786		goto out;
787	}
788	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
789		if ((pdpte[i] & PT_PRESENT_MASK) &&
790		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
791			ret = 0;
792			goto out;
793		}
794	}
795	ret = 1;
796
797	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
798	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
799
800out:
801
802	return ret;
803}
804EXPORT_SYMBOL_GPL(load_pdptrs);
805
806bool pdptrs_changed(struct kvm_vcpu *vcpu)
807{
808	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
809	int offset;
810	gfn_t gfn;
811	int r;
812
813	if (!is_pae_paging(vcpu))
814		return false;
815
816	if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
817		return true;
818
819	gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
820	offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
821	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
822				       PFERR_USER_MASK | PFERR_WRITE_MASK);
823	if (r < 0)
824		return true;
825
826	return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
827}
828EXPORT_SYMBOL_GPL(pdptrs_changed);
829
830int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
831{
832	unsigned long old_cr0 = kvm_read_cr0(vcpu);
833	unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
834	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
835
836	cr0 |= X86_CR0_ET;
837
838#ifdef CONFIG_X86_64
839	if (cr0 & 0xffffffff00000000UL)
840		return 1;
841#endif
842
843	cr0 &= ~CR0_RESERVED_BITS;
844
845	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
846		return 1;
847
848	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
849		return 1;
850
851#ifdef CONFIG_X86_64
852	if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
853	    (cr0 & X86_CR0_PG)) {
854		int cs_db, cs_l;
855
856		if (!is_pae(vcpu))
857			return 1;
858		kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
859		if (cs_l)
860			return 1;
861	}
862#endif
863	if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
864	    is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
865	    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
866		return 1;
867
868	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
869		return 1;
870
871	kvm_x86_ops.set_cr0(vcpu, cr0);
872
873	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
874		kvm_clear_async_pf_completion_queue(vcpu);
875		kvm_async_pf_hash_reset(vcpu);
876	}
877
878	if ((cr0 ^ old_cr0) & update_bits)
879		kvm_mmu_reset_context(vcpu);
880
881	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
882	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
883	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
884		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
885
886	return 0;
887}
888EXPORT_SYMBOL_GPL(kvm_set_cr0);
889
890void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
891{
892	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
893}
894EXPORT_SYMBOL_GPL(kvm_lmsw);
895
896void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
897{
898	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
899
900		if (vcpu->arch.xcr0 != host_xcr0)
901			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
902
903		if (vcpu->arch.xsaves_enabled &&
904		    vcpu->arch.ia32_xss != host_xss)
905			wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
906	}
907
908	if (static_cpu_has(X86_FEATURE_PKU) &&
909	    (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
910	     (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
911	    vcpu->arch.pkru != vcpu->arch.host_pkru)
912		__write_pkru(vcpu->arch.pkru);
913}
914EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
915
916void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
917{
918	if (static_cpu_has(X86_FEATURE_PKU) &&
919	    (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
920	     (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
921		vcpu->arch.pkru = rdpkru();
922		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
923			__write_pkru(vcpu->arch.host_pkru);
924	}
925
926	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
927
928		if (vcpu->arch.xcr0 != host_xcr0)
929			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
930
931		if (vcpu->arch.xsaves_enabled &&
932		    vcpu->arch.ia32_xss != host_xss)
933			wrmsrl(MSR_IA32_XSS, host_xss);
934	}
935
936}
937EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
938
939static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
940{
941	u64 xcr0 = xcr;
942	u64 old_xcr0 = vcpu->arch.xcr0;
943	u64 valid_bits;
944
945	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
946	if (index != XCR_XFEATURE_ENABLED_MASK)
947		return 1;
948	if (!(xcr0 & XFEATURE_MASK_FP))
949		return 1;
950	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
951		return 1;
952
953	/*
954	 * Do not allow the guest to set bits that we do not support
955	 * saving.  However, xcr0 bit 0 is always set, even if the
956	 * emulated CPU does not support XSAVE (see fx_init).
957	 */
958	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
959	if (xcr0 & ~valid_bits)
960		return 1;
961
962	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
963	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
964		return 1;
965
966	if (xcr0 & XFEATURE_MASK_AVX512) {
967		if (!(xcr0 & XFEATURE_MASK_YMM))
968			return 1;
969		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
970			return 1;
971	}
972	vcpu->arch.xcr0 = xcr0;
973
974	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
975		kvm_update_cpuid_runtime(vcpu);
976	return 0;
977}
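
/*
 * Worked examples of the checks in __kvm_set_xcr() above (illustrative):
 * XCR0 = FP|SSE|YMM is accepted when the guest's supported xcr0 includes
 * those bits; XCR0 = FP|YMM without SSE is rejected, as is setting only one
 * of BNDREGS/BNDCSR, or AVX-512 state without YMM or without all three
 * AVX-512 components (opmask, ZMM_Hi256, Hi16_ZMM).
 */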
978
979int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
980{
981	if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
982	    __kvm_set_xcr(vcpu, index, xcr)) {
983		kvm_inject_gp(vcpu, 0);
984		return 1;
985	}
986	return 0;
987}
988EXPORT_SYMBOL_GPL(kvm_set_xcr);
989
990int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
991{
992	if (cr4 & cr4_reserved_bits)
993		return -EINVAL;
994
995	if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
996		return -EINVAL;
997
998	if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4))
999		return -EINVAL;
1000
1001	return 0;
1002}
1003EXPORT_SYMBOL_GPL(kvm_valid_cr4);
1004
1005int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1006{
1007	unsigned long old_cr4 = kvm_read_cr4(vcpu);
1008	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
1009				   X86_CR4_SMEP;
1010	unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
1011
1012	if (kvm_valid_cr4(vcpu, cr4))
1013		return 1;
1014
1015	if (is_long_mode(vcpu)) {
1016		if (!(cr4 & X86_CR4_PAE))
1017			return 1;
1018		if ((cr4 ^ old_cr4) & X86_CR4_LA57)
1019			return 1;
1020	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
1021		   && ((cr4 ^ old_cr4) & pdptr_bits)
1022		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
1023				   kvm_read_cr3(vcpu)))
1024		return 1;
1025
1026	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
1027		if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
1028			return 1;
1029
1030		/* PCID cannot be enabled when cr3[11:0] != 000H or EFER.LMA = 0 */
1031		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1032			return 1;
1033	}
1034
1035	kvm_x86_ops.set_cr4(vcpu, cr4);
1036
1037	if (((cr4 ^ old_cr4) & mmu_role_bits) ||
1038	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
1039		kvm_mmu_reset_context(vcpu);
1040
1041	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1042		kvm_update_cpuid_runtime(vcpu);
1043
1044	return 0;
1045}
1046EXPORT_SYMBOL_GPL(kvm_set_cr4);
1047
1048int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1049{
1050	bool skip_tlb_flush = false;
1051#ifdef CONFIG_X86_64
1052	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
1053
1054	if (pcid_enabled) {
1055		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
1056		cr3 &= ~X86_CR3_PCID_NOFLUSH;
1057	}
1058#endif
1059
1060	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
1061		if (!skip_tlb_flush) {
1062			kvm_mmu_sync_roots(vcpu);
1063			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1064		}
1065		return 0;
1066	}
1067
1068	if (is_long_mode(vcpu) &&
1069	    (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
1070		return 1;
1071	else if (is_pae_paging(vcpu) &&
1072		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
1073		return 1;
1074
1075	kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
1076	vcpu->arch.cr3 = cr3;
1077	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1078
1079	return 0;
1080}
1081EXPORT_SYMBOL_GPL(kvm_set_cr3);
1082
1083int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
1084{
1085	if (cr8 & CR8_RESERVED_BITS)
1086		return 1;
1087	if (lapic_in_kernel(vcpu))
1088		kvm_lapic_set_tpr(vcpu, cr8);
1089	else
1090		vcpu->arch.cr8 = cr8;
1091	return 0;
1092}
1093EXPORT_SYMBOL_GPL(kvm_set_cr8);
1094
1095unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
1096{
1097	if (lapic_in_kernel(vcpu))
1098		return kvm_lapic_get_cr8(vcpu);
1099	else
1100		return vcpu->arch.cr8;
1101}
1102EXPORT_SYMBOL_GPL(kvm_get_cr8);
1103
1104static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
1105{
1106	int i;
1107
1108	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1109		for (i = 0; i < KVM_NR_DB_REGS; i++)
1110			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1111		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
1112	}
1113}
1114
1115void kvm_update_dr7(struct kvm_vcpu *vcpu)
1116{
1117	unsigned long dr7;
1118
1119	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1120		dr7 = vcpu->arch.guest_debug_dr7;
1121	else
1122		dr7 = vcpu->arch.dr7;
1123	kvm_x86_ops.set_dr7(vcpu, dr7);
1124	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1125	if (dr7 & DR7_BP_EN_MASK)
1126		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1127}
1128EXPORT_SYMBOL_GPL(kvm_update_dr7);
1129
1130static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
1131{
1132	u64 fixed = DR6_FIXED_1;
1133
1134	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
1135		fixed |= DR6_RTM;
1136	return fixed;
1137}
1138
1139static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1140{
1141	size_t size = ARRAY_SIZE(vcpu->arch.db);
1142
1143	switch (dr) {
1144	case 0 ... 3:
1145		vcpu->arch.db[array_index_nospec(dr, size)] = val;
1146		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1147			vcpu->arch.eff_db[dr] = val;
1148		break;
1149	case 4:
1150	case 6:
1151		if (!kvm_dr6_valid(val))
1152			return -1; /* #GP */
1153		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1154		break;
1155	case 5:
1156	default: /* 7 */
1157		if (!kvm_dr7_valid(val))
1158			return -1; /* #GP */
1159		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1160		kvm_update_dr7(vcpu);
1161		break;
1162	}
1163
1164	return 0;
1165}
1166
1167int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1168{
1169	if (__kvm_set_dr(vcpu, dr, val)) {
1170		kvm_inject_gp(vcpu, 0);
1171		return 1;
1172	}
1173	return 0;
1174}
1175EXPORT_SYMBOL_GPL(kvm_set_dr);
1176
1177int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
1178{
1179	size_t size = ARRAY_SIZE(vcpu->arch.db);
1180
1181	switch (dr) {
1182	case 0 ... 3:
1183		*val = vcpu->arch.db[array_index_nospec(dr, size)];
1184		break;
1185	case 4:
1186	case 6:
1187		*val = vcpu->arch.dr6;
1188		break;
1189	case 5:
1190	default: /* 7 */
1191		*val = vcpu->arch.dr7;
1192		break;
1193	}
1194	return 0;
1195}
1196EXPORT_SYMBOL_GPL(kvm_get_dr);
1197
1198bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1199{
1200	u32 ecx = kvm_rcx_read(vcpu);
1201	u64 data;
1202	int err;
1203
1204	err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1205	if (err)
1206		return err;
1207	kvm_rax_write(vcpu, (u32)data);
1208	kvm_rdx_write(vcpu, data >> 32);
1209	return err;
1210}
1211EXPORT_SYMBOL_GPL(kvm_rdpmc);
1212
1213/*
1214 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1215 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1216 *
1217 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
1218 * extract the supported MSRs from the related const lists.
1219 * msrs_to_save is selected from the msrs_to_save_all to reflect the
1220 * capabilities of the host cpu. This capabilities test skips MSRs that are
1221 * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
1222 * may depend on host virtualization features rather than host cpu features.
1223 */
1224
1225static const u32 msrs_to_save_all[] = {
1226	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1227	MSR_STAR,
1228#ifdef CONFIG_X86_64
1229	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1230#endif
1231	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1232	MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1233	MSR_IA32_SPEC_CTRL,
1234	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1235	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1236	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1237	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1238	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1239	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1240	MSR_IA32_UMWAIT_CONTROL,
1241
1242	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1243	MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
1244	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1245	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1246	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1247	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1248	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1249	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1250	MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1251	MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1252	MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1253	MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1254	MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1255	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1256	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1257	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1258	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1259	MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1260	MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1261	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1262	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1263	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1264
1265	MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
1266	MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
1267	MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
1268	MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
1269	MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
1270	MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
1271};
1272
1273static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
1274static unsigned num_msrs_to_save;
1275
1276static const u32 emulated_msrs_all[] = {
1277	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1278	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1279	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1280	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1281	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1282	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1283	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1284	HV_X64_MSR_RESET,
1285	HV_X64_MSR_VP_INDEX,
1286	HV_X64_MSR_VP_RUNTIME,
1287	HV_X64_MSR_SCONTROL,
1288	HV_X64_MSR_STIMER0_CONFIG,
1289	HV_X64_MSR_VP_ASSIST_PAGE,
1290	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1291	HV_X64_MSR_TSC_EMULATION_STATUS,
1292	HV_X64_MSR_SYNDBG_OPTIONS,
1293	HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1294	HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1295	HV_X64_MSR_SYNDBG_PENDING_BUFFER,
1296
1297	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1298	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
1299
1300	MSR_IA32_TSC_ADJUST,
1301	MSR_IA32_TSCDEADLINE,
1302	MSR_IA32_ARCH_CAPABILITIES,
1303	MSR_IA32_PERF_CAPABILITIES,
1304	MSR_IA32_MISC_ENABLE,
1305	MSR_IA32_MCG_STATUS,
1306	MSR_IA32_MCG_CTL,
1307	MSR_IA32_MCG_EXT_CTL,
1308	MSR_IA32_SMBASE,
1309	MSR_SMI_COUNT,
1310	MSR_PLATFORM_INFO,
1311	MSR_MISC_FEATURES_ENABLES,
1312	MSR_AMD64_VIRT_SPEC_CTRL,
1313	MSR_IA32_POWER_CTL,
1314	MSR_IA32_UCODE_REV,
1315
1316	/*
1317	 * The following list leaves out MSRs whose values are determined
1318	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1319	 * We always support the "true" VMX control MSRs, even if the host
1320	 * processor does not, so I am putting these registers here rather
1321	 * than in msrs_to_save_all.
1322	 */
1323	MSR_IA32_VMX_BASIC,
1324	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1325	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1326	MSR_IA32_VMX_TRUE_EXIT_CTLS,
1327	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1328	MSR_IA32_VMX_MISC,
1329	MSR_IA32_VMX_CR0_FIXED0,
1330	MSR_IA32_VMX_CR4_FIXED0,
1331	MSR_IA32_VMX_VMCS_ENUM,
1332	MSR_IA32_VMX_PROCBASED_CTLS2,
1333	MSR_IA32_VMX_EPT_VPID_CAP,
1334	MSR_IA32_VMX_VMFUNC,
1335
1336	MSR_K7_HWCR,
1337	MSR_KVM_POLL_CONTROL,
1338};
1339
1340static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
1341static unsigned num_emulated_msrs;
1342
1343/*
1344 * List of msr numbers which are used to expose MSR-based features that
1345 * can be used by a hypervisor to validate requested CPU features.
1346 */
1347static const u32 msr_based_features_all[] = {
1348	MSR_IA32_VMX_BASIC,
1349	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1350	MSR_IA32_VMX_PINBASED_CTLS,
1351	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1352	MSR_IA32_VMX_PROCBASED_CTLS,
1353	MSR_IA32_VMX_TRUE_EXIT_CTLS,
1354	MSR_IA32_VMX_EXIT_CTLS,
1355	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1356	MSR_IA32_VMX_ENTRY_CTLS,
1357	MSR_IA32_VMX_MISC,
1358	MSR_IA32_VMX_CR0_FIXED0,
1359	MSR_IA32_VMX_CR0_FIXED1,
1360	MSR_IA32_VMX_CR4_FIXED0,
1361	MSR_IA32_VMX_CR4_FIXED1,
1362	MSR_IA32_VMX_VMCS_ENUM,
1363	MSR_IA32_VMX_PROCBASED_CTLS2,
1364	MSR_IA32_VMX_EPT_VPID_CAP,
1365	MSR_IA32_VMX_VMFUNC,
1366
1367	MSR_AMD64_DE_CFG,
1368	MSR_IA32_UCODE_REV,
1369	MSR_IA32_ARCH_CAPABILITIES,
1370	MSR_IA32_PERF_CAPABILITIES,
1371};
1372
1373static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
1374static unsigned int num_msr_based_features;
1375
1376/*
1377 * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
1378 * does not yet virtualize. These include:
1379 *   10 - MISC_PACKAGE_CTRLS
1380 *   11 - ENERGY_FILTERING_CTL
1381 *   12 - DOITM
1382 *   18 - FB_CLEAR_CTRL
1383 *   21 - XAPIC_DISABLE_STATUS
1384 *   23 - OVERCLOCKING_STATUS
1385 */
1386
1387#define KVM_SUPPORTED_ARCH_CAP \
1388	(ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
1389	 ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
1390	 ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
1391	 ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
1392	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
1393	 ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR)
1394
1395static u64 kvm_get_arch_capabilities(void)
1396{
1397	u64 data = 0;
1398
1399	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1400		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1401		data &= KVM_SUPPORTED_ARCH_CAP;
1402	}
1403
1404	/*
1405	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
1406	 * the nested hypervisor runs with NX huge pages.  If it is not,
1407	 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
1408	 * L1 guests, so it need not worry about its own (L2) guests.
1409	 */
1410	data |= ARCH_CAP_PSCHANGE_MC_NO;
1411
1412	/*
1413	 * If we're doing cache flushes (either "always" or "cond")
1414	 * we will do one whenever the guest does a vmlaunch/vmresume.
1415	 * If an outer hypervisor is doing the cache flush for us
1416	 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1417	 * capability to the guest too, and if EPT is disabled we're not
1418	 * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1419	 * require a nested hypervisor to do a flush of its own.
1420	 */
1421	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1422		data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1423
1424	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
1425		data |= ARCH_CAP_RDCL_NO;
1426	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
1427		data |= ARCH_CAP_SSB_NO;
1428	if (!boot_cpu_has_bug(X86_BUG_MDS))
1429		data |= ARCH_CAP_MDS_NO;
1430	if (!boot_cpu_has_bug(X86_BUG_RFDS))
1431		data |= ARCH_CAP_RFDS_NO;
1432
1433	if (!boot_cpu_has(X86_FEATURE_RTM)) {
1434		/*
1435		 * If RTM=0 because the kernel has disabled TSX, the host might
1436		 * have TAA_NO or TSX_CTRL.  Clear TAA_NO (the guest sees RTM=0
1437		 * and therefore knows that there cannot be TAA) but keep
1438		 * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1439		 * and we want to allow migrating those guests to tsx=off hosts.
1440		 */
1441		data &= ~ARCH_CAP_TAA_NO;
1442	} else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
1443		data |= ARCH_CAP_TAA_NO;
1444	} else {
1445		/*
1446		 * Nothing to do here; we emulate TSX_CTRL if present on the
1447		 * host so the guest can choose between disabling TSX or
1448		 * using VERW to clear CPU buffers.
1449		 */
1450	}
1451
1452	if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
1453		data |= ARCH_CAP_GDS_NO;
1454
1455	return data;
1456}
1457
1458static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1459{
1460	switch (msr->index) {
1461	case MSR_IA32_ARCH_CAPABILITIES:
1462		msr->data = kvm_get_arch_capabilities();
1463		break;
1464	case MSR_IA32_UCODE_REV:
1465		rdmsrl_safe(msr->index, &msr->data);
1466		break;
1467	default:
1468		return kvm_x86_ops.get_msr_feature(msr);
1469	}
1470	return 0;
1471}
1472
1473static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1474{
1475	struct kvm_msr_entry msr;
1476	int r;
1477
1478	msr.index = index;
1479	r = kvm_get_msr_feature(&msr);
1480
1481	if (r == KVM_MSR_RET_INVALID) {
1482		/* Unconditionally clear the output for simplicity */
1483		*data = 0;
1484		if (kvm_msr_ignored_check(vcpu, index, 0, false))
1485			r = 0;
1486	}
1487
1488	if (r)
1489		return r;
1490
1491	*data = msr.data;
1492
1493	return 0;
1494}
1495
1496static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1497{
1498	if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1499		return false;
1500
1501	if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1502		return false;
1503
1504	if (efer & (EFER_LME | EFER_LMA) &&
1505	    !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1506		return false;
1507
1508	if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1509		return false;
1510
1511	return true;
1512
1513}
1514bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1515{
1516	if (efer & efer_reserved_bits)
1517		return false;
1518
1519	return __kvm_valid_efer(vcpu, efer);
1520}
1521EXPORT_SYMBOL_GPL(kvm_valid_efer);
1522
1523static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1524{
1525	u64 old_efer = vcpu->arch.efer;
1526	u64 efer = msr_info->data;
1527	int r;
1528
1529	if (efer & efer_reserved_bits)
1530		return 1;
1531
1532	if (!msr_info->host_initiated) {
1533		if (!__kvm_valid_efer(vcpu, efer))
1534			return 1;
1535
1536		if (is_paging(vcpu) &&
1537		    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1538			return 1;
1539	}
1540
1541	efer &= ~EFER_LMA;
1542	efer |= vcpu->arch.efer & EFER_LMA;
1543
1544	r = kvm_x86_ops.set_efer(vcpu, efer);
1545	if (r) {
1546		WARN_ON(r > 0);
1547		return r;
1548	}
1549
1550	/* Update reserved bits */
1551	if ((efer ^ old_efer) & EFER_NX)
1552		kvm_mmu_reset_context(vcpu);
1553
1554	return 0;
1555}
1556
1557void kvm_enable_efer_bits(u64 mask)
1558{
1559       efer_reserved_bits &= ~mask;
1560}
1561EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1562
1563bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1564{
1565	struct kvm_x86_msr_filter *msr_filter;
1566	struct msr_bitmap_range *ranges;
1567	struct kvm *kvm = vcpu->kvm;
1568	bool allowed;
1569	int idx;
1570	u32 i;
1571
1572	/* x2APIC MSRs do not support filtering. */
1573	if (index >= 0x800 && index <= 0x8ff)
1574		return true;
1575
1576	idx = srcu_read_lock(&kvm->srcu);
1577
1578	msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1579	if (!msr_filter) {
1580		allowed = true;
1581		goto out;
1582	}
1583
1584	allowed = msr_filter->default_allow;
1585	ranges = msr_filter->ranges;
1586
1587	for (i = 0; i < msr_filter->count; i++) {
1588		u32 start = ranges[i].base;
1589		u32 end = start + ranges[i].nmsrs;
1590		u32 flags = ranges[i].flags;
1591		unsigned long *bitmap = ranges[i].bitmap;
1592
1593		if ((index >= start) && (index < end) && (flags & type)) {
1594			allowed = !!test_bit(index - start, bitmap);
1595			break;
1596		}
1597
1598		/* Note, VM-Exits that go down the "slow" path are accounted below. */
1599		++vcpu->stat.exits;
1600	}
1601
1602out:
1603	srcu_read_unlock(&kvm->srcu, idx);
1604
1605	return allowed;
1606}
1607EXPORT_SYMBOL_GPL(kvm_msr_allowed);
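
/*
 * Illustrative example (hypothetical values): a filter range with
 * .base = 0xc0000080, .nmsrs = 1, .flags = KVM_MSR_FILTER_WRITE and bit 0 set
 * in .bitmap makes kvm_msr_allowed() permit guest writes to MSR 0xc0000080
 * (EFER), while reads fall back to the filter's default_allow policy.
 */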
1608
1609/*
1610 * Write @data into the MSR specified by @index.  Select MSR specific fault
1611 * checks are bypassed if @host_initiated is %true.
1612 * Returns 0 on success, non-0 otherwise.
1613 * Assumes vcpu_load() was already called.
1614 */
1615static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1616			 bool host_initiated)
1617{
1618	struct msr_data msr;
1619
1620	if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1621		return KVM_MSR_RET_FILTERED;
1622
1623	switch (index) {
1624	case MSR_FS_BASE:
1625	case MSR_GS_BASE:
1626	case MSR_KERNEL_GS_BASE:
1627	case MSR_CSTAR:
1628	case MSR_LSTAR:
1629		if (is_noncanonical_address(data, vcpu))
1630			return 1;
1631		break;
1632	case MSR_IA32_SYSENTER_EIP:
1633	case MSR_IA32_SYSENTER_ESP:
1634		/*
1635		 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if a
1636		 * non-canonical address is written on Intel but not on
1637		 * AMD (which ignores the top 32-bits, because it does
1638		 * not implement 64-bit SYSENTER).
1639		 *
1640		 * 64-bit code should hence be able to write a non-canonical
1641		 * value on AMD.  Making the address canonical ensures that
1642		 * vmentry does not fail on Intel after writing a non-canonical
1643		 * value, and that something deterministic happens if the guest
1644		 * invokes 64-bit SYSENTER.
1645		 */
1646		data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
1647	}
1648
1649	msr.data = data;
1650	msr.index = index;
1651	msr.host_initiated = host_initiated;
1652
1653	return kvm_x86_ops.set_msr(vcpu, &msr);
1654}
1655
1656static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1657				     u32 index, u64 data, bool host_initiated)
1658{
1659	int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1660
1661	if (ret == KVM_MSR_RET_INVALID)
1662		if (kvm_msr_ignored_check(vcpu, index, data, true))
1663			ret = 0;
1664
1665	return ret;
1666}
1667
1668/*
1669 * Read the MSR specified by @index into @data.  Select MSR specific fault
1670 * checks are bypassed if @host_initiated is %true.
1671 * Returns 0 on success, non-0 otherwise.
1672 * Assumes vcpu_load() was already called.
1673 */
1674int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1675		  bool host_initiated)
1676{
1677	struct msr_data msr;
1678	int ret;
1679
1680	if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1681		return KVM_MSR_RET_FILTERED;
1682
1683	msr.index = index;
1684	msr.host_initiated = host_initiated;
1685
1686	ret = kvm_x86_ops.get_msr(vcpu, &msr);
1687	if (!ret)
1688		*data = msr.data;
1689	return ret;
1690}
1691
1692static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1693				     u32 index, u64 *data, bool host_initiated)
1694{
1695	int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1696
1697	if (ret == KVM_MSR_RET_INVALID) {
1698		/* Unconditionally clear *data for simplicity */
1699		*data = 0;
1700		if (kvm_msr_ignored_check(vcpu, index, 0, false))
1701			ret = 0;
1702	}
1703
1704	return ret;
1705}
1706
1707int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1708{
1709	return kvm_get_msr_ignored_check(vcpu, index, data, false);
1710}
1711EXPORT_SYMBOL_GPL(kvm_get_msr);
1712
1713int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1714{
1715	return kvm_set_msr_ignored_check(vcpu, index, data, false);
1716}
1717EXPORT_SYMBOL_GPL(kvm_set_msr);
1718
1719static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
1720{
1721	if (vcpu->run->msr.error) {
1722		kvm_inject_gp(vcpu, 0);
1723		return 1;
1724	} else if (is_read) {
1725		kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1726		kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1727	}
1728
1729	return kvm_skip_emulated_instruction(vcpu);
1730}
1731
1732static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1733{
1734	return complete_emulated_msr(vcpu, true);
1735}
1736
1737static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1738{
1739	return complete_emulated_msr(vcpu, false);
1740}
1741
1742static u64 kvm_msr_reason(int r)
1743{
1744	switch (r) {
1745	case KVM_MSR_RET_INVALID:
1746		return KVM_MSR_EXIT_REASON_UNKNOWN;
1747	case KVM_MSR_RET_FILTERED:
1748		return KVM_MSR_EXIT_REASON_FILTER;
1749	default:
1750		return KVM_MSR_EXIT_REASON_INVAL;
1751	}
1752}
1753
1754static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1755			      u32 exit_reason, u64 data,
1756			      int (*completion)(struct kvm_vcpu *vcpu),
1757			      int r)
1758{
1759	u64 msr_reason = kvm_msr_reason(r);
1760
1761	/* Check if the user wanted to know about this MSR fault */
1762	if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1763		return 0;
1764
1765	vcpu->run->exit_reason = exit_reason;
1766	vcpu->run->msr.error = 0;
1767	memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1768	vcpu->run->msr.reason = msr_reason;
1769	vcpu->run->msr.index = index;
1770	vcpu->run->msr.data = data;
1771	vcpu->arch.complete_userspace_io = completion;
1772
1773	return 1;
1774}
1775
1776static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1777{
1778	return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1779				   complete_emulated_rdmsr, r);
1780}
1781
1782static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1783{
1784	return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1785				   complete_emulated_wrmsr, r);
1786}
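
/*
 * Illustrative userspace-side sketch (hypothetical VMM code, not part of this
 * file): when kvm_msr_user_space() above returns 1, KVM_RUN exits with
 * KVM_EXIT_X86_RDMSR/WRMSR and userspace is expected to fill in the result:
 *
 *	struct kvm_run *run = ...;           // mmap'ed vcpu run structure
 *	if (run->exit_reason == KVM_EXIT_X86_RDMSR) {
 *		run->msr.data = vmm_rdmsr(run->msr.index);  // VMM helper
 *		run->msr.error = 0;          // or 1 to make KVM inject #GP
 *	}
 *	ioctl(vcpu_fd, KVM_RUN, 0);          // resume; the completion callback
 *					     // (complete_emulated_rdmsr) runs
 */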
1787
1788int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1789{
1790	u32 ecx = kvm_rcx_read(vcpu);
1791	u64 data;
1792	int r;
1793
1794	r = kvm_get_msr(vcpu, ecx, &data);
1795
1796	/* MSR read failed? See if we should ask user space */
1797	if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1798		/* Bounce to user space */
1799		return 0;
1800	}
1801
1802	/* MSR read failed? Inject a #GP */
1803	if (r) {
1804		trace_kvm_msr_read_ex(ecx);
1805		kvm_inject_gp(vcpu, 0);
1806		return 1;
1807	}
1808
1809	trace_kvm_msr_read(ecx, data);
1810
1811	kvm_rax_write(vcpu, data & -1u);
1812	kvm_rdx_write(vcpu, (data >> 32) & -1u);
1813	return kvm_skip_emulated_instruction(vcpu);
1814}
1815EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1816
1817int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1818{
1819	u32 ecx = kvm_rcx_read(vcpu);
1820	u64 data = kvm_read_edx_eax(vcpu);
1821	int r;
1822
1823	r = kvm_set_msr(vcpu, ecx, data);
1824
1825	/* MSR write failed? See if we should ask user space */
1826	if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1827		/* Bounce to user space */
1828		return 0;
1829
1830	/* Signal all other negative errors to userspace */
1831	if (r < 0)
1832		return r;
1833
1834	/* MSR write failed? Inject a #GP */
1835	if (r > 0) {
1836		trace_kvm_msr_write_ex(ecx, data);
1837		kvm_inject_gp(vcpu, 0);
1838		return 1;
1839	}
1840
1841	trace_kvm_msr_write(ecx, data);
1842	return kvm_skip_emulated_instruction(vcpu);
1843}
1844EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
1845
1846bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1847{
1848	return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1849		xfer_to_guest_mode_work_pending();
1850}
1851EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
1852
1853/*
1854 * The fast path for frequent and performance-sensitive wrmsr emulation,
1855 * i.e. the sending of an IPI.  Sending the IPI early in the VM-Exit flow
1856 * reduces the latency of virtual IPIs by avoiding the expensive bits of
1857 * transitioning from guest to host, e.g. reacquiring KVM's SRCU lock, in
1858 * contrast to the other cases, which must run after host interrupts are enabled.
1859 */
1860static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1861{
1862	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1863		return 1;
1864
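	/*
	 * Only the common case takes the fast path: a fixed-delivery,
	 * physical-destination IPI with no shorthand and a non-broadcast
	 * destination.  Bit 12 (the busy/delivery-status flag) is cleared
	 * before the value is mirrored into APIC_ICR/APIC_ICR2.
	 */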
1865	if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1866		((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1867		((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1868		((u32)(data >> 32) != X2APIC_BROADCAST)) {
1869
1870		data &= ~(1 << 12);
1871		kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1872		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1873		kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1874		trace_kvm_apic_write(APIC_ICR, (u32)data);
1875		return 0;
1876	}
1877
1878	return 1;
1879}
1880
1881static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1882{
1883	if (!kvm_can_use_hv_timer(vcpu))
1884		return 1;
1885
1886	kvm_set_lapic_tscdeadline_msr(vcpu, data);
1887	return 0;
1888}
1889
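/*
 * Note on the MSR index below: in x2APIC mode the APIC registers are mapped
 * at MSR 0x800 + (mmio_offset >> 4), so APIC_BASE_MSR + (APIC_ICR >> 4) is
 * the x2APIC ICR (MSR 0x830), written as a single 64-bit EDX:EAX value.
 */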
1890fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1891{
1892	u32 msr = kvm_rcx_read(vcpu);
1893	u64 data;
1894	fastpath_t ret = EXIT_FASTPATH_NONE;
1895
1896	switch (msr) {
1897	case APIC_BASE_MSR + (APIC_ICR >> 4):
1898		data = kvm_read_edx_eax(vcpu);
1899		if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1900			kvm_skip_emulated_instruction(vcpu);
1901			ret = EXIT_FASTPATH_EXIT_HANDLED;
1902		}
1903		break;
1904	case MSR_IA32_TSCDEADLINE:
1905		data = kvm_read_edx_eax(vcpu);
1906		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1907			kvm_skip_emulated_instruction(vcpu);
1908			ret = EXIT_FASTPATH_REENTER_GUEST;
1909		}
1910		break;
1911	default:
1912		break;
1913	}
1914
1915	if (ret != EXIT_FASTPATH_NONE)
1916		trace_kvm_msr_write(msr, data);
1917
1918	return ret;
1919}
1920EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
1921
1922/*
1923 * Adapt get_msr()/set_msr() to msr_io()'s calling convention
1924 */
1925static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1926{
1927	return kvm_get_msr_ignored_check(vcpu, index, data, true);
1928}
1929
1930static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1931{
1932	return kvm_set_msr_ignored_check(vcpu, index, *data, true);
1933}
1934
1935#ifdef CONFIG_X86_64
1936struct pvclock_clock {
1937	int vclock_mode;
1938	u64 cycle_last;
1939	u64 mask;
1940	u32 mult;
1941	u32 shift;
1942	u64 base_cycles;
1943	u64 offset;
1944};
1945
1946struct pvclock_gtod_data {
1947	seqcount_t	seq;
1948
1949	struct pvclock_clock clock; /* extract of a clocksource struct */
1950	struct pvclock_clock raw_clock; /* extract of a clocksource struct */
1951
1952	ktime_t		offs_boot;
1953	u64		wall_time_sec;
1954};
1955
1956static struct pvclock_gtod_data pvclock_gtod_data;
1957
1958static void update_pvclock_gtod(struct timekeeper *tk)
1959{
1960	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1961
1962	write_seqcount_begin(&vdata->seq);
1963
1964	/* copy pvclock gtod data */
1965	vdata->clock.vclock_mode	= tk->tkr_mono.clock->vdso_clock_mode;
1966	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
1967	vdata->clock.mask		= tk->tkr_mono.mask;
1968	vdata->clock.mult		= tk->tkr_mono.mult;
1969	vdata->clock.shift		= tk->tkr_mono.shift;
1970	vdata->clock.base_cycles	= tk->tkr_mono.xtime_nsec;
1971	vdata->clock.offset		= tk->tkr_mono.base;
1972
1973	vdata->raw_clock.vclock_mode	= tk->tkr_raw.clock->vdso_clock_mode;
1974	vdata->raw_clock.cycle_last	= tk->tkr_raw.cycle_last;
1975	vdata->raw_clock.mask		= tk->tkr_raw.mask;
1976	vdata->raw_clock.mult		= tk->tkr_raw.mult;
1977	vdata->raw_clock.shift		= tk->tkr_raw.shift;
1978	vdata->raw_clock.base_cycles	= tk->tkr_raw.xtime_nsec;
1979	vdata->raw_clock.offset		= tk->tkr_raw.base;
1980
1981	vdata->wall_time_sec            = tk->xtime_sec;
1982
1983	vdata->offs_boot		= tk->offs_boot;
1984
1985	write_seqcount_end(&vdata->seq);
1986}
1987
1988static s64 get_kvmclock_base_ns(void)
1989{
1990	/* Count up from boot time, but with the frequency of the raw clock.  */
1991	return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
1992}
1993#else
1994static s64 get_kvmclock_base_ns(void)
1995{
1996	/* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
1997	return ktime_get_boottime_ns();
1998}
1999#endif
2000
2001static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
2002{
2003	int version;
2004	int r;
2005	struct pvclock_wall_clock wc;
2006	u64 wall_nsec;
2007
2008	kvm->arch.wall_clock = wall_clock;
2009
2010	if (!wall_clock)
2011		return;
2012
2013	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
2014	if (r)
2015		return;
2016
2017	if (version & 1)
2018		++version;  /* first time write, random junk */
2019
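	/*
	 * Keep the version odd while the wall clock data is being updated and
	 * make it even again afterwards, so the guest can detect and retry a
	 * torn read (the same protocol pvclock uses for the system time area).
	 */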
2020	++version;
2021
2022	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
2023		return;
2024
2025	/*
2026	 * The guest calculates current wall clock time by adding
2027	 * system time (updated by kvm_guest_time_update below) to the
2028	 * wall clock specified here.  We do the reverse here.
2029	 */
2030	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
2031
2032	wc.nsec = do_div(wall_nsec, 1000000000);
2033	wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
2034	wc.version = version;
2035
2036	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
2037
2038	version++;
2039	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2040}
2041
2042static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2043				  bool old_msr, bool host_initiated)
2044{
2045	struct kvm_arch *ka = &vcpu->kvm->arch;
2046
2047	if (vcpu->vcpu_id == 0 && !host_initiated) {
2048		if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2049			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2050
2051		ka->boot_vcpu_runs_old_kvmclock = old_msr;
2052	}
2053
2054	vcpu->arch.time = system_time;
2055	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2056
2057	/* Only treat kvmclock as enabled if the enable bit (bit 0) is set. */
2058	vcpu->arch.pv_time_enabled = false;
2059	if (!(system_time & 1))
2060		return;
2061
2062	if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
2063				       &vcpu->arch.pv_time, system_time & ~1ULL,
2064				       sizeof(struct pvclock_vcpu_time_info)))
2065		vcpu->arch.pv_time_enabled = true;
2066
2067	return;
2068}
2069
2070static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
2071{
2072	do_shl32_div32(dividend, divisor);
2073	return dividend;
2074}
2075
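/*
 * kvm_get_time_scale() finds a shift and a 32-bit multiplier such that
 * scaled_hz ~= base_hz * 2^shift * (mult / 2^32), the form consumed by
 * pvclock_scale_delta().  Illustrative values (not taken from the source):
 * for base_hz = 1 GHz and scaled_hz = NSEC_PER_SEC the loop settles on
 * shift = 1 and mult = 2^31, i.e. an overall scale factor of exactly 1.
 */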
2076static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
2077			       s8 *pshift, u32 *pmultiplier)
2078{
2079	uint64_t scaled64;
2080	int32_t  shift = 0;
2081	uint64_t tps64;
2082	uint32_t tps32;
2083
2084	tps64 = base_hz;
2085	scaled64 = scaled_hz;
2086	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
2087		tps64 >>= 1;
2088		shift--;
2089	}
2090
2091	tps32 = (uint32_t)tps64;
2092	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
2093		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
2094			scaled64 >>= 1;
2095		else
2096			tps32 <<= 1;
2097		shift++;
2098	}
2099
2100	*pshift = shift;
2101	*pmultiplier = div_frac(scaled64, tps32);
2102}
2103
2104#ifdef CONFIG_X86_64
2105static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
2106#endif
2107
2108static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
2109static unsigned long max_tsc_khz;
2110
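/*
 * adjust_tsc_khz() applies a parts-per-million adjustment to a kHz value.
 * Illustrative numbers: a 2,600,000 kHz host with a 250 ppm tolerance gives
 * kvm_set_tsc_khz() a window of 2,599,350 to 2,600,650 kHz.
 */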
2111static u32 adjust_tsc_khz(u32 khz, s32 ppm)
2112{
2113	u64 v = (u64)khz * (1000000 + ppm);
2114	do_div(v, 1000000);
2115	return v;
2116}
2117
2118static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2119{
2120	u64 ratio;
2121
2122	/* Guest TSC same frequency as host TSC? */
2123	if (!scale) {
2124		vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
2125		return 0;
2126	}
2127
2128	/* TSC scaling supported? */
2129	if (!kvm_has_tsc_control) {
2130		if (user_tsc_khz > tsc_khz) {
2131			vcpu->arch.tsc_catchup = 1;
2132			vcpu->arch.tsc_always_catchup = 1;
2133			return 0;
2134		} else {
2135			pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
2136			return -1;
2137		}
2138	}
2139
2140	/* TSC scaling required - calculate ratio */
2141	ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
2142				user_tsc_khz, tsc_khz);
2143
2144	if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
2145		pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2146			            user_tsc_khz);
2147		return -1;
2148	}
2149
2150	vcpu->arch.tsc_scaling_ratio = ratio;
2151	return 0;
2152}
2153
2154static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
2155{
2156	u32 thresh_lo, thresh_hi;
2157	int use_scaling = 0;
2158
2159	/* tsc_khz can be zero if TSC calibration fails */
2160	if (user_tsc_khz == 0) {
2161		/* set tsc_scaling_ratio to a safe value */
2162		vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
2163		return -1;
2164	}
2165
2166	/* Compute a scale to convert nanoseconds to TSC cycles */
2167	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
2168			   &vcpu->arch.virtual_tsc_shift,
2169			   &vcpu->arch.virtual_tsc_mult);
2170	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2171
2172	/*
2173	 * Compute the variation in TSC rate which is acceptable
2174	 * within the range of tolerance and decide if the
2175	 * rate being applied is within those bounds of the hardware
2176	 * rate.  If so, no scaling or compensation need be done.
2177	 */
2178	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2179	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
2180	if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
2181		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
2182		use_scaling = 1;
2183	}
2184	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
2185}
2186
2187static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
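/*
 * Reconstruct what the guest TSC should read at host time kernel_ns: scale
 * the nanoseconds elapsed since the last sync point (this_tsc_nsec) by the
 * guest's virtual TSC frequency and add the TSC value written at that point.
 * Used by the tsc_catchup path in kvm_guest_time_update().
 */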
2188{
2189	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2190				      vcpu->arch.virtual_tsc_mult,
2191				      vcpu->arch.virtual_tsc_shift);
2192	tsc += vcpu->arch.this_tsc_write;
2193	return tsc;
2194}
2195
2196static inline int gtod_is_based_on_tsc(int mode)
2197{
2198	return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2199}
2200
2201static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
2202{
2203#ifdef CONFIG_X86_64
2204	bool vcpus_matched;
2205	struct kvm_arch *ka = &vcpu->kvm->arch;
2206	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2207
2208	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2209			 atomic_read(&vcpu->kvm->online_vcpus));
2210
2211	/*
2212	 * Once the masterclock is enabled, always perform the request in
2213	 * order to update it.
2214	 *
2215	 * In order to enable masterclock, the host clocksource must be TSC
2216	 * and the vcpus need to have matched TSCs.  When that happens,
2217	 * perform request to enable masterclock.
2218	 */
2219	if (ka->use_master_clock ||
2220	    (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
2221		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2222
2223	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2224			    atomic_read(&vcpu->kvm->online_vcpus),
2225		            ka->use_master_clock, gtod->clock.vclock_mode);
2226#endif
2227}
2228
2229/*
2230 * Multiply tsc by a fixed point number represented by ratio.
2231 *
2232 * The most significant 64-N bits (mult) of ratio represent the
2233 * integral part of the fixed point number; the remaining N bits
2234 * (frac) represent the fractional part, i.e. ratio represents a fixed
2235 * point number (mult + frac * 2^(-N)).
2236 *
2237 * N equals kvm_tsc_scaling_ratio_frac_bits.
2238 */
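/*
 * Illustrative example (values not taken from the source): with 48
 * fractional bits, a guest configured for 1,500,000 kHz on a 3,000,000 kHz
 * host gets ratio = (1500000 << 48) / 3000000 = 2^47, i.e. 0.5 in fixed
 * point, so every host TSC delta is halved before the guest sees it.
 */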
2239static inline u64 __scale_tsc(u64 ratio, u64 tsc)
2240{
2241	return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
2242}
2243
2244u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
2245{
2246	u64 _tsc = tsc;
2247	u64 ratio = vcpu->arch.tsc_scaling_ratio;
2248
2249	if (ratio != kvm_default_tsc_scaling_ratio)
2250		_tsc = __scale_tsc(ratio, tsc);
2251
2252	return _tsc;
2253}
2254EXPORT_SYMBOL_GPL(kvm_scale_tsc);
2255
2256static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2257{
2258	u64 tsc;
2259
2260	tsc = kvm_scale_tsc(vcpu, rdtsc());
2261
2262	return target_tsc - tsc;
2263}
2264
2265u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2266{
2267	return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
2268}
2269EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
2270
2271static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2272{
2273	vcpu->arch.l1_tsc_offset = offset;
2274	vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
2275}
2276
2277static inline bool kvm_check_tsc_unstable(void)
2278{
2279#ifdef CONFIG_X86_64
2280	/*
2281	 * TSC is marked unstable when we're running on Hyper-V, but the
2282	 * Hyper-V 'TSC page' clocksource is still good, so treat it as stable.
2283	 */
2284	if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
2285		return false;
2286#endif
2287	return check_tsc_unstable();
2288}
2289
2290static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
2291{
2292	struct kvm *kvm = vcpu->kvm;
2293	u64 offset, ns, elapsed;
2294	unsigned long flags;
2295	bool matched;
2296	bool already_matched;
2297	bool synchronizing = false;
2298
2299	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2300	offset = kvm_compute_tsc_offset(vcpu, data);
2301	ns = get_kvmclock_base_ns();
2302	elapsed = ns - kvm->arch.last_tsc_nsec;
2303
2304	if (vcpu->arch.virtual_tsc_khz) {
2305		if (data == 0) {
2306			/*
2307			 * detection of vcpu initialization -- need to sync
2308			 * with other vCPUs. This particularly helps to keep
2309			 * kvm_clock stable after CPU hotplug
2310			 */
2311			synchronizing = true;
2312		} else {
2313			u64 tsc_exp = kvm->arch.last_tsc_write +
2314						nsec_to_cycles(vcpu, elapsed);
2315			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2316			/*
2317			 * Special case: TSC write with a small delta (1 second)
2318			 * of virtual cycle time against real time is
2319			 * interpreted as an attempt to synchronize the CPU.
2320			 */
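			/*
			 * The condition below checks that the written value is
			 * within one second's worth of guest TSC cycles of the
			 * expected value, i.e. |data - tsc_exp| < tsc_hz
			 * (ignoring u64 wraparound at the extremes).
			 */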
2321			synchronizing = data < tsc_exp + tsc_hz &&
2322					data + tsc_hz > tsc_exp;
2323		}
2324	}
2325
2326	/*
2327	 * For a reliable TSC, we can match TSC offsets, and for an unstable
2328	 * TSC, we add elapsed time in this computation.  We could let the
2329	 * compensation code attempt to catch up if we fall behind, but
2330	 * it's better to try to match offsets from the beginning.
2331	 */
2332	if (synchronizing &&
2333	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2334		if (!kvm_check_tsc_unstable()) {
2335			offset = kvm->arch.cur_tsc_offset;
2336		} else {
2337			u64 delta = nsec_to_cycles(vcpu, elapsed);
2338			data += delta;
2339			offset = kvm_compute_tsc_offset(vcpu, data);
2340		}
2341		matched = true;
2342		already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
2343	} else {
2344		/*
2345		 * We split periods of matched TSC writes into generations.
2346		 * For each generation, we track the original measured
2347		 * nanosecond time, offset, and write, so if TSCs are in
2348		 * sync, we can match exact offset, and if not, we can match
2349		 * exact software computation in compute_guest_tsc()
2350		 *
2351		 * These values are tracked in kvm->arch.cur_xxx variables.
2352		 */
2353		kvm->arch.cur_tsc_generation++;
2354		kvm->arch.cur_tsc_nsec = ns;
2355		kvm->arch.cur_tsc_write = data;
2356		kvm->arch.cur_tsc_offset = offset;
2357		matched = false;
2358	}
2359
2360	/*
2361	 * We also track the most recent recorded KHZ, write and time to
2362	 * allow the matching interval to be extended at each write.
2363	 */
2364	kvm->arch.last_tsc_nsec = ns;
2365	kvm->arch.last_tsc_write = data;
2366	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2367
2368	vcpu->arch.last_guest_tsc = data;
2369
2370	/* Keep track of which generation this VCPU has synchronized to */
2371	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2372	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2373	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2374
2375	kvm_vcpu_write_tsc_offset(vcpu, offset);
2376	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2377
2378	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
2379	if (!matched) {
2380		kvm->arch.nr_vcpus_matched_tsc = 0;
2381	} else if (!already_matched) {
2382		kvm->arch.nr_vcpus_matched_tsc++;
2383	}
2384
2385	kvm_track_tsc_matching(vcpu);
2386	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
2387}
2388
2389static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2390					   s64 adjustment)
2391{
2392	u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2393	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
2394}
2395
2396static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2397{
2398	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
2399		WARN_ON(adjustment < 0);
2400	adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
2401	adjust_tsc_offset_guest(vcpu, adjustment);
2402}
2403
2404#ifdef CONFIG_X86_64
2405
2406static u64 read_tsc(void)
2407{
2408	u64 ret = (u64)rdtsc_ordered();
2409	u64 last = pvclock_gtod_data.clock.cycle_last;
2410
2411	if (likely(ret >= last))
2412		return ret;
2413
2414	/*
2415	 * GCC likes to generate cmov here, but this branch is extremely
2416	 * predictable (it's just a function of time and the likely is
2417	 * very likely) and there's a data dependence, so force GCC
2418	 * to generate a branch instead.  I don't barrier() because
2419	 * we don't actually need a barrier, and if this function
2420	 * ever gets inlined it will generate worse code.
2421	 */
2422	asm volatile ("");
2423	return last;
2424}
2425
2426static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2427			  int *mode)
2428{
2429	long v;
2430	u64 tsc_pg_val;
2431
2432	switch (clock->vclock_mode) {
2433	case VDSO_CLOCKMODE_HVCLOCK:
2434		tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
2435						  tsc_timestamp);
2436		if (tsc_pg_val != U64_MAX) {
2437			/* TSC page valid */
2438			*mode = VDSO_CLOCKMODE_HVCLOCK;
2439			v = (tsc_pg_val - clock->cycle_last) &
2440				clock->mask;
2441		} else {
2442			/* TSC page invalid */
2443			*mode = VDSO_CLOCKMODE_NONE;
2444		}
2445		break;
2446	case VDSO_CLOCKMODE_TSC:
2447		*mode = VDSO_CLOCKMODE_TSC;
2448		*tsc_timestamp = read_tsc();
2449		v = (*tsc_timestamp - clock->cycle_last) &
2450			clock->mask;
2451		break;
2452	default:
2453		*mode = VDSO_CLOCKMODE_NONE;
2454	}
2455
2456	if (*mode == VDSO_CLOCKMODE_NONE)
2457		*tsc_timestamp = v = 0;
2458
2459	return v * clock->mult;
2460}
2461
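/*
 * Mirrors the vDSO clock_gettime() fast path: sample the pvclock_gtod copy
 * under its seqcount, convert the TSC delta to nanoseconds with the cached
 * mult/shift, and retry if the timekeeper updated the copy concurrently.
 * The clocksource mode is returned so callers can check it is TSC-based.
 */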
2462static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
2463{
2464	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2465	unsigned long seq;
2466	int mode;
2467	u64 ns;
2468
2469	do {
2470		seq = read_seqcount_begin(&gtod->seq);
2471		ns = gtod->raw_clock.base_cycles;
2472		ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2473		ns >>= gtod->raw_clock.shift;
2474		ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2475	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2476	*t = ns;
2477
2478	return mode;
2479}
2480
2481static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
2482{
2483	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2484	unsigned long seq;
2485	int mode;
2486	u64 ns;
2487
2488	do {
2489		seq = read_seqcount_begin(&gtod->seq);
2490		ts->tv_sec = gtod->wall_time_sec;
2491		ns = gtod->clock.base_cycles;
2492		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2493		ns >>= gtod->clock.shift;
2494	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2495
2496	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2497	ts->tv_nsec = ns;
2498
2499	return mode;
2500}
2501
2502/* returns true if host is using TSC based clocksource */
2503static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
2504{
2505	/* checked again under seqlock below */
2506	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2507		return false;
2508
2509	return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
2510						      tsc_timestamp));
2511}
2512
2513/* returns true if host is using TSC based clocksource */
2514static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
2515					   u64 *tsc_timestamp)
2516{
2517	/* checked again under seqlock below */
2518	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2519		return false;
2520
2521	return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
2522}
2523#endif
2524
2525/*
2526 *
2527 * Assuming a stable TSC across physical CPUS, and a stable TSC
2528 * across virtual CPUs, the following condition is possible.
2529 * Each numbered line represents an event visible to both
2530 * CPUs at the next numbered event.
2531 *
2532 * "timespecX" represents host monotonic time. "tscX" represents
2533 * RDTSC value.
2534 *
2535 * 		VCPU0 on CPU0		|	VCPU1 on CPU1
2536 *
2537 * 1.  read timespec0,tsc0
2538 * 2.					| timespec1 = timespec0 + N
2539 * 					| tsc1 = tsc0 + M
2540 * 3. transition to guest		| transition to guest
2541 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2542 * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
2543 * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2544 *
2545 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2546 *
2547 * 	- ret0 < ret1
2548 *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2549 *		...
2550 *	- 0 < N - M => M < N
2551 *
2552 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2553 * always the case (the difference between two distinct xtime instances
2554 * might be smaller than the difference between corresponding TSC reads,
2555 * when updating guest vcpus' pvclock areas).
2556 *
2557 * To avoid that problem, do not allow visibility of distinct
2558 * system_timestamp/tsc_timestamp values simultaneously: use a master
2559 * copy of host monotonic time values. Update that master copy
2560 * in lockstep.
2561 *
2562 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
2563 *
2564 */
2565
2566static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2567{
2568#ifdef CONFIG_X86_64
2569	struct kvm_arch *ka = &kvm->arch;
2570	int vclock_mode;
2571	bool host_tsc_clocksource, vcpus_matched;
2572
2573	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2574			atomic_read(&kvm->online_vcpus));
2575
2576	/*
2577	 * If the host uses the TSC clocksource, then pass the TSC through
2578	 * to the guest as stable.
2579	 */
2580	host_tsc_clocksource = kvm_get_time_and_clockread(
2581					&ka->master_kernel_ns,
2582					&ka->master_cycle_now);
2583
2584	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
2585				&& !ka->backwards_tsc_observed
2586				&& !ka->boot_vcpu_runs_old_kvmclock;
2587
2588	if (ka->use_master_clock)
2589		atomic_set(&kvm_guest_has_master_clock, 1);
2590
2591	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
2592	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2593					vcpus_matched);
2594#endif
2595}
2596
2597void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2598{
2599	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2600}
2601
2602static void kvm_gen_update_masterclock(struct kvm *kvm)
2603{
2604#ifdef CONFIG_X86_64
2605	int i;
2606	struct kvm_vcpu *vcpu;
2607	struct kvm_arch *ka = &kvm->arch;
2608
2609	spin_lock(&ka->pvclock_gtod_sync_lock);
2610	kvm_make_mclock_inprogress_request(kvm);
2611	/* no guest entries from this point */
2612	pvclock_update_vm_gtod_copy(kvm);
2613
2614	kvm_for_each_vcpu(i, vcpu, kvm)
2615		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2616
2617	/* guest entries allowed */
2618	kvm_for_each_vcpu(i, vcpu, kvm)
2619		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2620
2621	spin_unlock(&ka->pvclock_gtod_sync_lock);
2622#endif
2623}
2624
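/*
 * Compute the current kvmclock value the same way the guest would: when the
 * master clock is active, scale the TSC delta since master_cycle_now into
 * nanoseconds (__pvclock_read_cycles() does that arithmetic) and add it to
 * master_kernel_ns plus the per-VM kvmclock_offset; otherwise fall back to
 * the boot-based raw clock plus kvmclock_offset.
 */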
2625u64 get_kvmclock_ns(struct kvm *kvm)
2626{
2627	struct kvm_arch *ka = &kvm->arch;
2628	struct pvclock_vcpu_time_info hv_clock;
2629	u64 ret;
2630
2631	spin_lock(&ka->pvclock_gtod_sync_lock);
2632	if (!ka->use_master_clock) {
2633		spin_unlock(&ka->pvclock_gtod_sync_lock);
2634		return get_kvmclock_base_ns() + ka->kvmclock_offset;
2635	}
2636
2637	hv_clock.tsc_timestamp = ka->master_cycle_now;
2638	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
2639	spin_unlock(&ka->pvclock_gtod_sync_lock);
2640
2641	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
2642	get_cpu();
2643
2644	if (__this_cpu_read(cpu_tsc_khz)) {
2645		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
2646				   &hv_clock.tsc_shift,
2647				   &hv_clock.tsc_to_system_mul);
2648		ret = __pvclock_read_cycles(&hv_clock, rdtsc());
2649	} else
2650		ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
2651
2652	put_cpu();
2653
2654	return ret;
2655}
2656
2657static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
2658{
2659	struct kvm_vcpu_arch *vcpu = &v->arch;
2660	struct pvclock_vcpu_time_info guest_hv_clock;
2661
2662	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
2663		&guest_hv_clock, sizeof(guest_hv_clock))))
2664		return;
2665
2666	/* This VCPU is paused, but it's legal for a guest to read another
2667	 * VCPU's kvmclock, so we really have to follow the specification where
2668	 * it says that version is odd if data is being modified, and even after
2669	 * it is consistent.
2670	 *
2671	 * Version field updates must be kept separate.  This is because
2672	 * kvm_write_guest_cached might use a "rep movs" instruction, and
2673	 * writes within a string instruction are weakly ordered.  So there
2674	 * are three writes overall.
2675	 *
2676	 * As a small optimization, only write the version field in the first
2677	 * and third write.  The vcpu->pv_time cache is still valid, because the
2678	 * version field is the first in the struct.
2679	 */
2680	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
2681
2682	if (guest_hv_clock.version & 1)
2683		++guest_hv_clock.version;  /* first time write, random junk */
2684
2685	vcpu->hv_clock.version = guest_hv_clock.version + 1;
2686	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2687				&vcpu->hv_clock,
2688				sizeof(vcpu->hv_clock.version));
2689
2690	smp_wmb();
2691
2692	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2693	vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2694
2695	if (vcpu->pvclock_set_guest_stopped_request) {
2696		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2697		vcpu->pvclock_set_guest_stopped_request = false;
2698	}
2699
2700	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2701
2702	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2703				&vcpu->hv_clock,
2704				sizeof(vcpu->hv_clock));
2705
2706	smp_wmb();
2707
2708	vcpu->hv_clock.version++;
2709	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2710				&vcpu->hv_clock,
2711				sizeof(vcpu->hv_clock.version));
2712}
2713
2714static int kvm_guest_time_update(struct kvm_vcpu *v)
2715{
2716	unsigned long flags, tgt_tsc_khz;
2717	struct kvm_vcpu_arch *vcpu = &v->arch;
2718	struct kvm_arch *ka = &v->kvm->arch;
2719	s64 kernel_ns;
2720	u64 tsc_timestamp, host_tsc;
2721	u8 pvclock_flags;
2722	bool use_master_clock;
2723
2724	kernel_ns = 0;
2725	host_tsc = 0;
2726
2727	/*
2728	 * If the host uses the TSC clocksource, then pass the TSC through
2729	 * to the guest as stable.
2730	 */
2731	spin_lock(&ka->pvclock_gtod_sync_lock);
2732	use_master_clock = ka->use_master_clock;
2733	if (use_master_clock) {
2734		host_tsc = ka->master_cycle_now;
2735		kernel_ns = ka->master_kernel_ns;
2736	}
2737	spin_unlock(&ka->pvclock_gtod_sync_lock);
2738
2739	/* Keep irq disabled to prevent changes to the clock */
2740	local_irq_save(flags);
2741	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2742	if (unlikely(tgt_tsc_khz == 0)) {
2743		local_irq_restore(flags);
2744		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2745		return 1;
2746	}
2747	if (!use_master_clock) {
2748		host_tsc = rdtsc();
2749		kernel_ns = get_kvmclock_base_ns();
2750	}
2751
2752	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2753
2754	/*
2755	 * We may have to catch up the TSC to match elapsed wall clock
2756	 * time for two reasons, even if kvmclock is used.
2757	 *   1) CPU could have been running below the maximum TSC rate
2758	 *   2) Broken TSC compensation resets the base at each VCPU
2759	 *      entry to avoid unknown leaps of TSC even when running
2760	 *      again on the same CPU.  This may cause apparent elapsed
2761	 *      time to disappear, and the guest to stand still or run
2762	 *	very slowly.
2763	 */
2764	if (vcpu->tsc_catchup) {
2765		u64 tsc = compute_guest_tsc(v, kernel_ns);
2766		if (tsc > tsc_timestamp) {
2767			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2768			tsc_timestamp = tsc;
2769		}
2770	}
2771
2772	local_irq_restore(flags);
2773
2774	/* With all the info we got, fill in the values */
2775
2776	if (kvm_has_tsc_control)
2777		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2778
2779	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2780		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2781				   &vcpu->hv_clock.tsc_shift,
2782				   &vcpu->hv_clock.tsc_to_system_mul);
2783		vcpu->hw_tsc_khz = tgt_tsc_khz;
2784	}
2785
2786	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2787	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2788	vcpu->last_guest_tsc = tsc_timestamp;
2789
2790	/* If the host uses TSC clocksource, then it is stable */
2791	pvclock_flags = 0;
2792	if (use_master_clock)
2793		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2794
2795	vcpu->hv_clock.flags = pvclock_flags;
2796
2797	if (vcpu->pv_time_enabled)
2798		kvm_setup_pvclock_page(v);
2799	if (v == kvm_get_vcpu(v->kvm, 0))
2800		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2801	return 0;
2802}
2803
2804/*
2805 * kvmclock updates which are isolated to a given vcpu, such as
2806 * vcpu->cpu migration, should not allow system_timestamp from
2807 * the rest of the vcpus to remain static. Otherwise ntp frequency
2808 * correction applies to one vcpu's system_timestamp but not
2809 * the others.
2810 *
2811 * So in those cases, request a kvmclock update for all vcpus.
2812 * We need to rate-limit these requests though, as they can
2813 * considerably slow guests that have a large number of vcpus.
2814 * The time for a remote vcpu to update its kvmclock is bound
2815 * by the delay we use to rate-limit the updates.
2816 */
2817
2818#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2819
2820static void kvmclock_update_fn(struct work_struct *work)
2821{
2822	int i;
2823	struct delayed_work *dwork = to_delayed_work(work);
2824	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2825					   kvmclock_update_work);
2826	struct kvm *kvm = container_of(ka, struct kvm, arch);
2827	struct kvm_vcpu *vcpu;
2828
2829	kvm_for_each_vcpu(i, vcpu, kvm) {
2830		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2831		kvm_vcpu_kick(vcpu);
2832	}
2833}
2834
2835static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2836{
2837	struct kvm *kvm = v->kvm;
2838
2839	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2840	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2841					KVMCLOCK_UPDATE_DELAY);
2842}
2843
2844#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2845
2846static void kvmclock_sync_fn(struct work_struct *work)
2847{
2848	struct delayed_work *dwork = to_delayed_work(work);
2849	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2850					   kvmclock_sync_work);
2851	struct kvm *kvm = container_of(ka, struct kvm, arch);
2852
2853	if (!kvmclock_periodic_sync)
2854		return;
2855
2856	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2857	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2858					KVMCLOCK_SYNC_PERIOD);
2859}
2860
2861/*
2862 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2863 */
2864static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2865{
2866	/* McStatusWrEn enabled? */
2867	if (guest_cpuid_is_amd_or_hygon(vcpu))
2868		return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2869
2870	return false;
2871}
2872
2873static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2874{
2875	u64 mcg_cap = vcpu->arch.mcg_cap;
2876	unsigned bank_num = mcg_cap & 0xff;
2877	u32 msr = msr_info->index;
2878	u64 data = msr_info->data;
2879
2880	switch (msr) {
2881	case MSR_IA32_MCG_STATUS:
2882		vcpu->arch.mcg_status = data;
2883		break;
2884	case MSR_IA32_MCG_CTL:
2885		if (!(mcg_cap & MCG_CTL_P) &&
2886		    (data || !msr_info->host_initiated))
2887			return 1;
2888		if (data != 0 && data != ~(u64)0)
2889			return 1;
2890		vcpu->arch.mcg_ctl = data;
2891		break;
2892	default:
2893		if (msr >= MSR_IA32_MC0_CTL &&
2894		    msr < MSR_IA32_MCx_CTL(bank_num)) {
2895			u32 offset = array_index_nospec(
2896				msr - MSR_IA32_MC0_CTL,
2897				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
2898
2899			/* Only 0 or all 1s can be written to IA32_MCi_CTL.
2900			 * Some Linux kernels, though, clear bit 10 in bank 4 to
2901			 * work around a BIOS/GART TBL issue on AMD K8s; ignore
2902			 * this to avoid an uncaught #GP in the guest.
2903			 *
2904			 * UNIXWARE clears bit 0 of MC1_CTL to ignore
2905			 * correctable, single-bit ECC data errors.
2906			 */
2907			if ((offset & 0x3) == 0 &&
2908			    data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
2909				return 1;
2910
2911			/* MCi_STATUS */
2912			if (!msr_info->host_initiated &&
2913			    (offset & 0x3) == 1 && data != 0) {
2914				if (!can_set_mci_status(vcpu))
2915					return 1;
2916			}
2917
2918			vcpu->arch.mce_banks[offset] = data;
2919			break;
2920		}
2921		return 1;
2922	}
2923	return 0;
2924}
2925
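/*
 * A write to the Xen HVM config MSR requests one page of the hypercall blob
 * registered by userspace: the low bits of the written value select the page
 * index within the blob, and the page-aligned high bits give the guest
 * physical address that page is copied to.
 */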
2926static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2927{
2928	struct kvm *kvm = vcpu->kvm;
2929	int lm = is_long_mode(vcpu);
2930	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2931		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2932	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2933		: kvm->arch.xen_hvm_config.blob_size_32;
2934	u32 page_num = data & ~PAGE_MASK;
2935	u64 page_addr = data & PAGE_MASK;
2936	u8 *page;
2937
2938	if (page_num >= blob_size)
2939		return 1;
2940
2941	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2942	if (IS_ERR(page))
2943		return PTR_ERR(page);
2944
2945	if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
2946		kfree(page);
2947		return 1;
2948	}
2949	return 0;
2950}
2951
2952static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
2953{
2954	u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
2955
2956	return (vcpu->arch.apf.msr_en_val & mask) == mask;
2957}
2958
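/*
 * Layout of MSR_KVM_ASYNC_PF_EN as consumed below (bit numbers per the KVM
 * paravirt ABI): bit 0 enables async page faults, bit 1 requests delivery
 * even while the guest is in kernel mode, bit 2 selects delivery as a
 * synthetic #PF VM exit for nested setups, bit 3 selects interrupt-based
 * delivery, bits 4:5 are reserved, and the remaining bits form the 64-byte
 * aligned guest address of the shared async-PF data area.
 */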
2959static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2960{
2961	gpa_t gpa = data & ~0x3f;
2962
2963	/* Bits 4:5 are reserved and should be zero */
2964	if (data & 0x30)
2965		return 1;
2966
2967	if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
2968	    (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
2969		return 1;
2970
2971	if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
2972	    (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
2973		return 1;
2974
2975	if (!lapic_in_kernel(vcpu))
2976		return data ? 1 : 0;
2977
2978	vcpu->arch.apf.msr_en_val = data;
2979
2980	if (!kvm_pv_async_pf_enabled(vcpu)) {
2981		kvm_clear_async_pf_completion_queue(vcpu);
2982		kvm_async_pf_hash_reset(vcpu);
2983		return 0;
2984	}
2985
2986	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2987					sizeof(u64)))
2988		return 1;
2989
2990	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2991	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2992
2993	kvm_async_pf_wakeup_all(vcpu);
2994
2995	return 0;
2996}
2997
2998static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
2999{
3000	/* Bits 8-63 are reserved */
3001	if (data >> 8)
3002		return 1;
3003
3004	if (!lapic_in_kernel(vcpu))
3005		return 1;
3006
3007	vcpu->arch.apf.msr_int_val = data;
3008
3009	vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3010
3011	return 0;
3012}
3013
3014static void kvmclock_reset(struct kvm_vcpu *vcpu)
3015{
3016	vcpu->arch.pv_time_enabled = false;
3017	vcpu->arch.time = 0;
3018}
3019
3020static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
3021{
3022	++vcpu->stat.tlb_flush;
3023	kvm_x86_ops.tlb_flush_all(vcpu);
3024}
3025
3026static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3027{
3028	++vcpu->stat.tlb_flush;
3029	kvm_x86_ops.tlb_flush_guest(vcpu);
3030}
3031
3032static void record_steal_time(struct kvm_vcpu *vcpu)
3033{
3034	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3035	struct kvm_steal_time __user *st;
3036	struct kvm_memslots *slots;
3037	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3038	u64 steal;
3039	u32 version;
3040
3041	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3042		return;
3043
3044	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
3045		return;
3046
3047	slots = kvm_memslots(vcpu->kvm);
3048
3049	if (unlikely(slots->generation != ghc->generation ||
3050		     gpa != ghc->gpa ||
3051		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3052		/* We rely on the fact that it fits in a single page. */
3053		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3054
3055		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3056		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3057			return;
3058	}
3059
3060	st = (struct kvm_steal_time __user *)ghc->hva;
3061	/*
3062	 * Doing a TLB flush here, on the guest's behalf, can avoid
3063	 * expensive IPIs.
3064	 */
3065	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3066		u8 st_preempted = 0;
3067		int err = -EFAULT;
3068
3069		if (!user_access_begin(st, sizeof(*st)))
3070			return;
3071
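		/*
		 * Atomically exchange st->preempted with 0: st_preempted ends
		 * up holding the old value and the guest-visible flag is
		 * cleared.  On success the "xor" clears err; a fault on the
		 * user access jumps past it via the exception table, leaving
		 * err == -EFAULT.
		 */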
3072		asm volatile("1: xchgb %0, %2\n"
3073			     "xor %1, %1\n"
3074			     "2:\n"
3075			     _ASM_EXTABLE_UA(1b, 2b)
3076			     : "+q" (st_preempted),
3077			       "+&r" (err),
3078			       "+m" (st->preempted));
3079		if (err)
3080			goto out;
3081
3082		user_access_end();
3083
3084		vcpu->arch.st.preempted = 0;
3085
3086		trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3087				       st_preempted & KVM_VCPU_FLUSH_TLB);
3088		if (st_preempted & KVM_VCPU_FLUSH_TLB)
3089			kvm_vcpu_flush_tlb_guest(vcpu);
3090
3091		if (!user_access_begin(st, sizeof(*st)))
3092			goto dirty;
3093	} else {
3094		if (!user_access_begin(st, sizeof(*st)))
3095			return;
3096
3097		unsafe_put_user(0, &st->preempted, out);
3098		vcpu->arch.st.preempted = 0;
3099	}
3100
3101	unsafe_get_user(version, &st->version, out);
3102	if (version & 1)
3103		version += 1;  /* first time write, random junk */
3104
3105	version += 1;
3106	unsafe_put_user(version, &st->version, out);
3107
3108	smp_wmb();
3109
3110	unsafe_get_user(steal, &st->steal, out);
3111	steal += current->sched_info.run_delay -
3112		vcpu->arch.st.last_steal;
3113	vcpu->arch.st.last_steal = current->sched_info.run_delay;
3114	unsafe_put_user(steal, &st->steal, out);
3115
3116	version += 1;
3117	unsafe_put_user(version, &st->version, out);
3118
3119 out:
3120	user_access_end();
3121 dirty:
3122	mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
3123}
3124
3125int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3126{
3127	bool pr = false;
3128	u32 msr = msr_info->index;
3129	u64 data = msr_info->data;
3130
3131	switch (msr) {
3132	case MSR_AMD64_NB_CFG:
3133	case MSR_IA32_UCODE_WRITE:
3134	case MSR_VM_HSAVE_PA:
3135	case MSR_AMD64_PATCH_LOADER:
3136	case MSR_AMD64_BU_CFG2:
3137	case MSR_AMD64_DC_CFG:
3138	case MSR_AMD64_TW_CFG:
3139	case MSR_F15H_EX_CFG:
3140		break;
3141
3142	case MSR_IA32_UCODE_REV:
3143		if (msr_info->host_initiated)
3144			vcpu->arch.microcode_version = data;
3145		break;
3146	case MSR_IA32_ARCH_CAPABILITIES:
3147		if (!msr_info->host_initiated)
3148			return 1;
3149		vcpu->arch.arch_capabilities = data;
3150		break;
3151	case MSR_IA32_PERF_CAPABILITIES: {
3152		struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3153
3154		if (!msr_info->host_initiated)
3155			return 1;
3156		if (kvm_get_msr_feature(&msr_ent))
3157			return 1;
3158		if (data & ~msr_ent.data)
3159			return 1;
3160
3161		vcpu->arch.perf_capabilities = data;
3162
3163		return 0;
3164		}
3165	case MSR_EFER:
3166		return set_efer(vcpu, msr_info);
3167	case MSR_K7_HWCR:
3168		data &= ~(u64)0x40;	/* ignore flush filter disable */
3169		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
3170		data &= ~(u64)0x8;	/* ignore TLB cache disable */
3171
3172		/* Handle McStatusWrEn */
3173		if (data == BIT_ULL(18)) {
3174			vcpu->arch.msr_hwcr = data;
3175		} else if (data != 0) {
3176			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
3177				    data);
3178			return 1;
3179		}
3180		break;
3181	case MSR_FAM10H_MMIO_CONF_BASE:
3182		if (data != 0) {
3183			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
3184				    "0x%llx\n", data);
3185			return 1;
3186		}
3187		break;
3188	case MSR_IA32_DEBUGCTLMSR:
3189		if (!data) {
3190			/* We support the non-activated case already */
3191			break;
3192		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
3193			/* Values other than LBR and BTF are vendor-specific,
3194			 * thus reserved and should throw a #GP */
3195			return 1;
3196		} else if (report_ignored_msrs)
3197			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
3198				    __func__, data);
3199		break;
3200	case 0x200 ... 0x2ff:
3201		return kvm_mtrr_set_msr(vcpu, msr, data);
3202	case MSR_IA32_APICBASE:
3203		return kvm_set_apic_base(vcpu, msr_info);
3204	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3205		return kvm_x2apic_msr_write(vcpu, msr, data);
3206	case MSR_IA32_TSCDEADLINE:
3207		kvm_set_lapic_tscdeadline_msr(vcpu, data);
3208		break;
3209	case MSR_IA32_TSC_ADJUST:
3210		if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
3211			if (!msr_info->host_initiated) {
3212				s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
3213				adjust_tsc_offset_guest(vcpu, adj);
3214				/* Before returning to the guest, tsc_timestamp must be adjusted
3215				 * as well, otherwise the guest's percpu pvclock time could jump.
3216				 */
3217				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3218			}
3219			vcpu->arch.ia32_tsc_adjust_msr = data;
3220		}
3221		break;
3222	case MSR_IA32_MISC_ENABLE:
3223		if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3224		    ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3225			if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3226				return 1;
3227			vcpu->arch.ia32_misc_enable_msr = data;
3228			kvm_update_cpuid_runtime(vcpu);
3229		} else {
3230			vcpu->arch.ia32_misc_enable_msr = data;
3231		}
3232		break;
3233	case MSR_IA32_SMBASE:
3234		if (!msr_info->host_initiated)
3235			return 1;
3236		vcpu->arch.smbase = data;
3237		break;
3238	case MSR_IA32_POWER_CTL:
3239		vcpu->arch.msr_ia32_power_ctl = data;
3240		break;
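	/*
	 * Host-initiated TSC writes go through full synchronization in
	 * kvm_synchronize_tsc(); guest writes are turned into an offset
	 * adjustment and mirrored into IA32_TSC_ADJUST, matching the
	 * architectural coupling between the two MSRs.
	 */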
3241	case MSR_IA32_TSC:
3242		if (msr_info->host_initiated) {
3243			kvm_synchronize_tsc(vcpu, data);
3244		} else {
3245			u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3246			adjust_tsc_offset_guest(vcpu, adj);
3247			vcpu->arch.ia32_tsc_adjust_msr += adj;
3248		}
3249		break;
3250	case MSR_IA32_XSS:
3251		if (!msr_info->host_initiated &&
3252		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3253			return 1;
3254		/*
3255		 * KVM supports exposing PT to the guest, but does not support
3256		 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3257		 * XSAVES/XRSTORS to save/restore PT MSRs.
3258		 */
3259		if (data & ~supported_xss)
3260			return 1;
3261		vcpu->arch.ia32_xss = data;
3262		kvm_update_cpuid_runtime(vcpu);
3263		break;
3264	case MSR_SMI_COUNT:
3265		if (!msr_info->host_initiated)
3266			return 1;
3267		vcpu->arch.smi_count = data;
3268		break;
3269	case MSR_KVM_WALL_CLOCK_NEW:
3270		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3271			return 1;
3272
3273		kvm_write_wall_clock(vcpu->kvm, data);
3274		break;
3275	case MSR_KVM_WALL_CLOCK:
3276		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3277			return 1;
3278
3279		kvm_write_wall_clock(vcpu->kvm, data);
3280		break;
3281	case MSR_KVM_SYSTEM_TIME_NEW:
3282		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3283			return 1;
3284
3285		kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
3286		break;
3287	case MSR_KVM_SYSTEM_TIME:
3288		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3289			return 1;
3290
3291		kvm_write_system_time(vcpu, data, true,  msr_info->host_initiated);
3292		break;
3293	case MSR_KVM_ASYNC_PF_EN:
3294		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3295			return 1;
3296
3297		if (kvm_pv_enable_async_pf(vcpu, data))
3298			return 1;
3299		break;
3300	case MSR_KVM_ASYNC_PF_INT:
3301		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3302			return 1;
3303
3304		if (kvm_pv_enable_async_pf_int(vcpu, data))
3305			return 1;
3306		break;
3307	case MSR_KVM_ASYNC_PF_ACK:
3308		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3309			return 1;
3310		if (data & 0x1) {
3311			vcpu->arch.apf.pageready_pending = false;
3312			kvm_check_async_pf_completion(vcpu);
3313		}
3314		break;
3315	case MSR_KVM_STEAL_TIME:
3316		if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3317			return 1;
3318
3319		if (unlikely(!sched_info_on()))
3320			return 1;
3321
3322		if (data & KVM_STEAL_RESERVED_MASK)
3323			return 1;
3324
3325		vcpu->arch.st.msr_val = data;
3326
3327		if (!(data & KVM_MSR_ENABLED))
3328			break;
3329
3330		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3331
3332		break;
3333	case MSR_KVM_PV_EOI_EN:
3334		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3335			return 1;
3336
3337		if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
3338			return 1;
3339		break;
3340
3341	case MSR_KVM_POLL_CONTROL:
3342		if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3343			return 1;
3344
3345		/* only enable bit supported */
3346		if (data & (-1ULL << 1))
3347			return 1;
3348
3349		vcpu->arch.msr_kvm_poll_control = data;
3350		break;
3351
3352	case MSR_IA32_MCG_CTL:
3353	case MSR_IA32_MCG_STATUS:
3354	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3355		return set_msr_mce(vcpu, msr_info);
3356
3357	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3358	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3359		pr = true;
3360		fallthrough;
3361	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3362	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3363		if (kvm_pmu_is_valid_msr(vcpu, msr))
3364			return kvm_pmu_set_msr(vcpu, msr_info);
3365
3366		if (pr || data != 0)
3367			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
3368				    "0x%x data 0x%llx\n", msr, data);
3369		break;
3370	case MSR_K7_CLK_CTL:
3371		/*
3372		 * Ignore all writes to this no longer documented MSR.
3373		 * Writes are only relevant for old K7 processors,
3374		 * all pre-dating SVM, where this write was a workaround
3375		 * recommended by AMD. It is possible to specify the
3376		 * affected processor models on the command line, hence
3377		 * the need to ignore the workaround.
3378		 */
3379		break;
3380	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3381	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3382	case HV_X64_MSR_SYNDBG_OPTIONS:
3383	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3384	case HV_X64_MSR_CRASH_CTL:
3385	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3386	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3387	case HV_X64_MSR_TSC_EMULATION_CONTROL:
3388	case HV_X64_MSR_TSC_EMULATION_STATUS:
3389		return kvm_hv_set_msr_common(vcpu, msr, data,
3390					     msr_info->host_initiated);
3391	case MSR_IA32_BBL_CR_CTL3:
3392		/* Drop writes to this legacy MSR -- see rdmsr
3393		 * counterpart for further detail.
3394		 */
3395		if (report_ignored_msrs)
3396			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
3397				msr, data);
3398		break;
3399	case MSR_AMD64_OSVW_ID_LENGTH:
3400		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3401			return 1;
3402		vcpu->arch.osvw.length = data;
3403		break;
3404	case MSR_AMD64_OSVW_STATUS:
3405		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3406			return 1;
3407		vcpu->arch.osvw.status = data;
3408		break;
3409	case MSR_PLATFORM_INFO:
3410		if (!msr_info->host_initiated ||
3411		    (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
3412		     cpuid_fault_enabled(vcpu)))
3413			return 1;
3414		vcpu->arch.msr_platform_info = data;
3415		break;
3416	case MSR_MISC_FEATURES_ENABLES:
3417		if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
3418		    (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
3419		     !supports_cpuid_fault(vcpu)))
3420			return 1;
3421		vcpu->arch.msr_misc_features_enables = data;
3422		break;
3423	default:
3424		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
3425			return xen_hvm_config(vcpu, data);
3426		if (kvm_pmu_is_valid_msr(vcpu, msr))
3427			return kvm_pmu_set_msr(vcpu, msr_info);
3428		return KVM_MSR_RET_INVALID;
3429	}
3430	return 0;
3431}
3432EXPORT_SYMBOL_GPL(kvm_set_msr_common);
3433
3434static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
3435{
3436	u64 data;
3437	u64 mcg_cap = vcpu->arch.mcg_cap;
3438	unsigned bank_num = mcg_cap & 0xff;
3439
3440	switch (msr) {
3441	case MSR_IA32_P5_MC_ADDR:
3442	case MSR_IA32_P5_MC_TYPE:
3443		data = 0;
3444		break;
3445	case MSR_IA32_MCG_CAP:
3446		data = vcpu->arch.mcg_cap;
3447		break;
3448	case MSR_IA32_MCG_CTL:
3449		if (!(mcg_cap & MCG_CTL_P) && !host)
3450			return 1;
3451		data = vcpu->arch.mcg_ctl;
3452		break;
3453	case MSR_IA32_MCG_STATUS:
3454		data = vcpu->arch.mcg_status;
3455		break;
3456	default:
3457		if (msr >= MSR_IA32_MC0_CTL &&
3458		    msr < MSR_IA32_MCx_CTL(bank_num)) {
3459			u32 offset = array_index_nospec(
3460				msr - MSR_IA32_MC0_CTL,
3461				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
3462
3463			data = vcpu->arch.mce_banks[offset];
3464			break;
3465		}
3466		return 1;
3467	}
3468	*pdata = data;
3469	return 0;
3470}
3471
3472int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3473{
3474	switch (msr_info->index) {
3475	case MSR_IA32_PLATFORM_ID:
3476	case MSR_IA32_EBL_CR_POWERON:
3477	case MSR_IA32_DEBUGCTLMSR:
3478	case MSR_IA32_LASTBRANCHFROMIP:
3479	case MSR_IA32_LASTBRANCHTOIP:
3480	case MSR_IA32_LASTINTFROMIP:
3481	case MSR_IA32_LASTINTTOIP:
3482	case MSR_K8_SYSCFG:
3483	case MSR_K8_TSEG_ADDR:
3484	case MSR_K8_TSEG_MASK:
3485	case MSR_VM_HSAVE_PA:
3486	case MSR_K8_INT_PENDING_MSG:
3487	case MSR_AMD64_NB_CFG:
3488	case MSR_FAM10H_MMIO_CONF_BASE:
3489	case MSR_AMD64_BU_CFG2:
3490	case MSR_IA32_PERF_CTL:
3491	case MSR_AMD64_DC_CFG:
3492	case MSR_AMD64_TW_CFG:
3493	case MSR_F15H_EX_CFG:
3494	/*
3495	 * Intel Sandy Bridge CPUs must support the RAPL (running average power
3496	 * limit) MSRs. Just return 0, as we do not want to expose the host
3497	 * data here. Do not conditionalize this on CPUID, as KVM does not do
3498	 * so for existing CPU-specific MSRs.
3499	 */
3500	case MSR_RAPL_POWER_UNIT:
3501	case MSR_PP0_ENERGY_STATUS:	/* Power plane 0 (core) */
3502	case MSR_PP1_ENERGY_STATUS:	/* Power plane 1 (graphics uncore) */
3503	case MSR_PKG_ENERGY_STATUS:	/* Total package */
3504	case MSR_DRAM_ENERGY_STATUS:	/* DRAM controller */
3505		msr_info->data = 0;
3506		break;
3507	case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
3508	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3509	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3510	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3511	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3512		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3513			return kvm_pmu_get_msr(vcpu, msr_info);
3514		msr_info->data = 0;
3515		break;
3516	case MSR_IA32_UCODE_REV:
3517		msr_info->data = vcpu->arch.microcode_version;
3518		break;
3519	case MSR_IA32_ARCH_CAPABILITIES:
3520		if (!msr_info->host_initiated &&
3521		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
3522			return 1;
3523		msr_info->data = vcpu->arch.arch_capabilities;
3524		break;
3525	case MSR_IA32_PERF_CAPABILITIES:
3526		if (!msr_info->host_initiated &&
3527		    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3528			return 1;
3529		msr_info->data = vcpu->arch.perf_capabilities;
3530		break;
3531	case MSR_IA32_POWER_CTL:
3532		msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3533		break;
3534	case MSR_IA32_TSC: {
3535		/*
3536		 * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3537		 * even when not intercepted. AMD manual doesn't explicitly
3538		 * state this but appears to behave the same.
3539		 *
3540		 * On userspace reads and writes, however, we unconditionally
3541		 * return L1's TSC value to ensure backwards-compatible
3542		 * behavior for migration.
3543		 */
3544		u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
3545							    vcpu->arch.tsc_offset;
3546
3547		msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
3548		break;
3549	}
3550	case MSR_MTRRcap:
3551	case 0x200 ... 0x2ff:
3552		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
3553	case 0xcd: /* fsb frequency */
3554		msr_info->data = 3;
3555		break;
3556		/*
3557		 * MSR_EBC_FREQUENCY_ID
3558		 * Conservative value valid for even the basic CPU models.
3559		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
3560		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
3561		 * and 266MHz for models 3 and 4. Set Core Clock
3562		 * Frequency to System Bus Frequency Ratio to 1 (bits
3563		 * 31:24) even though these are only valid for CPU
3564		 * models > 2, however guests may end up dividing or
3565		 * multiplying by zero otherwise.
3566		 */
3567	case MSR_EBC_FREQUENCY_ID:
3568		msr_info->data = 1 << 24;
3569		break;
3570	case MSR_IA32_APICBASE:
3571		msr_info->data = kvm_get_apic_base(vcpu);
3572		break;
3573	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3574		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
3575	case MSR_IA32_TSCDEADLINE:
3576		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
3577		break;
3578	case MSR_IA32_TSC_ADJUST:
3579		msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
3580		break;
3581	case MSR_IA32_MISC_ENABLE:
3582		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
3583		break;
3584	case MSR_IA32_SMBASE:
3585		if (!msr_info->host_initiated)
3586			return 1;
3587		msr_info->data = vcpu->arch.smbase;
3588		break;
3589	case MSR_SMI_COUNT:
3590		msr_info->data = vcpu->arch.smi_count;
3591		break;
3592	case MSR_IA32_PERF_STATUS:
3593		/* TSC increment by tick */
3594		msr_info->data = 1000ULL;
3595		/* CPU multiplier */
3596		msr_info->data |= (((uint64_t)4ULL) << 40);
3597		break;
3598	case MSR_EFER:
3599		msr_info->data = vcpu->arch.efer;
3600		break;
3601	case MSR_KVM_WALL_CLOCK:
3602		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3603			return 1;
3604
3605		msr_info->data = vcpu->kvm->arch.wall_clock;
3606		break;
3607	case MSR_KVM_WALL_CLOCK_NEW:
3608		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3609			return 1;
3610
3611		msr_info->data = vcpu->kvm->arch.wall_clock;
3612		break;
3613	case MSR_KVM_SYSTEM_TIME:
3614		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3615			return 1;
3616
3617		msr_info->data = vcpu->arch.time;
3618		break;
3619	case MSR_KVM_SYSTEM_TIME_NEW:
3620		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3621			return 1;
3622
3623		msr_info->data = vcpu->arch.time;
3624		break;
3625	case MSR_KVM_ASYNC_PF_EN:
3626		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3627			return 1;
3628
3629		msr_info->data = vcpu->arch.apf.msr_en_val;
3630		break;
3631	case MSR_KVM_ASYNC_PF_INT:
3632		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3633			return 1;
3634
3635		msr_info->data = vcpu->arch.apf.msr_int_val;
3636		break;
3637	case MSR_KVM_ASYNC_PF_ACK:
3638		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3639			return 1;
3640
3641		msr_info->data = 0;
3642		break;
3643	case MSR_KVM_STEAL_TIME:
3644		if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3645			return 1;
3646
3647		msr_info->data = vcpu->arch.st.msr_val;
3648		break;
3649	case MSR_KVM_PV_EOI_EN:
3650		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3651			return 1;
3652
3653		msr_info->data = vcpu->arch.pv_eoi.msr_val;
3654		break;
3655	case MSR_KVM_POLL_CONTROL:
3656		if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3657			return 1;
3658
3659		msr_info->data = vcpu->arch.msr_kvm_poll_control;
3660		break;
3661	case MSR_IA32_P5_MC_ADDR:
3662	case MSR_IA32_P5_MC_TYPE:
3663	case MSR_IA32_MCG_CAP:
3664	case MSR_IA32_MCG_CTL:
3665	case MSR_IA32_MCG_STATUS:
3666	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3667		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
3668				   msr_info->host_initiated);
3669	case MSR_IA32_XSS:
3670		if (!msr_info->host_initiated &&
3671		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3672			return 1;
3673		msr_info->data = vcpu->arch.ia32_xss;
3674		break;
3675	case MSR_K7_CLK_CTL:
3676		/*
3677		 * Provide expected ramp-up count for K7. All other
3678		 * fields are set to zero, indicating minimum divisors
3679		 * for every field.
3680		 *
3681		 * This prevents guest kernels on AMD host with CPU
3682		 * type 6, model 8 and higher from exploding due to
3683		 * the rdmsr failing.
3684		 */
3685		msr_info->data = 0x20000000;
3686		break;
3687	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3688	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3689	case HV_X64_MSR_SYNDBG_OPTIONS:
3690	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3691	case HV_X64_MSR_CRASH_CTL:
3692	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3693	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3694	case HV_X64_MSR_TSC_EMULATION_CONTROL:
3695	case HV_X64_MSR_TSC_EMULATION_STATUS:
3696		return kvm_hv_get_msr_common(vcpu,
3697					     msr_info->index, &msr_info->data,
3698					     msr_info->host_initiated);
3699	case MSR_IA32_BBL_CR_CTL3:
3700		/* This legacy MSR exists but isn't fully documented in current
3701		 * silicon.  It is however accessed by winxp in very narrow
3702		 * scenarios where it sets bit #19, itself documented as
3703		 * a "reserved" bit.  Best effort attempt to source coherent
3704		 * read data here should the balance of the register be
3705		 * interpreted by the guest:
3706		 *
3707		 * L2 cache control register 3: 64GB range, 256KB size,
3708		 * enabled, latency 0x1, configured
3709		 */
3710		msr_info->data = 0xbe702111;
3711		break;
3712	case MSR_AMD64_OSVW_ID_LENGTH:
3713		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3714			return 1;
3715		msr_info->data = vcpu->arch.osvw.length;
3716		break;
3717	case MSR_AMD64_OSVW_STATUS:
3718		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3719			return 1;
3720		msr_info->data = vcpu->arch.osvw.status;
3721		break;
3722	case MSR_PLATFORM_INFO:
3723		if (!msr_info->host_initiated &&
3724		    !vcpu->kvm->arch.guest_can_read_msr_platform_info)
3725			return 1;
3726		msr_info->data = vcpu->arch.msr_platform_info;
3727		break;
3728	case MSR_MISC_FEATURES_ENABLES:
3729		msr_info->data = vcpu->arch.msr_misc_features_enables;
3730		break;
3731	case MSR_K7_HWCR:
3732		msr_info->data = vcpu->arch.msr_hwcr;
3733		break;
3734	default:
3735		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3736			return kvm_pmu_get_msr(vcpu, msr_info);
3737		return KVM_MSR_RET_INVALID;
3738	}
3739	return 0;
3740}
3741EXPORT_SYMBOL_GPL(kvm_get_msr_common);
3742
3743/*
3744 * Read or write a bunch of msrs. All parameters are kernel addresses.
3745 *
3746 * @return number of msrs processed successfully.
3747 */
3748static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
3749		    struct kvm_msr_entry *entries,
3750		    int (*do_msr)(struct kvm_vcpu *vcpu,
3751				  unsigned index, u64 *data))
3752{
3753	int i;
3754
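	/*
	 * Stop at the first MSR the callback rejects; the return value tells
	 * the caller how many entries were processed.
	 */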
3755	for (i = 0; i < msrs->nmsrs; ++i)
3756		if (do_msr(vcpu, entries[i].index, &entries[i].data))
3757			break;
3758
3759	return i;
3760}
3761
3762/*
3763 * Read or write a bunch of msrs. Parameters are user addresses.
3764 *
3765 * @return number of msrs processed successfully.
3766 */
3767static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
3768		  int (*do_msr)(struct kvm_vcpu *vcpu,
3769				unsigned index, u64 *data),
3770		  int writeback)
3771{
3772	struct kvm_msrs msrs;
3773	struct kvm_msr_entry *entries;
3774	int r, n;
3775	unsigned size;
3776
3777	r = -EFAULT;
3778	if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
3779		goto out;
3780
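	/*
	 * Bound nmsrs before computing the allocation size so userspace
	 * cannot force an oversized kernel allocation.
	 */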
3781	r = -E2BIG;
3782	if (msrs.nmsrs >= MAX_IO_MSRS)
3783		goto out;
3784
3785	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
3786	entries = memdup_user(user_msrs->entries, size);
3787	if (IS_ERR(entries)) {
3788		r = PTR_ERR(entries);
3789		goto out;
3790	}
3791
3792	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
3793	if (r < 0)
3794		goto out_free;
3795
3796	r = -EFAULT;
3797	if (writeback && copy_to_user(user_msrs->entries, entries, size))
3798		goto out_free;
3799
3800	r = n;
3801
3802out_free:
3803	kfree(entries);
3804out:
3805	return r;
3806}
3807
3808static inline bool kvm_can_mwait_in_guest(void)
3809{
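	/*
	 * Expose MWAIT to the guest only if the host has MWAIT, is not
	 * affected by the MONITOR/MWAIT erratum (X86_BUG_MONITOR), and has an
	 * always-running APIC timer (ARAT) so host timers keep ticking while
	 * the guest sits in deep C-states.
	 */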
3810	return boot_cpu_has(X86_FEATURE_MWAIT) &&
3811		!boot_cpu_has_bug(X86_BUG_MONITOR) &&
3812		boot_cpu_has(X86_FEATURE_ARAT);
3813}
3814
3815int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
3816{
3817	int r = 0;
3818
3819	switch (ext) {
3820	case KVM_CAP_IRQCHIP:
3821	case KVM_CAP_HLT:
3822	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
3823	case KVM_CAP_SET_TSS_ADDR:
3824	case KVM_CAP_EXT_CPUID:
3825	case KVM_CAP_EXT_EMUL_CPUID:
3826	case KVM_CAP_CLOCKSOURCE:
3827	case KVM_CAP_PIT:
3828	case KVM_CAP_NOP_IO_DELAY:
3829	case KVM_CAP_MP_STATE:
3830	case KVM_CAP_SYNC_MMU:
3831	case KVM_CAP_USER_NMI:
3832	case KVM_CAP_REINJECT_CONTROL:
3833	case KVM_CAP_IRQ_INJECT_STATUS:
3834	case KVM_CAP_IOEVENTFD:
3835	case KVM_CAP_IOEVENTFD_NO_LENGTH:
3836	case KVM_CAP_PIT2:
3837	case KVM_CAP_PIT_STATE2:
3838	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
3839	case KVM_CAP_XEN_HVM:
3840	case KVM_CAP_VCPU_EVENTS:
3841	case KVM_CAP_HYPERV:
3842	case KVM_CAP_HYPERV_VAPIC:
3843	case KVM_CAP_HYPERV_SPIN:
3844	case KVM_CAP_HYPERV_SYNIC:
3845	case KVM_CAP_HYPERV_SYNIC2:
3846	case KVM_CAP_HYPERV_VP_INDEX:
3847	case KVM_CAP_HYPERV_EVENTFD:
3848	case KVM_CAP_HYPERV_TLBFLUSH:
3849	case KVM_CAP_HYPERV_SEND_IPI:
3850	case KVM_CAP_HYPERV_CPUID:
3851	case KVM_CAP_PCI_SEGMENT:
3852	case KVM_CAP_DEBUGREGS:
3853	case KVM_CAP_X86_ROBUST_SINGLESTEP:
3854	case KVM_CAP_XSAVE:
3855	case KVM_CAP_ASYNC_PF:
3856	case KVM_CAP_ASYNC_PF_INT:
3857	case KVM_CAP_GET_TSC_KHZ:
3858	case KVM_CAP_KVMCLOCK_CTRL:
3859	case KVM_CAP_READONLY_MEM:
3860	case KVM_CAP_HYPERV_TIME:
3861	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
3862	case KVM_CAP_TSC_DEADLINE_TIMER:
3863	case KVM_CAP_DISABLE_QUIRKS:
3864	case KVM_CAP_SET_BOOT_CPU_ID:
3865	case KVM_CAP_SPLIT_IRQCHIP:
3866	case KVM_CAP_IMMEDIATE_EXIT:
3867	case KVM_CAP_PMU_EVENT_FILTER:
3868	case KVM_CAP_GET_MSR_FEATURES:
3869	case KVM_CAP_MSR_PLATFORM_INFO:
3870	case KVM_CAP_EXCEPTION_PAYLOAD:
3871	case KVM_CAP_SET_GUEST_DEBUG:
3872	case KVM_CAP_LAST_CPU:
3873	case KVM_CAP_X86_USER_SPACE_MSR:
3874	case KVM_CAP_X86_MSR_FILTER:
3875	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
3876		r = 1;
3877		break;
3878	case KVM_CAP_SYNC_REGS:
3879		r = KVM_SYNC_X86_VALID_FIELDS;
3880		break;
3881	case KVM_CAP_ADJUST_CLOCK:
3882		r = KVM_CLOCK_TSC_STABLE;
3883		break;
3884	case KVM_CAP_X86_DISABLE_EXITS:
3885		r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
3886		     KVM_X86_DISABLE_EXITS_CSTATE;
3887		if (kvm_can_mwait_in_guest())
3888			r |= KVM_X86_DISABLE_EXITS_MWAIT;
3889		break;
3890	case KVM_CAP_X86_SMM:
3891		/* SMBASE is usually relocated above 1M on modern chipsets,
3892		 * and SMM handlers might indeed rely on 4G segment limits,
3893		 * so do not report SMM to be available if real mode is
3894		 * emulated via vm86 mode.  Still, do not go to great lengths
3895		 * to avoid userspace's usage of the feature, because it is a
3896		 * fringe case that is not enabled except via specific settings
3897		 * of the module parameters.
3898		 */
3899		r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
3900		break;
3901	case KVM_CAP_VAPIC:
3902		r = !kvm_x86_ops.cpu_has_accelerated_tpr();
3903		break;
3904	case KVM_CAP_NR_VCPUS:
3905		r = KVM_SOFT_MAX_VCPUS;
3906		break;
3907	case KVM_CAP_MAX_VCPUS:
3908		r = KVM_MAX_VCPUS;
3909		break;
3910	case KVM_CAP_MAX_VCPU_ID:
3911		r = KVM_MAX_VCPU_ID;
3912		break;
3913	case KVM_CAP_PV_MMU:	/* obsolete */
3914		r = 0;
3915		break;
3916	case KVM_CAP_MCE:
3917		r = KVM_MAX_MCE_BANKS;
3918		break;
3919	case KVM_CAP_XCRS:
3920		r = boot_cpu_has(X86_FEATURE_XSAVE);
3921		break;
3922	case KVM_CAP_TSC_CONTROL:
3923		r = kvm_has_tsc_control;
3924		break;
3925	case KVM_CAP_X2APIC_API:
3926		r = KVM_X2APIC_API_VALID_FLAGS;
3927		break;
3928	case KVM_CAP_NESTED_STATE:
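		/*
		 * With no buffer, get_state() returns the size required to
		 * hold the nested state, which is what this capability
		 * reports (0 if nested state is unsupported).
		 */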
3929		r = kvm_x86_ops.nested_ops->get_state ?
3930			kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
3931		break;
3932	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
3933		r = kvm_x86_ops.enable_direct_tlbflush != NULL;
3934		break;
3935	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3936		r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
3937		break;
3938	case KVM_CAP_SMALLER_MAXPHYADDR:
3939		r = (int) allow_smaller_maxphyaddr;
3940		break;
3941	case KVM_CAP_STEAL_TIME:
3942		r = sched_info_on();
3943		break;
3944	default:
3945		break;
3946	}
3947	return r;
3948
3949}
3950
3951long kvm_arch_dev_ioctl(struct file *filp,
3952			unsigned int ioctl, unsigned long arg)
3953{
3954	void __user *argp = (void __user *)arg;
3955	long r;
3956
3957	switch (ioctl) {
3958	case KVM_GET_MSR_INDEX_LIST: {
3959		struct kvm_msr_list __user *user_msr_list = argp;
3960		struct kvm_msr_list msr_list;
3961		unsigned n;
3962
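		/*
		 * The exported list is msrs_to_save followed by emulated_msrs;
		 * fail with -E2BIG if userspace's buffer holds fewer entries.
		 */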
3963		r = -EFAULT;
3964		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3965			goto out;
3966		n = msr_list.nmsrs;
3967		msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3968		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3969			goto out;
3970		r = -E2BIG;
3971		if (n < msr_list.nmsrs)
3972			goto out;
3973		r = -EFAULT;
3974		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3975				 num_msrs_to_save * sizeof(u32)))
3976			goto out;
3977		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
3978				 &emulated_msrs,
3979				 num_emulated_msrs * sizeof(u32)))
3980			goto out;
3981		r = 0;
3982		break;
3983	}
3984	case KVM_GET_SUPPORTED_CPUID:
3985	case KVM_GET_EMULATED_CPUID: {
3986		struct kvm_cpuid2 __user *cpuid_arg = argp;
3987		struct kvm_cpuid2 cpuid;
3988
3989		r = -EFAULT;
3990		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
3991			goto out;
3992
3993		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
3994					    ioctl);
3995		if (r)
3996			goto out;
3997
3998		r = -EFAULT;
3999		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4000			goto out;
4001		r = 0;
4002		break;
4003	}
4004	case KVM_X86_GET_MCE_CAP_SUPPORTED:
4005		r = -EFAULT;
4006		if (copy_to_user(argp, &kvm_mce_cap_supported,
4007				 sizeof(kvm_mce_cap_supported)))
4008			goto out;
4009		r = 0;
4010		break;
4011	case KVM_GET_MSR_FEATURE_INDEX_LIST: {
4012		struct kvm_msr_list __user *user_msr_list = argp;
4013		struct kvm_msr_list msr_list;
4014		unsigned int n;
4015
4016		r = -EFAULT;
4017		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4018			goto out;
4019		n = msr_list.nmsrs;
4020		msr_list.nmsrs = num_msr_based_features;
4021		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4022			goto out;
4023		r = -E2BIG;
4024		if (n < msr_list.nmsrs)
4025			goto out;
4026		r = -EFAULT;
4027		if (copy_to_user(user_msr_list->indices, &msr_based_features,
4028				 num_msr_based_features * sizeof(u32)))
4029			goto out;
4030		r = 0;
4031		break;
4032	}
4033	case KVM_GET_MSRS:
4034		r = msr_io(NULL, argp, do_get_msr_feature, 1);
4035		break;
4036	default:
4037		r = -EINVAL;
4038		break;
4039	}
4040out:
4041	return r;
4042}
4043
4044static void wbinvd_ipi(void *garbage)
4045{
4046	wbinvd();
4047}
4048
4049static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
4050{
4051	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
4052}
4053
4054void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
4055{
4056	/* Handle the case where WBINVD may be executed by the guest */
4057	if (need_emulate_wbinvd(vcpu)) {
4058		if (kvm_x86_ops.has_wbinvd_exit())
4059			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4060		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
4061			smp_call_function_single(vcpu->cpu,
4062					wbinvd_ipi, NULL, 1);
4063	}
4064
4065	kvm_x86_ops.vcpu_load(vcpu, cpu);
4066
4067	/* Save host pkru register if supported */
4068	vcpu->arch.host_pkru = read_pkru();
4069
4070	/* Apply any externally detected TSC adjustments (due to suspend) */
4071	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
4072		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
4073		vcpu->arch.tsc_offset_adjustment = 0;
4074		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4075	}
4076
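	/*
	 * If the TSC went backwards across the CPU switch, the host TSC is
	 * not synchronized; recompute the offset from the last guest TSC and
	 * drop into catchup mode so guest time keeps moving forward.
	 */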
4077	if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
4078		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
4079				rdtsc() - vcpu->arch.last_host_tsc;
4080		if (tsc_delta < 0)
4081			mark_tsc_unstable("KVM discovered backwards TSC");
4082
4083		if (kvm_check_tsc_unstable()) {
4084			u64 offset = kvm_compute_tsc_offset(vcpu,
4085						vcpu->arch.last_guest_tsc);
4086			kvm_vcpu_write_tsc_offset(vcpu, offset);
4087			vcpu->arch.tsc_catchup = 1;
4088		}
4089
4090		if (kvm_lapic_hv_timer_in_use(vcpu))
4091			kvm_lapic_restart_hv_timer(vcpu);
4092
4093		/*
4094		 * On a host with synchronized TSC, there is no need to update
4095		 * kvmclock on vcpu->cpu migration
4096		 */
4097		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
4098			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
4099		if (vcpu->cpu != cpu)
4100			kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
4101		vcpu->cpu = cpu;
4102	}
4103
4104	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
4105}
4106
4107static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
4108{
4109	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
4110	struct kvm_steal_time __user *st;
4111	struct kvm_memslots *slots;
4112	static const u8 preempted = KVM_VCPU_PREEMPTED;
4113	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
4114
4115	/*
4116	 * The vCPU can be marked preempted if and only if the VM-Exit was on
4117	 * an instruction boundary and will not trigger guest emulation of any
4118	 * kind (see vcpu_run).  Vendor specific code controls (conservatively)
4119	 * when this is true, for example allowing the vCPU to be marked
4120	 * preempted if and only if the VM-Exit was due to a host interrupt.
4121	 */
4122	if (!vcpu->arch.at_instruction_boundary) {
4123		vcpu->stat.preemption_other++;
4124		return;
4125	}
4126
4127	vcpu->stat.preemption_reported++;
4128	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
4129		return;
4130
4131	if (vcpu->arch.st.preempted)
4132		return;
4133
4134	/* This happens on process exit */
4135	if (unlikely(current->mm != vcpu->kvm->mm))
4136		return;
4137
4138	slots = kvm_memslots(vcpu->kvm);
4139
4140	if (unlikely(slots->generation != ghc->generation ||
4141		     gpa != ghc->gpa ||
4142		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
4143		return;
4144
4145	st = (struct kvm_steal_time __user *)ghc->hva;
4146	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
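	/*
	 * This can be reached from the sched-out preempt notifier with
	 * preemption disabled, so the store must not fault or sleep; hence
	 * the _nofault user access below.
	 */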
4147
4148	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
4149		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4150
4151	mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
4152}
4153
4154void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
4155{
4156	int idx;
4157
4158	if (vcpu->preempted) {
4159		vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
4160
4161		/*
4162		 * Take the srcu lock as memslots will be accessed to check the gfn
4163		 * cache generation against the memslots generation.
4164		 */
4165		idx = srcu_read_lock(&vcpu->kvm->srcu);
4166		kvm_steal_time_set_preempted(vcpu);
4167		srcu_read_unlock(&vcpu->kvm->srcu, idx);
4168	}
4169
4170	kvm_x86_ops.vcpu_put(vcpu);
4171	vcpu->arch.last_host_tsc = rdtsc();
4172	/*
4173	 * If userspace has set any breakpoints or watchpoints, dr6 is restored
4174	 * on every vmexit, but if not, we might have a stale dr6 from the
4175	 * guest. do_debug expects dr6 to be cleared after it runs, so do the same.
4176	 */
4177	set_debugreg(0, 6);
4178}
4179
4180static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
4181				    struct kvm_lapic_state *s)
4182{
4183	if (vcpu->arch.apicv_active)
4184		kvm_x86_ops.sync_pir_to_irr(vcpu);
4185
4186	return kvm_apic_get_state(vcpu, s);
4187}
4188
4189static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
4190				    struct kvm_lapic_state *s)
4191{
4192	int r;
4193
4194	r = kvm_apic_set_state(vcpu, s);
4195	if (r)
4196		return r;
4197	update_cr8_intercept(vcpu);
4198
4199	return 0;
4200}
4201
4202static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
4203{
4204	/*
4205	 * We can accept userspace's request for interrupt injection
4206	 * as long as we have a place to store the interrupt number.
4207	 * The actual injection will happen when the CPU is able to
4208	 * deliver the interrupt.
4209	 */
4210	if (kvm_cpu_has_extint(vcpu))
4211		return false;
4212
4213	/* Acknowledging ExtINT does not happen if LINT0 is masked.  */
4214	return (!lapic_in_kernel(vcpu) ||
4215		kvm_apic_accept_pic_intr(vcpu));
4216}
4217
4218static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
4219{
4220	/*
4221	 * Do not cause an interrupt window exit if an exception
4222	 * is pending or an event needs reinjection; userspace
4223	 * might want to inject the interrupt manually using KVM_SET_REGS
4224	 * or KVM_SET_SREGS.  For that to work, we must be at an
4225	 * instruction boundary and with no events half-injected.
4226	 */
4227	return (kvm_arch_interrupt_allowed(vcpu) &&
4228		kvm_cpu_accept_dm_intr(vcpu) &&
4229		!kvm_event_needs_reinjection(vcpu) &&
4230		!vcpu->arch.exception.pending);
4231}
4232
4233static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
4234				    struct kvm_interrupt *irq)
4235{
4236	if (irq->irq >= KVM_NR_INTERRUPTS)
4237		return -EINVAL;
4238
4239	if (!irqchip_in_kernel(vcpu->kvm)) {
4240		kvm_queue_interrupt(vcpu, irq->irq, false);
4241		kvm_make_request(KVM_REQ_EVENT, vcpu);
4242		return 0;
4243	}
4244
4245	/*
4246	 * With in-kernel LAPIC, we only use this to inject EXTINT, so
4247	 * fail for in-kernel 8259.
4248	 */
4249	if (pic_in_kernel(vcpu->kvm))
4250		return -ENXIO;
4251
4252	if (vcpu->arch.pending_external_vector != -1)
4253		return -EEXIST;
4254
4255	vcpu->arch.pending_external_vector = irq->irq;
4256	kvm_make_request(KVM_REQ_EVENT, vcpu);
4257	return 0;
4258}
4259
4260static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
4261{
4262	kvm_inject_nmi(vcpu);
4263
4264	return 0;
4265}
4266
4267static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
4268{
4269	kvm_make_request(KVM_REQ_SMI, vcpu);
4270
4271	return 0;
4272}
4273
4274static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
4275					   struct kvm_tpr_access_ctl *tac)
4276{
4277	if (tac->flags)
4278		return -EINVAL;
4279	vcpu->arch.tpr_access_reporting = !!tac->enabled;
4280	return 0;
4281}
4282
4283static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
4284					u64 mcg_cap)
4285{
4286	int r;
4287	unsigned bank_num = mcg_cap & 0xff, bank;
4288
4289	r = -EINVAL;
4290	if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
4291		goto out;
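	/*
	 * Besides the capabilities KVM supports, allow the bank-count field
	 * (bits 7:0) and MCG_EXT_CNT (bits 23:16) of MCG_CAP through.
	 */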
4292	if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
4293		goto out;
4294	r = 0;
4295	vcpu->arch.mcg_cap = mcg_cap;
4296	/* Init IA32_MCG_CTL to all 1s */
4297	if (mcg_cap & MCG_CTL_P)
4298		vcpu->arch.mcg_ctl = ~(u64)0;
4299	/* Init IA32_MCi_CTL to all 1s */
4300	for (bank = 0; bank < bank_num; bank++)
4301		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
4302
4303	kvm_x86_ops.setup_mce(vcpu);
4304out:
4305	return r;
4306}
4307
4308static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
4309				      struct kvm_x86_mce *mce)
4310{
4311	u64 mcg_cap = vcpu->arch.mcg_cap;
4312	unsigned bank_num = mcg_cap & 0xff;
4313	u64 *banks = vcpu->arch.mce_banks;
4314
4315	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
4316		return -EINVAL;
4317	/*
4318	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
4319	 * reporting is disabled
4320	 */
4321	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
4322	    vcpu->arch.mcg_ctl != ~(u64)0)
4323		return 0;
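	/* Each bank is four consecutive slots: MCi_CTL, STATUS, ADDR, MISC. */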
4324	banks += 4 * mce->bank;
4325	/*
4326	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
4327	 * reporting is disabled for the bank
4328	 */
4329	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
4330		return 0;
4331	if (mce->status & MCI_STATUS_UC) {
4332		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
4333		    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
4334			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4335			return 0;
4336		}
4337		if (banks[1] & MCI_STATUS_VAL)
4338			mce->status |= MCI_STATUS_OVER;
4339		banks[2] = mce->addr;
4340		banks[3] = mce->misc;
4341		vcpu->arch.mcg_status = mce->mcg_status;
4342		banks[1] = mce->status;
4343		kvm_queue_exception(vcpu, MC_VECTOR);
4344	} else if (!(banks[1] & MCI_STATUS_VAL)
4345		   || !(banks[1] & MCI_STATUS_UC)) {
4346		if (banks[1] & MCI_STATUS_VAL)
4347			mce->status |= MCI_STATUS_OVER;
4348		banks[2] = mce->addr;
4349		banks[3] = mce->misc;
4350		banks[1] = mce->status;
4351	} else
4352		banks[1] |= MCI_STATUS_OVER;
4353	return 0;
4354}
4355
4356static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
4357					       struct kvm_vcpu_events *events)
4358{
4359	process_nmi(vcpu);
4360
4361	if (kvm_check_request(KVM_REQ_SMI, vcpu))
4362		process_smi(vcpu);
4363
4364	/*
4365	 * In guest mode, payload delivery should be deferred,
4366	 * so that the L1 hypervisor can intercept #PF before
4367	 * CR2 is modified (or intercept #DB before DR6 is
4368	 * modified under nVMX). Unless the per-VM capability,
4369	 * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
4370	 * an exception payload and handle it after a KVM_GET_VCPU_EVENTS. Since we
4371	 * opportunistically defer the exception payload, deliver it if the
4372	 * capability hasn't been requested before processing a
4373	 * KVM_GET_VCPU_EVENTS.
4374	 */
4375	if (!vcpu->kvm->arch.exception_payload_enabled &&
4376	    vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
4377		kvm_deliver_exception_payload(vcpu);
4378
4379	/*
4380	 * The API doesn't provide the instruction length for software
4381	 * exceptions, so don't report them. As long as the guest RIP
4382	 * isn't advanced, we should expect to encounter the exception
4383	 * again.
4384	 */
4385	if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
4386		events->exception.injected = 0;
4387		events->exception.pending = 0;
4388	} else {
4389		events->exception.injected = vcpu->arch.exception.injected;
4390		events->exception.pending = vcpu->arch.exception.pending;
4391		/*
4392		 * For ABI compatibility, deliberately conflate
4393		 * pending and injected exceptions when
4394		 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
4395		 */
4396		if (!vcpu->kvm->arch.exception_payload_enabled)
4397			events->exception.injected |=
4398				vcpu->arch.exception.pending;
4399	}
4400	events->exception.nr = vcpu->arch.exception.nr;
4401	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
4402	events->exception.error_code = vcpu->arch.exception.error_code;
4403	events->exception_has_payload = vcpu->arch.exception.has_payload;
4404	events->exception_payload = vcpu->arch.exception.payload;
4405
4406	events->interrupt.injected =
4407		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
4408	events->interrupt.nr = vcpu->arch.interrupt.nr;
4409	events->interrupt.soft = 0;
4410	events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
4411
4412	events->nmi.injected = vcpu->arch.nmi_injected;
4413	events->nmi.pending = vcpu->arch.nmi_pending != 0;
4414	events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
4415	events->nmi.pad = 0;
4416
4417	events->sipi_vector = 0; /* never valid when reporting to user space */
4418
4419	events->smi.smm = is_smm(vcpu);
4420	events->smi.pending = vcpu->arch.smi_pending;
4421	events->smi.smm_inside_nmi =
4422		!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
4423	events->smi.latched_init = kvm_lapic_latched_init(vcpu);
4424
4425	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
4426			 | KVM_VCPUEVENT_VALID_SHADOW
4427			 | KVM_VCPUEVENT_VALID_SMM);
4428	if (vcpu->kvm->arch.exception_payload_enabled)
4429		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4430
4431	memset(&events->reserved, 0, sizeof(events->reserved));
4432}
4433
4434static void kvm_smm_changed(struct kvm_vcpu *vcpu);
4435
4436static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
4437					      struct kvm_vcpu_events *events)
4438{
4439	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
4440			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
4441			      | KVM_VCPUEVENT_VALID_SHADOW
4442			      | KVM_VCPUEVENT_VALID_SMM
4443			      | KVM_VCPUEVENT_VALID_PAYLOAD))
4444		return -EINVAL;
4445
4446	if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4447		if (!vcpu->kvm->arch.exception_payload_enabled)
4448			return -EINVAL;
4449		if (events->exception.pending)
4450			events->exception.injected = 0;
4451		else
4452			events->exception_has_payload = 0;
4453	} else {
4454		events->exception.pending = 0;
4455		events->exception_has_payload = 0;
4456	}
4457
4458	if ((events->exception.injected || events->exception.pending) &&
4459	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
4460		return -EINVAL;
4461
4462	/* INITs are latched while in SMM */
4463	if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
4464	    (events->smi.smm || events->smi.pending) &&
4465	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4466		return -EINVAL;
4467
4468	process_nmi(vcpu);
4469	vcpu->arch.exception.injected = events->exception.injected;
4470	vcpu->arch.exception.pending = events->exception.pending;
4471	vcpu->arch.exception.nr = events->exception.nr;
4472	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
4473	vcpu->arch.exception.error_code = events->exception.error_code;
4474	vcpu->arch.exception.has_payload = events->exception_has_payload;
4475	vcpu->arch.exception.payload = events->exception_payload;
4476
4477	vcpu->arch.interrupt.injected = events->interrupt.injected;
4478	vcpu->arch.interrupt.nr = events->interrupt.nr;
4479	vcpu->arch.interrupt.soft = events->interrupt.soft;
4480	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
4481		kvm_x86_ops.set_interrupt_shadow(vcpu,
4482						  events->interrupt.shadow);
4483
4484	vcpu->arch.nmi_injected = events->nmi.injected;
4485	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
4486		vcpu->arch.nmi_pending = events->nmi.pending;
4487	kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
4488
4489	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
4490	    lapic_in_kernel(vcpu))
4491		vcpu->arch.apic->sipi_vector = events->sipi_vector;
4492
4493	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
4494		if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
4495			if (events->smi.smm)
4496				vcpu->arch.hflags |= HF_SMM_MASK;
4497			else
4498				vcpu->arch.hflags &= ~HF_SMM_MASK;
4499
4500			kvm_x86_ops.nested_ops->leave_nested(vcpu);
4501			kvm_smm_changed(vcpu);
4502		}
4503
4504		vcpu->arch.smi_pending = events->smi.pending;
4505
4506		if (events->smi.smm) {
4507			if (events->smi.smm_inside_nmi)
4508				vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
4509			else
4510				vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
4511		}
4512
4513		if (lapic_in_kernel(vcpu)) {
4514			if (events->smi.latched_init)
4515				set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4516			else
4517				clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4518		}
4519	}
4520
4521	kvm_make_request(KVM_REQ_EVENT, vcpu);
4522
4523	return 0;
4524}
4525
4526static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
4527					     struct kvm_debugregs *dbgregs)
4528{
4529	unsigned long val;
4530
4531	memset(dbgregs, 0, sizeof(*dbgregs));
4532	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
4533	kvm_get_dr(vcpu, 6, &val);
4534	dbgregs->dr6 = val;
4535	dbgregs->dr7 = vcpu->arch.dr7;
4536}
4537
4538static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
4539					    struct kvm_debugregs *dbgregs)
4540{
4541	if (dbgregs->flags)
4542		return -EINVAL;
4543
4544	if (dbgregs->dr6 & ~0xffffffffull)
4545		return -EINVAL;
4546	if (dbgregs->dr7 & ~0xffffffffull)
4547		return -EINVAL;
4548
4549	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
4550	kvm_update_dr0123(vcpu);
4551	vcpu->arch.dr6 = dbgregs->dr6;
4552	vcpu->arch.dr7 = dbgregs->dr7;
4553	kvm_update_dr7(vcpu);
4554
4555	return 0;
4556}
4557
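/* Bit 63 of XCOMP_BV indicates that the compacted XSAVE format is in use. */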
4558#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
4559
4560static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
4561{
4562	struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
4563	u64 xstate_bv = xsave->header.xfeatures;
4564	u64 valid;
4565
4566	/*
4567	 * Copy legacy XSAVE area, to avoid complications with CPUID
4568	 * leaves 0 and 1 in the loop below.
4569	 */
4570	memcpy(dest, xsave, XSAVE_HDR_OFFSET);
4571
4572	/* Set XSTATE_BV */
4573	xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
4574	*(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
4575
4576	/*
4577	 * Copy each region from the possibly compacted offset to the
4578	 * non-compacted offset.
4579	 */
4580	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
4581	while (valid) {
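		/*
		 * valid & -valid isolates the lowest set feature bit;
		 * fls64() - 1 turns that mask into its xfeature number.
		 */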
4582		u64 xfeature_mask = valid & -valid;
4583		int xfeature_nr = fls64(xfeature_mask) - 1;
4584		void *src = get_xsave_addr(xsave, xfeature_nr);
4585
4586		if (src) {
4587			u32 size, offset, ecx, edx;
4588			cpuid_count(XSTATE_CPUID, xfeature_nr,
4589				    &size, &offset, &ecx, &edx);
4590			if (xfeature_nr == XFEATURE_PKRU)
4591				memcpy(dest + offset, &vcpu->arch.pkru,
4592				       sizeof(vcpu->arch.pkru));
4593			else
4594				memcpy(dest + offset, src, size);
4595
4596		}
4597
4598		valid -= xfeature_mask;
4599	}
4600}
4601
4602static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
4603{
4604	struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
4605	u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
4606	u64 valid;
4607
4608	/*
4609	 * Copy legacy XSAVE area, to avoid complications with CPUID
4610	 * leaves 0 and 1 in the loop below.
4611	 */
4612	memcpy(xsave, src, XSAVE_HDR_OFFSET);
4613
4614	/* Set XSTATE_BV and possibly XCOMP_BV.  */
4615	xsave->header.xfeatures = xstate_bv;
4616	if (boot_cpu_has(X86_FEATURE_XSAVES))
4617		xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
4618
4619	/*
4620	 * Copy each region from the non-compacted offset to the
4621	 * possibly compacted offset.
4622	 */
4623	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
4624	while (valid) {
4625		u64 xfeature_mask = valid & -valid;
4626		int xfeature_nr = fls64(xfeature_mask) - 1;
4627		void *dest = get_xsave_addr(xsave, xfeature_nr);
4628
4629		if (dest) {
4630			u32 size, offset, ecx, edx;
4631			cpuid_count(XSTATE_CPUID, xfeature_nr,
4632				    &size, &offset, &ecx, &edx);
4633			if (xfeature_nr == XFEATURE_PKRU)
4634				memcpy(&vcpu->arch.pkru, src + offset,
4635				       sizeof(vcpu->arch.pkru));
4636			else
4637				memcpy(dest, src + offset, size);
4638		}
4639
4640		valid -= xfeature_mask;
4641	}
4642}
4643
4644static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
4645					 struct kvm_xsave *guest_xsave)
4646{
4647	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
4648		memset(guest_xsave, 0, sizeof(struct kvm_xsave));
4649		fill_xsave((u8 *) guest_xsave->region, vcpu);
4650	} else {
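		/*
		 * Without host XSAVE support, fall back to the FXSAVE layout
		 * and report only x87/SSE state as present.
		 */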
4651		memcpy(guest_xsave->region,
4652			&vcpu->arch.guest_fpu->state.fxsave,
4653			sizeof(struct fxregs_state));
4654		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
4655			XFEATURE_MASK_FPSSE;
4656	}
4657}
4658
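/* MXCSR sits at byte offset 24 of the legacy FXSAVE/XSAVE area. */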
4659#define XSAVE_MXCSR_OFFSET 24
4660
4661static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
4662					struct kvm_xsave *guest_xsave)
4663{
4664	u64 xstate_bv =
4665		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
4666	u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
4667
4668	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
4669		/*
4670		 * Here we allow setting states that are not present in
4671		 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
4672		 * with old userspace.
4673		 */
4674		if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
4675			return -EINVAL;
4676		load_xsave(vcpu, (u8 *)guest_xsave->region);
4677	} else {
4678		if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
4679			mxcsr & ~mxcsr_feature_mask)
4680			return -EINVAL;
4681		memcpy(&vcpu->arch.guest_fpu->state.fxsave,
4682			guest_xsave->region, sizeof(struct fxregs_state));
4683	}
4684	return 0;
4685}
4686
4687static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
4688					struct kvm_xcrs *guest_xcrs)
4689{
4690	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
4691		guest_xcrs->nr_xcrs = 0;
4692		return;
4693	}
4694
4695	guest_xcrs->nr_xcrs = 1;
4696	guest_xcrs->flags = 0;
4697	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
4698	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
4699}
4700
4701static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
4702				       struct kvm_xcrs *guest_xcrs)
4703{
4704	int i, r = 0;
4705
4706	if (!boot_cpu_has(X86_FEATURE_XSAVE))
4707		return -EINVAL;
4708
4709	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
4710		return -EINVAL;
4711
4712	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
4713		/* Only support XCR0 currently */
4714		if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
4715			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
4716				guest_xcrs->xcrs[i].value);
4717			break;
4718		}
4719	if (r)
4720		r = -EINVAL;
4721	return r;
4722}
4723
4724/*
4725 * kvm_set_guest_paused() indicates to the guest kernel that it has been
4726 * stopped by the hypervisor.  This function will be called from the host only.
4727 * EINVAL is returned when the host attempts to set the flag for a guest that
4728 * does not support pv clocks.
4729 */
4730static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
4731{
4732	if (!vcpu->arch.pv_time_enabled)
4733		return -EINVAL;
4734	vcpu->arch.pvclock_set_guest_stopped_request = true;
4735	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4736	return 0;
4737}
4738
4739static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
4740				     struct kvm_enable_cap *cap)
4741{
4742	int r;
4743	uint16_t vmcs_version;
4744	void __user *user_ptr;
4745
4746	if (cap->flags)
4747		return -EINVAL;
4748
4749	switch (cap->cap) {
4750	case KVM_CAP_HYPERV_SYNIC2:
4751		if (cap->args[0])
4752			return -EINVAL;
4753		fallthrough;
4754
4755	case KVM_CAP_HYPERV_SYNIC:
4756		if (!irqchip_in_kernel(vcpu->kvm))
4757			return -EINVAL;
4758		return kvm_hv_activate_synic(vcpu, cap->cap ==
4759					     KVM_CAP_HYPERV_SYNIC2);
4760	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
4761		if (!kvm_x86_ops.nested_ops->enable_evmcs)
4762			return -ENOTTY;
4763		r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
4764		if (!r) {
4765			user_ptr = (void __user *)(uintptr_t)cap->args[0];
4766			if (copy_to_user(user_ptr, &vmcs_version,
4767					 sizeof(vmcs_version)))
4768				r = -EFAULT;
4769		}
4770		return r;
4771	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
4772		if (!kvm_x86_ops.enable_direct_tlbflush)
4773			return -ENOTTY;
4774
4775		return kvm_x86_ops.enable_direct_tlbflush(vcpu);
4776
4777	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
4778		vcpu->arch.pv_cpuid.enforce = cap->args[0];
4779		if (vcpu->arch.pv_cpuid.enforce)
4780			kvm_update_pv_runtime(vcpu);
4781
4782		return 0;
4783
4784	default:
4785		return -EINVAL;
4786	}
4787}
4788
4789long kvm_arch_vcpu_ioctl(struct file *filp,
4790			 unsigned int ioctl, unsigned long arg)
4791{
4792	struct kvm_vcpu *vcpu = filp->private_data;
4793	void __user *argp = (void __user *)arg;
4794	int r;
4795	union {
4796		struct kvm_lapic_state *lapic;
4797		struct kvm_xsave *xsave;
4798		struct kvm_xcrs *xcrs;
4799		void *buffer;
4800	} u;
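	/*
	 * Whichever case allocates stores its pointer here so that the single
	 * kfree(u.buffer) at the "out" label releases it.
	 */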
4801
4802	vcpu_load(vcpu);
4803
4804	u.buffer = NULL;
4805	switch (ioctl) {
4806	case KVM_GET_LAPIC: {
4807		r = -EINVAL;
4808		if (!lapic_in_kernel(vcpu))
4809			goto out;
4810		u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
4811				GFP_KERNEL_ACCOUNT);
4812
4813		r = -ENOMEM;
4814		if (!u.lapic)
4815			goto out;
4816		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
4817		if (r)
4818			goto out;
4819		r = -EFAULT;
4820		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
4821			goto out;
4822		r = 0;
4823		break;
4824	}
4825	case KVM_SET_LAPIC: {
4826		r = -EINVAL;
4827		if (!lapic_in_kernel(vcpu))
4828			goto out;
4829		u.lapic = memdup_user(argp, sizeof(*u.lapic));
4830		if (IS_ERR(u.lapic)) {
4831			r = PTR_ERR(u.lapic);
4832			goto out_nofree;
4833		}
4834
4835		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
4836		break;
4837	}
4838	case KVM_INTERRUPT: {
4839		struct kvm_interrupt irq;
4840
4841		r = -EFAULT;
4842		if (copy_from_user(&irq, argp, sizeof(irq)))
4843			goto out;
4844		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
4845		break;
4846	}
4847	case KVM_NMI: {
4848		r = kvm_vcpu_ioctl_nmi(vcpu);
4849		break;
4850	}
4851	case KVM_SMI: {
4852		r = kvm_vcpu_ioctl_smi(vcpu);
4853		break;
4854	}
4855	case KVM_SET_CPUID: {
4856		struct kvm_cpuid __user *cpuid_arg = argp;
4857		struct kvm_cpuid cpuid;
4858
4859		r = -EFAULT;
4860		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4861			goto out;
4862		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4863		break;
4864	}
4865	case KVM_SET_CPUID2: {
4866		struct kvm_cpuid2 __user *cpuid_arg = argp;
4867		struct kvm_cpuid2 cpuid;
4868
4869		r = -EFAULT;
4870		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4871			goto out;
4872		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
4873					      cpuid_arg->entries);
4874		break;
4875	}
4876	case KVM_GET_CPUID2: {
4877		struct kvm_cpuid2 __user *cpuid_arg = argp;
4878		struct kvm_cpuid2 cpuid;
4879
4880		r = -EFAULT;
4881		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4882			goto out;
4883		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
4884					      cpuid_arg->entries);
4885		if (r)
4886			goto out;
4887		r = -EFAULT;
4888		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4889			goto out;
4890		r = 0;
4891		break;
4892	}
4893	case KVM_GET_MSRS: {
4894		int idx = srcu_read_lock(&vcpu->kvm->srcu);
4895		r = msr_io(vcpu, argp, do_get_msr, 1);
4896		srcu_read_unlock(&vcpu->kvm->srcu, idx);
4897		break;
4898	}
4899	case KVM_SET_MSRS: {
4900		int idx = srcu_read_lock(&vcpu->kvm->srcu);
4901		r = msr_io(vcpu, argp, do_set_msr, 0);
4902		srcu_read_unlock(&vcpu->kvm->srcu, idx);
4903		break;
4904	}
4905	case KVM_TPR_ACCESS_REPORTING: {
4906		struct kvm_tpr_access_ctl tac;
4907
4908		r = -EFAULT;
4909		if (copy_from_user(&tac, argp, sizeof(tac)))
4910			goto out;
4911		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
4912		if (r)
4913			goto out;
4914		r = -EFAULT;
4915		if (copy_to_user(argp, &tac, sizeof(tac)))
4916			goto out;
4917		r = 0;
4918		break;
4919	}
4920	case KVM_SET_VAPIC_ADDR: {
4921		struct kvm_vapic_addr va;
4922		int idx;
4923
4924		r = -EINVAL;
4925		if (!lapic_in_kernel(vcpu))
4926			goto out;
4927		r = -EFAULT;
4928		if (copy_from_user(&va, argp, sizeof(va)))
4929			goto out;
4930		idx = srcu_read_lock(&vcpu->kvm->srcu);
4931		r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
4932		srcu_read_unlock(&vcpu->kvm->srcu, idx);
4933		break;
4934	}
4935	case KVM_X86_SETUP_MCE: {
4936		u64 mcg_cap;
4937
4938		r = -EFAULT;
4939		if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
4940			goto out;
4941		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
4942		break;
4943	}
4944	case KVM_X86_SET_MCE: {
4945		struct kvm_x86_mce mce;
4946
4947		r = -EFAULT;
4948		if (copy_from_user(&mce, argp, sizeof(mce)))
4949			goto out;
4950		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
4951		break;
4952	}
4953	case KVM_GET_VCPU_EVENTS: {
4954		struct kvm_vcpu_events events;
4955
4956		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
4957
4958		r = -EFAULT;
4959		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
4960			break;
4961		r = 0;
4962		break;
4963	}
4964	case KVM_SET_VCPU_EVENTS: {
4965		struct kvm_vcpu_events events;
4966
4967		r = -EFAULT;
4968		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
4969			break;
4970
4971		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
4972		break;
4973	}
4974	case KVM_GET_DEBUGREGS: {
4975		struct kvm_debugregs dbgregs;
4976
4977		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
4978
4979		r = -EFAULT;
4980		if (copy_to_user(argp, &dbgregs,
4981				 sizeof(struct kvm_debugregs)))
4982			break;
4983		r = 0;
4984		break;
4985	}
4986	case KVM_SET_DEBUGREGS: {
4987		struct kvm_debugregs dbgregs;
4988
4989		r = -EFAULT;
4990		if (copy_from_user(&dbgregs, argp,
4991				   sizeof(struct kvm_debugregs)))
4992			break;
4993
4994		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
4995		break;
4996	}
4997	case KVM_GET_XSAVE: {
4998		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
4999		r = -ENOMEM;
5000		if (!u.xsave)
5001			break;
5002
5003		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
5004
5005		r = -EFAULT;
5006		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
5007			break;
5008		r = 0;
5009		break;
5010	}
5011	case KVM_SET_XSAVE: {
5012		u.xsave = memdup_user(argp, sizeof(*u.xsave));
5013		if (IS_ERR(u.xsave)) {
5014			r = PTR_ERR(u.xsave);
5015			goto out_nofree;
5016		}
5017
5018		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
5019		break;
5020	}
5021	case KVM_GET_XCRS: {
5022		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
5023		r = -ENOMEM;
5024		if (!u.xcrs)
5025			break;
5026
5027		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
5028
5029		r = -EFAULT;
5030		if (copy_to_user(argp, u.xcrs,
5031				 sizeof(struct kvm_xcrs)))
5032			break;
5033		r = 0;
5034		break;
5035	}
5036	case KVM_SET_XCRS: {
5037		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
5038		if (IS_ERR(u.xcrs)) {
5039			r = PTR_ERR(u.xcrs);
5040			goto out_nofree;
5041		}
5042
5043		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
5044		break;
5045	}
5046	case KVM_SET_TSC_KHZ: {
5047		u32 user_tsc_khz;
5048
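		/*
		 * The ioctl argument is the frequency value itself, not a user
		 * pointer; a value of 0 selects the host TSC frequency.
		 */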
5049		r = -EINVAL;
5050		user_tsc_khz = (u32)arg;
5051
5052		if (kvm_has_tsc_control &&
5053		    user_tsc_khz >= kvm_max_guest_tsc_khz)
5054			goto out;
5055
5056		if (user_tsc_khz == 0)
5057			user_tsc_khz = tsc_khz;
5058
5059		if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
5060			r = 0;
5061
5062		goto out;
5063	}
5064	case KVM_GET_TSC_KHZ: {
5065		r = vcpu->arch.virtual_tsc_khz;
5066		goto out;
5067	}
5068	case KVM_KVMCLOCK_CTRL: {
5069		r = kvm_set_guest_paused(vcpu);
5070		goto out;
5071	}
5072	case KVM_ENABLE_CAP: {
5073		struct kvm_enable_cap cap;
5074
5075		r = -EFAULT;
5076		if (copy_from_user(&cap, argp, sizeof(cap)))
5077			goto out;
5078		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
5079		break;
5080	}
5081	case KVM_GET_NESTED_STATE: {
5082		struct kvm_nested_state __user *user_kvm_nested_state = argp;
5083		u32 user_data_size;
5084
5085		r = -EINVAL;
5086		if (!kvm_x86_ops.nested_ops->get_state)
5087			break;
5088
5089		BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
5090		r = -EFAULT;
5091		if (get_user(user_data_size, &user_kvm_nested_state->size))
5092			break;
5093
5094		r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
5095						     user_data_size);
5096		if (r < 0)
5097			break;
5098
5099		if (r > user_data_size) {
5100			if (put_user(r, &user_kvm_nested_state->size))
5101				r = -EFAULT;
5102			else
5103				r = -E2BIG;
5104			break;
5105		}
5106
5107		r = 0;
5108		break;
5109	}
5110	case KVM_SET_NESTED_STATE: {
5111		struct kvm_nested_state __user *user_kvm_nested_state = argp;
5112		struct kvm_nested_state kvm_state;
5113		int idx;
5114
5115		r = -EINVAL;
5116		if (!kvm_x86_ops.nested_ops->set_state)
5117			break;
5118
5119		r = -EFAULT;
5120		if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
5121			break;
5122
5123		r = -EINVAL;
5124		if (kvm_state.size < sizeof(kvm_state))
5125			break;
5126
5127		if (kvm_state.flags &
5128		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
5129		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
5130		      | KVM_STATE_NESTED_GIF_SET))
5131			break;
5132
5133		/* nested_run_pending implies guest_mode.  */
5134		if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
5135		    && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
5136			break;
5137
5138		idx = srcu_read_lock(&vcpu->kvm->srcu);
5139		r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
5140		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5141		break;
5142	}
5143	case KVM_GET_SUPPORTED_HV_CPUID: {
5144		struct kvm_cpuid2 __user *cpuid_arg = argp;
5145		struct kvm_cpuid2 cpuid;
5146
5147		r = -EFAULT;
5148		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5149			goto out;
5150
5151		r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
5152						cpuid_arg->entries);
5153		if (r)
5154			goto out;
5155
5156		r = -EFAULT;
5157		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
5158			goto out;
5159		r = 0;
5160		break;
5161	}
5162	default:
5163		r = -EINVAL;
5164	}
5165out:
5166	kfree(u.buffer);
5167out_nofree:
5168	vcpu_put(vcpu);
5169	return r;
5170}
5171
5172vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
5173{
5174	return VM_FAULT_SIGBUS;
5175}
5176
5177static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
5178{
5179	int ret;
5180
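	/* The three-page real-mode TSS must fit entirely below 4GiB. */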
5181	if (addr > (unsigned int)(-3 * PAGE_SIZE))
5182		return -EINVAL;
5183	ret = kvm_x86_ops.set_tss_addr(kvm, addr);
5184	return ret;
5185}
5186
5187static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
5188					      u64 ident_addr)
5189{
5190	return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
5191}
5192
5193static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
5194					 unsigned long kvm_nr_mmu_pages)
5195{
5196	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
5197		return -EINVAL;
5198
5199	mutex_lock(&kvm->slots_lock);
5200
5201	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
5202	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
5203
5204	mutex_unlock(&kvm->slots_lock);
5205	return 0;
5206}
5207
5208static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
5209{
5210	return kvm->arch.n_max_mmu_pages;
5211}
5212
5213static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
5214{
5215	struct kvm_pic *pic = kvm->arch.vpic;
5216	int r;
5217
5218	r = 0;
5219	switch (chip->chip_id) {
5220	case KVM_IRQCHIP_PIC_MASTER:
5221		memcpy(&chip->chip.pic, &pic->pics[0],
5222			sizeof(struct kvm_pic_state));
5223		break;
5224	case KVM_IRQCHIP_PIC_SLAVE:
5225		memcpy(&chip->chip.pic, &pic->pics[1],
5226			sizeof(struct kvm_pic_state));
5227		break;
5228	case KVM_IRQCHIP_IOAPIC:
5229		kvm_get_ioapic(kvm, &chip->chip.ioapic);
5230		break;
5231	default:
5232		r = -EINVAL;
5233		break;
5234	}
5235	return r;
5236}
5237
5238static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
5239{
5240	struct kvm_pic *pic = kvm->arch.vpic;
5241	int r;
5242
5243	r = 0;
5244	switch (chip->chip_id) {
5245	case KVM_IRQCHIP_PIC_MASTER:
5246		spin_lock(&pic->lock);
5247		memcpy(&pic->pics[0], &chip->chip.pic,
5248			sizeof(struct kvm_pic_state));
5249		spin_unlock(&pic->lock);
5250		break;
5251	case KVM_IRQCHIP_PIC_SLAVE:
5252		spin_lock(&pic->lock);
5253		memcpy(&pic->pics[1], &chip->chip.pic,
5254			sizeof(struct kvm_pic_state));
5255		spin_unlock(&pic->lock);
5256		break;
5257	case KVM_IRQCHIP_IOAPIC:
5258		kvm_set_ioapic(kvm, &chip->chip.ioapic);
5259		break;
5260	default:
5261		r = -EINVAL;
5262		break;
5263	}
5264	kvm_pic_update_irq(pic);
5265	return r;
5266}
5267
5268static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
5269{
5270	struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
5271
5272	BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
5273
5274	mutex_lock(&kps->lock);
5275	memcpy(ps, &kps->channels, sizeof(*ps));
5276	mutex_unlock(&kps->lock);
5277	return 0;
5278}
5279
5280static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
5281{
5282	int i;
5283	struct kvm_pit *pit = kvm->arch.vpit;
5284
5285	mutex_lock(&pit->pit_state.lock);
5286	memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
5287	for (i = 0; i < 3; i++)
5288		kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
5289	mutex_unlock(&pit->pit_state.lock);
5290	return 0;
5291}
5292
5293static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
5294{
5295	mutex_lock(&kvm->arch.vpit->pit_state.lock);
5296	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
5297		sizeof(ps->channels));
5298	ps->flags = kvm->arch.vpit->pit_state.flags;
5299	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
5300	memset(&ps->reserved, 0, sizeof(ps->reserved));
5301	return 0;
5302}
5303
5304static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
5305{
5306	int start = 0;
5307	int i;
5308	u32 prev_legacy, cur_legacy;
5309	struct kvm_pit *pit = kvm->arch.vpit;
5310
5311	mutex_lock(&pit->pit_state.lock);
5312	prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
5313	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
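	/* Start channel 0 in HPET legacy mode only when it is newly enabled. */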
5314	if (!prev_legacy && cur_legacy)
5315		start = 1;
5316	memcpy(&pit->pit_state.channels, &ps->channels,
5317	       sizeof(pit->pit_state.channels));
5318	pit->pit_state.flags = ps->flags;
5319	for (i = 0; i < 3; i++)
5320		kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
5321				   start && i == 0);
5322	mutex_unlock(&pit->pit_state.lock);
5323	return 0;
5324}
5325
5326static int kvm_vm_ioctl_reinject(struct kvm *kvm,
5327				 struct kvm_reinject_control *control)
5328{
5329	struct kvm_pit *pit = kvm->arch.vpit;
5330
5331	/* pit->pit_state.lock was overloaded to prevent userspace from getting
5332	 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
5333	 * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
5334	 */
5335	mutex_lock(&pit->pit_state.lock);
5336	kvm_pit_set_reinject(pit, control->pit_reinject);
5337	mutex_unlock(&pit->pit_state.lock);
5338
5339	return 0;
5340}
5341
5342void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
5343{
5344	/*
5345	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
5346	 */
5347	if (kvm_x86_ops.flush_log_dirty)
5348		kvm_x86_ops.flush_log_dirty(kvm);
5349}
5350
5351int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
5352			bool line_status)
5353{
5354	if (!irqchip_in_kernel(kvm))
5355		return -ENXIO;
5356
5357	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
5358					irq_event->irq, irq_event->level,
5359					line_status);
5360	return 0;
5361}
5362
5363int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5364			    struct kvm_enable_cap *cap)
5365{
5366	int r;
5367
5368	if (cap->flags)
5369		return -EINVAL;
5370
5371	switch (cap->cap) {
5372	case KVM_CAP_DISABLE_QUIRKS:
5373		kvm->arch.disabled_quirks = cap->args[0];
5374		r = 0;
5375		break;
5376	case KVM_CAP_SPLIT_IRQCHIP: {
5377		mutex_lock(&kvm->lock);
5378		r = -EINVAL;
5379		if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
5380			goto split_irqchip_unlock;
5381		r = -EEXIST;
5382		if (irqchip_in_kernel(kvm))
5383			goto split_irqchip_unlock;
5384		if (kvm->created_vcpus)
5385			goto split_irqchip_unlock;
5386		r = kvm_setup_empty_irq_routing(kvm);
5387		if (r)
5388			goto split_irqchip_unlock;
5389		/* Pairs with irqchip_in_kernel. */
5390		smp_wmb();
5391		kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
5392		kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
5393		r = 0;
5394split_irqchip_unlock:
5395		mutex_unlock(&kvm->lock);
5396		break;
5397	}
5398	case KVM_CAP_X2APIC_API:
5399		r = -EINVAL;
5400		if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
5401			break;
5402
5403		if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
5404			kvm->arch.x2apic_format = true;
5405		if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
5406			kvm->arch.x2apic_broadcast_quirk_disabled = true;
5407
5408		r = 0;
5409		break;
5410	case KVM_CAP_X86_DISABLE_EXITS:
5411		r = -EINVAL;
5412		if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
5413			break;
5414
5415		if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
5416			kvm_can_mwait_in_guest())
5417			kvm->arch.mwait_in_guest = true;
5418		if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
5419			kvm->arch.hlt_in_guest = true;
5420		if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
5421			kvm->arch.pause_in_guest = true;
5422		if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
5423			kvm->arch.cstate_in_guest = true;
5424		r = 0;
5425		break;
5426	case KVM_CAP_MSR_PLATFORM_INFO:
5427		kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
5428		r = 0;
5429		break;
5430	case KVM_CAP_EXCEPTION_PAYLOAD:
5431		kvm->arch.exception_payload_enabled = cap->args[0];
5432		r = 0;
5433		break;
5434	case KVM_CAP_X86_USER_SPACE_MSR:
5435		r = -EINVAL;
5436		if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
5437				     KVM_MSR_EXIT_REASON_UNKNOWN |
5438				     KVM_MSR_EXIT_REASON_FILTER))
5439			break;
5440		kvm->arch.user_space_msr_mask = cap->args[0];
5441		r = 0;
5442		break;
5443	default:
5444		r = -EINVAL;
5445		break;
5446	}
5447	return r;
5448}
5449
5450static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
5451{
5452	struct kvm_x86_msr_filter *msr_filter;
5453
5454	msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
5455	if (!msr_filter)
5456		return NULL;
5457
5458	msr_filter->default_allow = default_allow;
5459	return msr_filter;
5460}
5461
5462static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
5463{
5464	u32 i;
5465
5466	if (!msr_filter)
5467		return;
5468
5469	for (i = 0; i < msr_filter->count; i++)
5470		kfree(msr_filter->ranges[i].bitmap);
5471
5472	kfree(msr_filter);
5473}
5474
5475static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
5476			      struct kvm_msr_filter_range *user_range)
5477{
5478	struct msr_bitmap_range range;
5479	unsigned long *bitmap = NULL;
5480	size_t bitmap_size;
5481	int r;
5482
5483	if (!user_range->nmsrs)
5484		return 0;
5485
5486	bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
5487	if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
5488		return -EINVAL;
5489
5490	bitmap = memdup_user((u8 __user *)user_range->bitmap, bitmap_size);
5491	if (IS_ERR(bitmap))
5492		return PTR_ERR(bitmap);
5493
5494	range = (struct msr_bitmap_range) {
5495		.flags = user_range->flags,
5496		.base = user_range->base,
5497		.nmsrs = user_range->nmsrs,
5498		.bitmap = bitmap,
5499	};
5500
5501	if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
5502		r = -EINVAL;
5503		goto err;
5504	}
5505
5506	if (!range.flags) {
5507		r = -EINVAL;
5508		goto err;
5509	}
5510
5511	/* Everything ok, add this range identifier. */
5512	msr_filter->ranges[msr_filter->count] = range;
5513	msr_filter->count++;
5514
5515	return 0;
5516err:
5517	kfree(bitmap);
5518	return r;
5519}
5520
5521static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
5522				       struct kvm_msr_filter *filter)
5523{
5524	struct kvm_x86_msr_filter *new_filter, *old_filter;
5525	bool default_allow;
5526	bool empty = true;
5527	int r = 0;
5528	u32 i;
5529
5530	if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
5531		return -EINVAL;
5532
5533	for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
5534		empty &= !filter->ranges[i].nmsrs;
5535
5536	default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
5537	if (empty && !default_allow)
5538		return -EINVAL;
5539
5540	new_filter = kvm_alloc_msr_filter(default_allow);
5541	if (!new_filter)
5542		return -ENOMEM;
5543
5544	for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
5545		r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
5546		if (r) {
5547			kvm_free_msr_filter(new_filter);
5548			return r;
5549		}
5550	}
5551
5552	mutex_lock(&kvm->lock);
5553
5554	/* The per-VM filter is protected by kvm->lock... */
5555	old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
5556
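	/*
	 * Publish the new filter, then wait for every SRCU reader that may
	 * still hold the old one before it is freed.
	 */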
5557	rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
5558	synchronize_srcu(&kvm->srcu);
5559
5560	kvm_free_msr_filter(old_filter);
5561
5562	kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
5563	mutex_unlock(&kvm->lock);
5564
5565	return 0;
5566}
5567
5568#ifdef CONFIG_KVM_COMPAT
5569/* for KVM_X86_SET_MSR_FILTER */
5570struct kvm_msr_filter_range_compat {
5571	__u32 flags;
5572	__u32 nmsrs;
5573	__u32 base;
5574	__u32 bitmap;
5575};
5576
5577struct kvm_msr_filter_compat {
5578	__u32 flags;
5579	struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
5580};
5581
5582#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
5583
5584long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5585			      unsigned long arg)
5586{
5587	void __user *argp = (void __user *)arg;
5588	struct kvm *kvm = filp->private_data;
5589	long r = -ENOTTY;
5590
5591	switch (ioctl) {
5592	case KVM_X86_SET_MSR_FILTER_COMPAT: {
5593		struct kvm_msr_filter __user *user_msr_filter = argp;
5594		struct kvm_msr_filter_compat filter_compat;
5595		struct kvm_msr_filter filter;
5596		int i;
5597
5598		if (copy_from_user(&filter_compat, user_msr_filter,
5599				   sizeof(filter_compat)))
5600			return -EFAULT;
5601
5602		filter.flags = filter_compat.flags;
5603		for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
5604			struct kvm_msr_filter_range_compat *cr;
5605
5606			cr = &filter_compat.ranges[i];
5607			filter.ranges[i] = (struct kvm_msr_filter_range) {
5608				.flags = cr->flags,
5609				.nmsrs = cr->nmsrs,
5610				.base = cr->base,
5611				.bitmap = (__u8 *)(ulong)cr->bitmap,
5612			};
5613		}
5614
5615		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
5616		break;
5617	}
5618	}
5619
5620	return r;
5621}
5622#endif
5623
5624long kvm_arch_vm_ioctl(struct file *filp,
5625		       unsigned int ioctl, unsigned long arg)
5626{
5627	struct kvm *kvm = filp->private_data;
5628	void __user *argp = (void __user *)arg;
5629	int r = -ENOTTY;
5630	/*
5631	 * This union makes it completely explicit to gcc-3.x
5632	 * that these two variables' stack usage should be
5633	 * combined, not added together.
5634	 */
5635	union {
5636		struct kvm_pit_state ps;
5637		struct kvm_pit_state2 ps2;
5638		struct kvm_pit_config pit_config;
5639	} u;
5640
5641	switch (ioctl) {
5642	case KVM_SET_TSS_ADDR:
5643		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
5644		break;
5645	case KVM_SET_IDENTITY_MAP_ADDR: {
5646		u64 ident_addr;
5647
5648		mutex_lock(&kvm->lock);
5649		r = -EINVAL;
5650		if (kvm->created_vcpus)
5651			goto set_identity_unlock;
5652		r = -EFAULT;
5653		if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
5654			goto set_identity_unlock;
5655		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
5656set_identity_unlock:
5657		mutex_unlock(&kvm->lock);
5658		break;
5659	}
5660	case KVM_SET_NR_MMU_PAGES:
5661		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
5662		break;
5663	case KVM_GET_NR_MMU_PAGES:
5664		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
5665		break;
5666	case KVM_CREATE_IRQCHIP: {
5667		mutex_lock(&kvm->lock);
5668
5669		r = -EEXIST;
5670		if (irqchip_in_kernel(kvm))
5671			goto create_irqchip_unlock;
5672
5673		r = -EINVAL;
5674		if (kvm->created_vcpus)
5675			goto create_irqchip_unlock;
5676
5677		r = kvm_pic_init(kvm);
5678		if (r)
5679			goto create_irqchip_unlock;
5680
5681		r = kvm_ioapic_init(kvm);
5682		if (r) {
5683			kvm_pic_destroy(kvm);
5684			goto create_irqchip_unlock;
5685		}
5686
5687		r = kvm_setup_default_irq_routing(kvm);
5688		if (r) {
5689			kvm_ioapic_destroy(kvm);
5690			kvm_pic_destroy(kvm);
5691			goto create_irqchip_unlock;
5692		}
5693		/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
5694		smp_wmb();
5695		kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
5696	create_irqchip_unlock:
5697		mutex_unlock(&kvm->lock);
5698		break;
5699	}
5700	case KVM_CREATE_PIT:
5701		u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
5702		goto create_pit;
5703	case KVM_CREATE_PIT2:
5704		r = -EFAULT;
5705		if (copy_from_user(&u.pit_config, argp,
5706				   sizeof(struct kvm_pit_config)))
5707			goto out;
5708	create_pit:
5709		mutex_lock(&kvm->lock);
5710		r = -EEXIST;
5711		if (kvm->arch.vpit)
5712			goto create_pit_unlock;
5713		r = -ENOMEM;
5714		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
5715		if (kvm->arch.vpit)
5716			r = 0;
5717	create_pit_unlock:
5718		mutex_unlock(&kvm->lock);
5719		break;
5720	case KVM_GET_IRQCHIP: {
5721		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
5722		struct kvm_irqchip *chip;
5723
5724		chip = memdup_user(argp, sizeof(*chip));
5725		if (IS_ERR(chip)) {
5726			r = PTR_ERR(chip);
5727			goto out;
5728		}
5729
5730		r = -ENXIO;
5731		if (!irqchip_kernel(kvm))
5732			goto get_irqchip_out;
5733		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
5734		if (r)
5735			goto get_irqchip_out;
5736		r = -EFAULT;
5737		if (copy_to_user(argp, chip, sizeof(*chip)))
5738			goto get_irqchip_out;
5739		r = 0;
5740	get_irqchip_out:
5741		kfree(chip);
5742		break;
5743	}
5744	case KVM_SET_IRQCHIP: {
5745		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
5746		struct kvm_irqchip *chip;
5747
5748		chip = memdup_user(argp, sizeof(*chip));
5749		if (IS_ERR(chip)) {
5750			r = PTR_ERR(chip);
5751			goto out;
5752		}
5753
5754		r = -ENXIO;
5755		if (!irqchip_kernel(kvm))
5756			goto set_irqchip_out;
5757		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
5758	set_irqchip_out:
5759		kfree(chip);
5760		break;
5761	}
5762	case KVM_GET_PIT: {
5763		r = -EFAULT;
5764		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
5765			goto out;
5766		r = -ENXIO;
5767		if (!kvm->arch.vpit)
5768			goto out;
5769		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
5770		if (r)
5771			goto out;
5772		r = -EFAULT;
5773		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
5774			goto out;
5775		r = 0;
5776		break;
5777	}
5778	case KVM_SET_PIT: {
5779		r = -EFAULT;
5780		if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
5781			goto out;
5782		mutex_lock(&kvm->lock);
5783		r = -ENXIO;
5784		if (!kvm->arch.vpit)
5785			goto set_pit_out;
5786		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
5787set_pit_out:
5788		mutex_unlock(&kvm->lock);
5789		break;
5790	}
5791	case KVM_GET_PIT2: {
5792		r = -ENXIO;
5793		if (!kvm->arch.vpit)
5794			goto out;
5795		r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
5796		if (r)
5797			goto out;
5798		r = -EFAULT;
5799		if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
5800			goto out;
5801		r = 0;
5802		break;
5803	}
5804	case KVM_SET_PIT2: {
5805		r = -EFAULT;
5806		if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
5807			goto out;
5808		mutex_lock(&kvm->lock);
5809		r = -ENXIO;
5810		if (!kvm->arch.vpit)
5811			goto set_pit2_out;
5812		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
5813set_pit2_out:
5814		mutex_unlock(&kvm->lock);
5815		break;
5816	}
5817	case KVM_REINJECT_CONTROL: {
5818		struct kvm_reinject_control control;
5819		r = -EFAULT;
5820		if (copy_from_user(&control, argp, sizeof(control)))
5821			goto out;
5822		r = -ENXIO;
5823		if (!kvm->arch.vpit)
5824			goto out;
5825		r = kvm_vm_ioctl_reinject(kvm, &control);
5826		break;
5827	}
5828	case KVM_SET_BOOT_CPU_ID:
5829		r = 0;
5830		mutex_lock(&kvm->lock);
5831		if (kvm->created_vcpus)
5832			r = -EBUSY;
5833		else
5834			kvm->arch.bsp_vcpu_id = arg;
5835		mutex_unlock(&kvm->lock);
5836		break;
5837	case KVM_XEN_HVM_CONFIG: {
5838		struct kvm_xen_hvm_config xhc;
5839		r = -EFAULT;
5840		if (copy_from_user(&xhc, argp, sizeof(xhc)))
5841			goto out;
5842		r = -EINVAL;
5843		if (xhc.flags)
5844			goto out;
5845		memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
5846		r = 0;
5847		break;
5848	}
5849	case KVM_SET_CLOCK: {
5850		struct kvm_clock_data user_ns;
5851		u64 now_ns;
5852
5853		r = -EFAULT;
5854		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
5855			goto out;
5856
5857		r = -EINVAL;
5858		if (user_ns.flags)
5859			goto out;
5860
5861		r = 0;
5862		/*
5863		 * TODO: userspace has to take care of races with VCPU_RUN, so
5864		 * kvm_gen_update_masterclock() can be cut down to locked
5865		 * pvclock_update_vm_gtod_copy().
5866		 */
5867		kvm_gen_update_masterclock(kvm);
5868		now_ns = get_kvmclock_ns(kvm);
5869		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
5870		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
5871		break;
5872	}
5873	case KVM_GET_CLOCK: {
5874		struct kvm_clock_data user_ns;
5875		u64 now_ns;
5876
5877		now_ns = get_kvmclock_ns(kvm);
5878		user_ns.clock = now_ns;
5879		user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
5880		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
5881
5882		r = -EFAULT;
5883		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
5884			goto out;
5885		r = 0;
5886		break;
5887	}
5888	case KVM_MEMORY_ENCRYPT_OP: {
5889		r = -ENOTTY;
5890		if (kvm_x86_ops.mem_enc_op)
5891			r = kvm_x86_ops.mem_enc_op(kvm, argp);
5892		break;
5893	}
5894	case KVM_MEMORY_ENCRYPT_REG_REGION: {
5895		struct kvm_enc_region region;
5896
5897		r = -EFAULT;
5898		if (copy_from_user(&region, argp, sizeof(region)))
5899			goto out;
5900
5901		r = -ENOTTY;
5902		if (kvm_x86_ops.mem_enc_reg_region)
5903			r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
5904		break;
5905	}
5906	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
5907		struct kvm_enc_region region;
5908
5909		r = -EFAULT;
5910		if (copy_from_user(&region, argp, sizeof(region)))
5911			goto out;
5912
5913		r = -ENOTTY;
5914		if (kvm_x86_ops.mem_enc_unreg_region)
5915			r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
5916		break;
5917	}
5918	case KVM_HYPERV_EVENTFD: {
5919		struct kvm_hyperv_eventfd hvevfd;
5920
5921		r = -EFAULT;
5922		if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
5923			goto out;
5924		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
5925		break;
5926	}
5927	case KVM_SET_PMU_EVENT_FILTER:
5928		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
5929		break;
5930	case KVM_X86_SET_MSR_FILTER: {
5931		struct kvm_msr_filter __user *user_msr_filter = argp;
5932		struct kvm_msr_filter filter;
5933
5934		if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
5935			return -EFAULT;
5936
5937		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
5938		break;
5939	}
5940	default:
5941		r = -ENOTTY;
5942	}
5943out:
5944	return r;
5945}
5946
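/*
 * Build the MSR lists reported to userspace by probing which entries of
 * msrs_to_save_all[], emulated_msrs_all[] and msr_based_features_all[]
 * are actually usable on this host and with the active vendor module.
 */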
5947static void kvm_init_msr_list(void)
5948{
5949	struct x86_pmu_capability x86_pmu;
5950	u32 dummy[2];
5951	unsigned i;
5952
5953	BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
5954			 "Please update the fixed PMCs in msrs_to_saved_all[]");
5955
5956	perf_get_x86_pmu_capability(&x86_pmu);
5957
5958	num_msrs_to_save = 0;
5959	num_emulated_msrs = 0;
5960	num_msr_based_features = 0;
5961
5962	for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
5963		if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
5964			continue;
5965
5966		/*
5967		 * Even MSRs that are valid in the host may not be exposed
5968		 * to the guests in some cases.
5969		 */
5970		switch (msrs_to_save_all[i]) {
5971		case MSR_IA32_BNDCFGS:
5972			if (!kvm_mpx_supported())
5973				continue;
5974			break;
5975		case MSR_TSC_AUX:
5976			if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
5977				continue;
5978			break;
5979		case MSR_IA32_UMWAIT_CONTROL:
5980			if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
5981				continue;
5982			break;
5983		case MSR_IA32_RTIT_CTL:
5984		case MSR_IA32_RTIT_STATUS:
5985			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
5986				continue;
5987			break;
5988		case MSR_IA32_RTIT_CR3_MATCH:
5989			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5990			    !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
5991				continue;
5992			break;
5993		case MSR_IA32_RTIT_OUTPUT_BASE:
5994		case MSR_IA32_RTIT_OUTPUT_MASK:
5995			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5996				(!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
5997				 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
5998				continue;
5999			break;
6000		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
6001			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
6002				msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
6003				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
6004				continue;
6005			break;
6006		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
6007			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
6008			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
6009				continue;
6010			break;
6011		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
6012			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
6013			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
6014				continue;
6015			break;
6016		default:
6017			break;
6018		}
6019
6020		msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
6021	}
6022
6023	for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
6024		if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
6025			continue;
6026
6027		emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
6028	}
6029
6030	for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
6031		struct kvm_msr_entry msr;
6032
6033		msr.index = msr_based_features_all[i];
6034		if (kvm_get_msr_feature(&msr))
6035			continue;
6036
6037		msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
6038	}
6039}
6040
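/*
 * Hand an MMIO write to the in-kernel local APIC or the KVM_MMIO_BUS in
 * chunks of at most 8 bytes; returns the number of bytes handled in the
 * kernel.  vcpu_mmio_read() below is the read-side counterpart.
 */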
6041static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
6042			   const void *v)
6043{
6044	int handled = 0;
6045	int n;
6046
6047	do {
6048		n = min(len, 8);
6049		if (!(lapic_in_kernel(vcpu) &&
6050		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
6051		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
6052			break;
6053		handled += n;
6054		addr += n;
6055		len -= n;
6056		v += n;
6057	} while (len);
6058
6059	return handled;
6060}
6061
6062static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
6063{
6064	int handled = 0;
6065	int n;
6066
6067	do {
6068		n = min(len, 8);
6069		if (!(lapic_in_kernel(vcpu) &&
6070		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
6071					 addr, n, v))
6072		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
6073			break;
6074		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
6075		handled += n;
6076		addr += n;
6077		len -= n;
6078		v += n;
6079	} while (len);
6080
6081	return handled;
6082}
6083
6084static void kvm_set_segment(struct kvm_vcpu *vcpu,
6085			struct kvm_segment *var, int seg)
6086{
6087	kvm_x86_ops.set_segment(vcpu, var, seg);
6088}
6089
6090void kvm_get_segment(struct kvm_vcpu *vcpu,
6091		     struct kvm_segment *var, int seg)
6092{
6093	kvm_x86_ops.get_segment(vcpu, var, seg);
6094}
6095
6096gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
6097			   struct x86_exception *exception)
6098{
6099	gpa_t t_gpa;
6100
6101	BUG_ON(!mmu_is_nested(vcpu));
6102
6103	/* NPT walks are always user-walks */
6104	access |= PFERR_USER_MASK;
6105	t_gpa  = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
6106
6107	return t_gpa;
6108}
6109
6110gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
6111			      struct x86_exception *exception)
6112{
6113	u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6114	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
6115}
6116
6117gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
6118				struct x86_exception *exception)
6119{
6120	u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6121	access |= PFERR_FETCH_MASK;
6122	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
6123}
6124
6125gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
6126			       struct x86_exception *exception)
6127{
6128	u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6129	access |= PFERR_WRITE_MASK;
6130	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
6131}
6132
6133/* used to access any guest's mapped memory without checking CPL */
6134gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
6135				struct x86_exception *exception)
6136{
6137	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
6138}
6139
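/*
 * Copy guest-virtual memory into @val one page at a time, translating
 * each page through the current walk_mmu and propagating any
 * translation fault back to the caller.
 */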
6140static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
6141				      struct kvm_vcpu *vcpu, u32 access,
6142				      struct x86_exception *exception)
6143{
6144	void *data = val;
6145	int r = X86EMUL_CONTINUE;
6146
6147	while (bytes) {
6148		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
6149							    exception);
6150		unsigned offset = addr & (PAGE_SIZE-1);
6151		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
6152		int ret;
6153
6154		if (gpa == UNMAPPED_GVA)
6155			return X86EMUL_PROPAGATE_FAULT;
6156		ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
6157					       offset, toread);
6158		if (ret < 0) {
6159			r = X86EMUL_IO_NEEDED;
6160			goto out;
6161		}
6162
6163		bytes -= toread;
6164		data += toread;
6165		addr += toread;
6166	}
6167out:
6168	return r;
6169}
6170
6171/* used for instruction fetching */
6172static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
6173				gva_t addr, void *val, unsigned int bytes,
6174				struct x86_exception *exception)
6175{
6176	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6177	u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6178	unsigned offset;
6179	int ret;
6180
6181	/* Inline kvm_read_guest_virt_helper for speed.  */
6182	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
6183						    exception);
6184	if (unlikely(gpa == UNMAPPED_GVA))
6185		return X86EMUL_PROPAGATE_FAULT;
6186
6187	offset = addr & (PAGE_SIZE-1);
6188	if (WARN_ON(offset + bytes > PAGE_SIZE))
6189		bytes = (unsigned)PAGE_SIZE - offset;
6190	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
6191				       offset, bytes);
6192	if (unlikely(ret < 0))
6193		return X86EMUL_IO_NEEDED;
6194
6195	return X86EMUL_CONTINUE;
6196}
6197
6198int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
6199			       gva_t addr, void *val, unsigned int bytes,
6200			       struct x86_exception *exception)
6201{
6202	u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6203
6204	/*
6205	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
6206	 * is returned, but our callers are not ready for that and they blindly
6207	 * call kvm_inject_page_fault.  Ensure that they at least do not leak
6208	 * uninitialized kernel stack memory into cr2 and error code.
6209	 */
6210	memset(exception, 0, sizeof(*exception));
6211	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
6212					  exception);
6213}
6214EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
6215
6216static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
6217			     gva_t addr, void *val, unsigned int bytes,
6218			     struct x86_exception *exception, bool system)
6219{
6220	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6221	u32 access = 0;
6222
6223	if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
6224		access |= PFERR_USER_MASK;
6225
6226	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
6227}
6228
6229static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
6230		unsigned long addr, void *val, unsigned int bytes)
6231{
6232	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6233	int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
6234
6235	return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
6236}
6237
6238static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
6239				      struct kvm_vcpu *vcpu, u32 access,
6240				      struct x86_exception *exception)
6241{
6242	void *data = val;
6243	int r = X86EMUL_CONTINUE;
6244
6245	while (bytes) {
6246		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
6247							     access,
6248							     exception);
6249		unsigned offset = addr & (PAGE_SIZE-1);
6250		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
6251		int ret;
6252
6253		if (gpa == UNMAPPED_GVA)
6254			return X86EMUL_PROPAGATE_FAULT;
6255		ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
6256		if (ret < 0) {
6257			r = X86EMUL_IO_NEEDED;
6258			goto out;
6259		}
6260
6261		bytes -= towrite;
6262		data += towrite;
6263		addr += towrite;
6264	}
6265out:
6266	return r;
6267}
6268
6269static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
6270			      unsigned int bytes, struct x86_exception *exception,
6271			      bool system)
6272{
6273	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6274	u32 access = PFERR_WRITE_MASK;
6275
6276	if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
6277		access |= PFERR_USER_MASK;
6278
6279	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
6280					   access, exception);
6281}
6282
6283int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
6284				unsigned int bytes, struct x86_exception *exception)
6285{
6286	/* kvm_write_guest_virt_system can pull in tons of pages. */
6287	vcpu->arch.l1tf_flush_l1d = true;
6288
6289	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
6290					   PFERR_WRITE_MASK, exception);
6291}
6292EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
6293
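/*
 * Handle a #UD intercept: if forced emulation is enabled and the
 * ud2 + "kvm" signature is present at RIP, skip the signature and
 * emulate the following instruction (EMULTYPE_TRAP_UD_FORCED);
 * otherwise fall back to normal #UD trap emulation.
 */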
6294int handle_ud(struct kvm_vcpu *vcpu)
6295{
6296	static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
6297	int emul_type = EMULTYPE_TRAP_UD;
6298	char sig[5]; /* ud2; .ascii "kvm" */
6299	struct x86_exception e;
6300
6301	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
6302		return 1;
6303
6304	if (force_emulation_prefix &&
6305	    kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
6306				sig, sizeof(sig), &e) == 0 &&
6307	    memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
6308		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
6309		emul_type = EMULTYPE_TRAP_UD_FORCED;
6310	}
6311
6312	return kvm_emulate_instruction(vcpu, emul_type);
6313}
6314EXPORT_SYMBOL_GPL(handle_ud);
6315
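/*
 * Return 1 if @gpa must be treated as MMIO (the APIC access page or a
 * cached MMIO gfn), 0 if it is ordinary guest memory.
 */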
6316static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
6317			    gpa_t gpa, bool write)
6318{
6319	/* For APIC access vmexit */
6320	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
6321		return 1;
6322
6323	if (vcpu_match_mmio_gpa(vcpu, gpa)) {
6324		trace_vcpu_match_mmio(gva, gpa, write, true);
6325		return 1;
6326	}
6327
6328	return 0;
6329}
6330
6331static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
6332				gpa_t *gpa, struct x86_exception *exception,
6333				bool write)
6334{
6335	u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
6336		| (write ? PFERR_WRITE_MASK : 0);
6337
6338	/*
6339	 * Currently PKRU is only applied to EPT-enabled guests, so there
6340	 * is no pkey in the EPT page table for an L1 guest or in the EPT
6341	 * shadow page table for an L2 guest.
6342	 */
6343	if (vcpu_match_mmio_gva(vcpu, gva)
6344	    && !permission_fault(vcpu, vcpu->arch.walk_mmu,
6345				 vcpu->arch.mmio_access, 0, access)) {
6346		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
6347					(gva & (PAGE_SIZE - 1));
6348		trace_vcpu_match_mmio(gva, *gpa, write, false);
6349		return 1;
6350	}
6351
6352	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
6353
6354	if (*gpa == UNMAPPED_GVA)
6355		return -1;
6356
6357	return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
6358}
6359
6360int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
6361			const void *val, int bytes)
6362{
6363	int ret;
6364
6365	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
6366	if (ret < 0)
6367		return 0;
6368	kvm_page_track_write(vcpu, gpa, val, bytes);
6369	return 1;
6370}
6371
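/*
 * Callbacks that let emulator_read_write() drive RAM and MMIO accesses
 * through a single code path; instantiated below as read_emultor and
 * write_emultor.
 */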
6372struct read_write_emulator_ops {
6373	int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
6374				  int bytes);
6375	int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
6376				  void *val, int bytes);
6377	int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
6378			       int bytes, void *val);
6379	int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
6380				    void *val, int bytes);
6381	bool write;
6382};
6383
6384static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
6385{
6386	if (vcpu->mmio_read_completed) {
6387		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
6388			       vcpu->mmio_fragments[0].gpa, val);
6389		vcpu->mmio_read_completed = 0;
6390		return 1;
6391	}
6392
6393	return 0;
6394}
6395
6396static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
6397			void *val, int bytes)
6398{
6399	return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
6400}
6401
6402static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
6403			 void *val, int bytes)
6404{
6405	return emulator_write_phys(vcpu, gpa, val, bytes);
6406}
6407
6408static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
6409{
6410	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
6411	return vcpu_mmio_write(vcpu, gpa, bytes, val);
6412}
6413
6414static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
6415			  void *val, int bytes)
6416{
6417	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
6418	return X86EMUL_IO_NEEDED;
6419}
6420
6421static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
6422			   void *val, int bytes)
6423{
6424	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
6425
6426	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
6427	return X86EMUL_CONTINUE;
6428}
6429
6430static const struct read_write_emulator_ops read_emultor = {
6431	.read_write_prepare = read_prepare,
6432	.read_write_emulate = read_emulate,
6433	.read_write_mmio = vcpu_mmio_read,
6434	.read_write_exit_mmio = read_exit_mmio,
6435};
6436
6437static const struct read_write_emulator_ops write_emultor = {
6438	.read_write_emulate = write_emulate,
6439	.read_write_mmio = write_mmio,
6440	.read_write_exit_mmio = write_exit_mmio,
6441	.write = true,
6442};
6443
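/*
 * Emulate a read or write of at most one page: reuse the GPA from the
 * NPF exit when possible, try ordinary guest memory first, then
 * in-kernel MMIO devices, and queue whatever remains as an MMIO
 * fragment for userspace.
 */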
6444static int emulator_read_write_onepage(unsigned long addr, void *val,
6445				       unsigned int bytes,
6446				       struct x86_exception *exception,
6447				       struct kvm_vcpu *vcpu,
6448				       const struct read_write_emulator_ops *ops)
6449{
6450	gpa_t gpa;
6451	int handled, ret;
6452	bool write = ops->write;
6453	struct kvm_mmio_fragment *frag;
6454	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
6455
6456	/*
6457	 * If the exit was due to an NPF we may already have a GPA.
6458	 * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
6459	 * Note, this cannot be used on string operations since a string
6460	 * operation using REP will only have the initial GPA from the NPF
6461	 * that occurred.
6462	 */
6463	if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
6464	    (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
6465		gpa = ctxt->gpa_val;
6466		ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
6467	} else {
6468		ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
6469		if (ret < 0)
6470			return X86EMUL_PROPAGATE_FAULT;
6471	}
6472
6473	if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
6474		return X86EMUL_CONTINUE;
6475
6476	/*
6477	 * Is this MMIO handled locally?
6478	 */
6479	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
6480	if (handled == bytes)
6481		return X86EMUL_CONTINUE;
6482
6483	gpa += handled;
6484	bytes -= handled;
6485	val += handled;
6486
6487	WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
6488	frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
6489	frag->gpa = gpa;
6490	frag->data = val;
6491	frag->len = bytes;
6492	return X86EMUL_CONTINUE;
6493}
6494
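/*
 * Top-level emulated memory access: split accesses that cross a page
 * boundary and, if any MMIO fragments were queued, set up a
 * KVM_EXIT_MMIO exit for the first fragment.
 */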
6495static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
6496			unsigned long addr,
6497			void *val, unsigned int bytes,
6498			struct x86_exception *exception,
6499			const struct read_write_emulator_ops *ops)
6500{
6501	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6502	gpa_t gpa;
6503	int rc;
6504
6505	if (ops->read_write_prepare &&
6506		  ops->read_write_prepare(vcpu, val, bytes))
6507		return X86EMUL_CONTINUE;
6508
6509	vcpu->mmio_nr_fragments = 0;
6510
6511	/* Crossing a page boundary? */
6512	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
6513		int now;
6514
6515		now = -addr & ~PAGE_MASK;
6516		rc = emulator_read_write_onepage(addr, val, now, exception,
6517						 vcpu, ops);
6518
6519		if (rc != X86EMUL_CONTINUE)
6520			return rc;
6521		addr += now;
6522		if (ctxt->mode != X86EMUL_MODE_PROT64)
6523			addr = (u32)addr;
6524		val += now;
6525		bytes -= now;
6526	}
6527
6528	rc = emulator_read_write_onepage(addr, val, bytes, exception,
6529					 vcpu, ops);
6530	if (rc != X86EMUL_CONTINUE)
6531		return rc;
6532
6533	if (!vcpu->mmio_nr_fragments)
6534		return rc;
6535
6536	gpa = vcpu->mmio_fragments[0].gpa;
6537
6538	vcpu->mmio_needed = 1;
6539	vcpu->mmio_cur_fragment = 0;
6540
6541	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
6542	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
6543	vcpu->run->exit_reason = KVM_EXIT_MMIO;
6544	vcpu->run->mmio.phys_addr = gpa;
6545
6546	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
6547}
6548
6549static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
6550				  unsigned long addr,
6551				  void *val,
6552				  unsigned int bytes,
6553				  struct x86_exception *exception)
6554{
6555	return emulator_read_write(ctxt, addr, val, bytes,
6556				   exception, &read_emultor);
6557}
6558
6559static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
6560			    unsigned long addr,
6561			    const void *val,
6562			    unsigned int bytes,
6563			    struct x86_exception *exception)
6564{
6565	return emulator_read_write(ctxt, addr, (void *)val, bytes,
6566				   exception, &write_emultor);
6567}
6568
6569#define CMPXCHG_TYPE(t, ptr, old, new) \
6570	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
6571
6572#ifdef CONFIG_X86_64
6573#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
6574#else
6575#  define CMPXCHG64(ptr, old, new) \
6576	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
6577#endif
6578
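/*
 * Emulate a locked cmpxchg against guest memory by mapping the target
 * gfn and performing a host cmpxchg; fall back to a plain emulated
 * write if the operand is too large, misaligned, crosses the relevant
 * boundary or cannot be mapped.
 */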
6579static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
6580				     unsigned long addr,
6581				     const void *old,
6582				     const void *new,
6583				     unsigned int bytes,
6584				     struct x86_exception *exception)
6585{
6586	struct kvm_host_map map;
6587	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6588	u64 page_line_mask;
6589	gpa_t gpa;
6590	char *kaddr;
6591	bool exchanged;
6592
6593	/* the guest's cmpxchg8b has to be emulated atomically */
6594	if (bytes > 8 || (bytes & (bytes - 1)))
6595		goto emul_write;
6596
6597	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
6598
6599	if (gpa == UNMAPPED_GVA ||
6600	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
6601		goto emul_write;
6602
6603	/*
6604	 * Emulate the atomic as a straight write to avoid #AC if SLD is
6605	 * enabled in the host and the access splits a cache line.
6606	 */
6607	if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
6608		page_line_mask = ~(cache_line_size() - 1);
6609	else
6610		page_line_mask = PAGE_MASK;
6611
6612	if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
6613		goto emul_write;
6614
6615	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
6616		goto emul_write;
6617
6618	kaddr = map.hva + offset_in_page(gpa);
6619
6620	switch (bytes) {
6621	case 1:
6622		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
6623		break;
6624	case 2:
6625		exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
6626		break;
6627	case 4:
6628		exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
6629		break;
6630	case 8:
6631		exchanged = CMPXCHG64(kaddr, old, new);
6632		break;
6633	default:
6634		BUG();
6635	}
6636
6637	kvm_vcpu_unmap(vcpu, &map, true);
6638
6639	if (!exchanged)
6640		return X86EMUL_CMPXCHG_FAILED;
6641
6642	kvm_page_track_write(vcpu, gpa, new, bytes);
6643
6644	return X86EMUL_CONTINUE;
6645
6646emul_write:
6647	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
6648
6649	return emulator_write_emulated(ctxt, addr, new, bytes, exception);
6650}
6651
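/*
 * Try to complete the pending PIO transaction against the in-kernel
 * KVM_PIO_BUS; returns 0 if every chunk was handled, non-zero if
 * userspace has to take over.
 */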
6652static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
6653{
6654	int r = 0, i;
6655
6656	for (i = 0; i < vcpu->arch.pio.count; i++) {
6657		if (vcpu->arch.pio.in)
6658			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
6659					    vcpu->arch.pio.size, pd);
6660		else
6661			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
6662					     vcpu->arch.pio.port, vcpu->arch.pio.size,
6663					     pd);
6664		if (r)
6665			break;
6666		pd += vcpu->arch.pio.size;
6667	}
6668	return r;
6669}
6670
6671static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
6672			       unsigned short port, void *val,
6673			       unsigned int count, bool in)
6674{
6675	vcpu->arch.pio.port = port;
6676	vcpu->arch.pio.in = in;
6677	vcpu->arch.pio.count  = count;
6678	vcpu->arch.pio.size = size;
6679
6680	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
6681		vcpu->arch.pio.count = 0;
6682		return 1;
6683	}
6684
6685	vcpu->run->exit_reason = KVM_EXIT_IO;
6686	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
6687	vcpu->run->io.size = size;
6688	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
6689	vcpu->run->io.count = count;
6690	vcpu->run->io.port = port;
6691
6692	return 0;
6693}
6694
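/*
 * Emulated IN: if data from a previous KVM_EXIT_IO exit is already
 * buffered, consume it; otherwise start a new PIO transaction, which
 * either completes against the in-kernel bus or sets up an exit to
 * userspace.
 */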
6695static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
6696			   unsigned short port, void *val, unsigned int count)
6697{
6698	int ret;
6699
6700	if (vcpu->arch.pio.count)
6701		goto data_avail;
6702
6703	memset(vcpu->arch.pio_data, 0, size * count);
6704
6705	ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
6706	if (ret) {
6707data_avail:
6708		memcpy(val, vcpu->arch.pio_data, size * count);
6709		trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
6710		vcpu->arch.pio.count = 0;
6711		return 1;
6712	}
6713
6714	return 0;
6715}
6716
6717static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
6718				    int size, unsigned short port, void *val,
6719				    unsigned int count)
6720{
6721	return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
6722
6723}
6724
6725static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
6726			    unsigned short port, const void *val,
6727			    unsigned int count)
6728{
6729	memcpy(vcpu->arch.pio_data, val, size * count);
6730	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
6731	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
6732}
6733
6734static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
6735				     int size, unsigned short port,
6736				     const void *val, unsigned int count)
6737{
6738	return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
6739}
6740
6741static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
6742{
6743	return kvm_x86_ops.get_segment_base(vcpu, seg);
6744}
6745
6746static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
6747{
6748	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
6749}
6750
6751static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
6752{
6753	if (!need_emulate_wbinvd(vcpu))
6754		return X86EMUL_CONTINUE;
6755
6756	if (kvm_x86_ops.has_wbinvd_exit()) {
6757		int cpu = get_cpu();
6758
6759		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
6760		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
6761				wbinvd_ipi, NULL, 1);
6762		put_cpu();
6763		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
6764	} else
6765		wbinvd();
6766	return X86EMUL_CONTINUE;
6767}
6768
6769int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
6770{
6771	kvm_emulate_wbinvd_noskip(vcpu);
6772	return kvm_skip_emulated_instruction(vcpu);
6773}
6774EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
6775
6776
6777
6778static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
6779{
6780	kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
6781}
6782
6783static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
6784			   unsigned long *dest)
6785{
6786	return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
6787}
6788
6789static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
6790			   unsigned long value)
6791{
6792
6793	return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
6794}
6795
6796static u64 mk_cr_64(u64 curr_cr, u32 new_val)
6797{
6798	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
6799}
6800
6801static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
6802{
6803	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6804	unsigned long value;
6805
6806	switch (cr) {
6807	case 0:
6808		value = kvm_read_cr0(vcpu);
6809		break;
6810	case 2:
6811		value = vcpu->arch.cr2;
6812		break;
6813	case 3:
6814		value = kvm_read_cr3(vcpu);
6815		break;
6816	case 4:
6817		value = kvm_read_cr4(vcpu);
6818		break;
6819	case 8:
6820		value = kvm_get_cr8(vcpu);
6821		break;
6822	default:
6823		kvm_err("%s: unexpected cr %u\n", __func__, cr);
6824		return 0;
6825	}
6826
6827	return value;
6828}
6829
6830static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
6831{
6832	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6833	int res = 0;
6834
6835	switch (cr) {
6836	case 0:
6837		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
6838		break;
6839	case 2:
6840		vcpu->arch.cr2 = val;
6841		break;
6842	case 3:
6843		res = kvm_set_cr3(vcpu, val);
6844		break;
6845	case 4:
6846		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
6847		break;
6848	case 8:
6849		res = kvm_set_cr8(vcpu, val);
6850		break;
6851	default:
6852		kvm_err("%s: unexpected cr %u\n", __func__, cr);
6853		res = -1;
6854	}
6855
6856	return res;
6857}
6858
6859static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
6860{
6861	return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
6862}
6863
6864static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6865{
6866	kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
6867}
6868
6869static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6870{
6871	kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
6872}
6873
6874static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6875{
6876	kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
6877}
6878
6879static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6880{
6881	kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
6882}
6883
6884static unsigned long emulator_get_cached_segment_base(
6885	struct x86_emulate_ctxt *ctxt, int seg)
6886{
6887	return get_segment_base(emul_to_vcpu(ctxt), seg);
6888}
6889
6890static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
6891				 struct desc_struct *desc, u32 *base3,
6892				 int seg)
6893{
6894	struct kvm_segment var;
6895
6896	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
6897	*selector = var.selector;
6898
6899	if (var.unusable) {
6900		memset(desc, 0, sizeof(*desc));
6901		if (base3)
6902			*base3 = 0;
6903		return false;
6904	}
6905
6906	if (var.g)
6907		var.limit >>= 12;
6908	set_desc_limit(desc, var.limit);
6909	set_desc_base(desc, (unsigned long)var.base);
6910#ifdef CONFIG_X86_64
6911	if (base3)
6912		*base3 = var.base >> 32;
6913#endif
6914	desc->type = var.type;
6915	desc->s = var.s;
6916	desc->dpl = var.dpl;
6917	desc->p = var.present;
6918	desc->avl = var.avl;
6919	desc->l = var.l;
6920	desc->d = var.db;
6921	desc->g = var.g;
6922
6923	return true;
6924}
6925
6926static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
6927				 struct desc_struct *desc, u32 base3,
6928				 int seg)
6929{
6930	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6931	struct kvm_segment var;
6932
6933	var.selector = selector;
6934	var.base = get_desc_base(desc);
6935#ifdef CONFIG_X86_64
6936	var.base |= ((u64)base3) << 32;
6937#endif
6938	var.limit = get_desc_limit(desc);
6939	if (desc->g)
6940		var.limit = (var.limit << 12) | 0xfff;
6941	var.type = desc->type;
6942	var.dpl = desc->dpl;
6943	var.db = desc->d;
6944	var.s = desc->s;
6945	var.l = desc->l;
6946	var.g = desc->g;
6947	var.avl = desc->avl;
6948	var.present = desc->p;
6949	var.unusable = !var.present;
6950	var.padding = 0;
6951
6952	kvm_set_segment(vcpu, &var, seg);
6953	return;
6954}
6955
6956static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
6957			    u32 msr_index, u64 *pdata)
6958{
6959	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6960	int r;
6961
6962	r = kvm_get_msr(vcpu, msr_index, pdata);
6963
6964	if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
6965		/* Bounce to user space */
6966		return X86EMUL_IO_NEEDED;
6967	}
6968
6969	return r;
6970}
6971
6972static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
6973			    u32 msr_index, u64 data)
6974{
6975	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6976	int r;
6977
6978	r = kvm_set_msr(vcpu, msr_index, data);
6979
6980	if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
6981		/* Bounce to user space */
6982		return X86EMUL_IO_NEEDED;
6983	}
6984
6985	return r;
6986}
6987
6988static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
6989{
6990	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6991
6992	return vcpu->arch.smbase;
6993}
6994
6995static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
6996{
6997	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6998
6999	vcpu->arch.smbase = smbase;
7000}
7001
7002static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
7003			      u32 pmc)
7004{
7005	return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
7006}
7007
7008static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
7009			     u32 pmc, u64 *pdata)
7010{
7011	return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
7012}
7013
7014static void emulator_halt(struct x86_emulate_ctxt *ctxt)
7015{
7016	emul_to_vcpu(ctxt)->arch.halt_request = 1;
7017}
7018
7019static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
7020			      struct x86_instruction_info *info,
7021			      enum x86_intercept_stage stage)
7022{
7023	return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
7024					    &ctxt->exception);
7025}
7026
7027static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
7028			      u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
7029			      bool exact_only)
7030{
7031	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
7032}
7033
7034static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
7035{
7036	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
7037}
7038
7039static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
7040{
7041	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
7042}
7043
7044static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
7045{
7046	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
7047}
7048
7049static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
7050{
7051	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
7052}
7053
7054static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
7055{
7056	return kvm_register_read(emul_to_vcpu(ctxt), reg);
7057}
7058
7059static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
7060{
7061	kvm_register_write(emul_to_vcpu(ctxt), reg, val);
7062}
7063
7064static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
7065{
7066	kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
7067}
7068
7069static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
7070{
7071	return emul_to_vcpu(ctxt)->arch.hflags;
7072}
7073
7074static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
7075{
7076	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7077
7078	vcpu->arch.hflags = emul_flags;
7079	kvm_mmu_reset_context(vcpu);
7080}
7081
7082static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
7083				  const char *smstate)
7084{
7085	return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
7086}
7087
7088static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
7089{
7090	kvm_smm_changed(emul_to_vcpu(ctxt));
7091}
7092
7093static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
7094{
7095	return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
7096}
7097
7098static const struct x86_emulate_ops emulate_ops = {
7099	.read_gpr            = emulator_read_gpr,
7100	.write_gpr           = emulator_write_gpr,
7101	.read_std            = emulator_read_std,
7102	.write_std           = emulator_write_std,
7103	.read_phys           = kvm_read_guest_phys_system,
7104	.fetch               = kvm_fetch_guest_virt,
7105	.read_emulated       = emulator_read_emulated,
7106	.write_emulated      = emulator_write_emulated,
7107	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
7108	.invlpg              = emulator_invlpg,
7109	.pio_in_emulated     = emulator_pio_in_emulated,
7110	.pio_out_emulated    = emulator_pio_out_emulated,
7111	.get_segment         = emulator_get_segment,
7112	.set_segment         = emulator_set_segment,
7113	.get_cached_segment_base = emulator_get_cached_segment_base,
7114	.get_gdt             = emulator_get_gdt,
7115	.get_idt	     = emulator_get_idt,
7116	.set_gdt             = emulator_set_gdt,
7117	.set_idt	     = emulator_set_idt,
7118	.get_cr              = emulator_get_cr,
7119	.set_cr              = emulator_set_cr,
7120	.cpl                 = emulator_get_cpl,
7121	.get_dr              = emulator_get_dr,
7122	.set_dr              = emulator_set_dr,
7123	.get_smbase          = emulator_get_smbase,
7124	.set_smbase          = emulator_set_smbase,
7125	.set_msr             = emulator_set_msr,
7126	.get_msr             = emulator_get_msr,
7127	.check_pmc	     = emulator_check_pmc,
7128	.read_pmc            = emulator_read_pmc,
7129	.halt                = emulator_halt,
7130	.wbinvd              = emulator_wbinvd,
7131	.fix_hypercall       = emulator_fix_hypercall,
7132	.intercept           = emulator_intercept,
7133	.get_cpuid           = emulator_get_cpuid,
7134	.guest_has_long_mode = emulator_guest_has_long_mode,
7135	.guest_has_movbe     = emulator_guest_has_movbe,
7136	.guest_has_fxsr      = emulator_guest_has_fxsr,
7137	.guest_has_rdpid     = emulator_guest_has_rdpid,
7138	.set_nmi_mask        = emulator_set_nmi_mask,
7139	.get_hflags          = emulator_get_hflags,
7140	.set_hflags          = emulator_set_hflags,
7141	.pre_leave_smm       = emulator_pre_leave_smm,
7142	.post_leave_smm      = emulator_post_leave_smm,
7143	.set_xcr             = emulator_set_xcr,
7144};
7145
7146static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
7147{
7148	u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
7149	/*
7150	 * An sti; sti; sequence only disables interrupts for the first
7151	 * instruction. So, if the last instruction, be it emulated or
7152	 * not, left the system with the INT_STI flag enabled, it
7153	 * means that the last instruction was an sti. We should not
7154	 * leave the flag on in this case. The same goes for mov ss.
7155	 */
7156	if (int_shadow & mask)
7157		mask = 0;
7158	if (unlikely(int_shadow || mask)) {
7159		kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
7160		if (!mask)
7161			kvm_make_request(KVM_REQ_EVENT, vcpu);
7162	}
7163}
7164
7165static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
7166{
7167	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7168	if (ctxt->exception.vector == PF_VECTOR)
7169		return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
7170
7171	if (ctxt->exception.error_code_valid)
7172		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
7173				      ctxt->exception.error_code);
7174	else
7175		kvm_queue_exception(vcpu, ctxt->exception.vector);
7176	return false;
7177}
7178
7179static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
7180{
7181	struct x86_emulate_ctxt *ctxt;
7182
7183	ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
7184	if (!ctxt) {
7185		pr_err("kvm: failed to allocate vcpu's emulator\n");
7186		return NULL;
7187	}
7188
7189	ctxt->vcpu = vcpu;
7190	ctxt->ops = &emulate_ops;
7191	vcpu->arch.emulate_ctxt = ctxt;
7192
7193	return ctxt;
7194}
7195
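/*
 * Reset the per-vCPU emulation context from current guest state:
 * rflags, rip, the CPU mode and the decode cache.
 */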
7196static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
7197{
7198	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7199	int cs_db, cs_l;
7200
7201	kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
7202
7203	ctxt->gpa_available = false;
7204	ctxt->eflags = kvm_get_rflags(vcpu);
7205	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
7206
7207	ctxt->eip = kvm_rip_read(vcpu);
7208	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
7209		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
7210		     (cs_l && is_long_mode(vcpu))	? X86EMUL_MODE_PROT64 :
7211		     cs_db				? X86EMUL_MODE_PROT32 :
7212							  X86EMUL_MODE_PROT16;
7213	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
7214	BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
7215	BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
7216
7217	ctxt->interruptibility = 0;
7218	ctxt->have_exception = false;
7219	ctxt->exception.vector = -1;
7220	ctxt->perm_ok = false;
7221
7222	init_decode_cache(ctxt);
7223	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
7224}
7225
7226void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
7227{
7228	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7229	int ret;
7230
7231	init_emulate_ctxt(vcpu);
7232
7233	ctxt->op_bytes = 2;
7234	ctxt->ad_bytes = 2;
7235	ctxt->_eip = ctxt->eip + inc_eip;
7236	ret = emulate_int_real(ctxt, irq);
7237
7238	if (ret != X86EMUL_CONTINUE) {
7239		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
7240	} else {
7241		ctxt->eip = ctxt->_eip;
7242		kvm_rip_write(vcpu, ctxt->eip);
7243		kvm_set_rflags(vcpu, ctxt->eflags);
7244	}
7245}
7246EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
7247
7248static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
7249{
7250	++vcpu->stat.insn_emulation_fail;
7251	trace_kvm_emulate_insn_failed(vcpu);
7252
7253	if (emulation_type & EMULTYPE_VMWARE_GP) {
7254		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7255		return 1;
7256	}
7257
7258	if (emulation_type & EMULTYPE_SKIP) {
7259		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7260		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7261		vcpu->run->internal.ndata = 0;
7262		return 0;
7263	}
7264
7265	kvm_queue_exception(vcpu, UD_VECTOR);
7266
7267	if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
7268		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7269		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7270		vcpu->run->internal.ndata = 0;
7271		return 0;
7272	}
7273
7274	return 1;
7275}
7276
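/*
 * After emulation of a faulting instruction fails, decide whether
 * unprotecting the shadowed guest page and re-executing the instruction
 * is likely to succeed, instead of reporting an emulation error to
 * userspace.
 */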
7277static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
7278				  bool write_fault_to_shadow_pgtable,
7279				  int emulation_type)
7280{
7281	gpa_t gpa = cr2_or_gpa;
7282	kvm_pfn_t pfn;
7283
7284	if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
7285		return false;
7286
7287	if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7288	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
7289		return false;
7290
7291	if (!vcpu->arch.mmu->direct_map) {
7292		/*
7293		 * Write permission should be allowed since only
7294		 * write access needs to be emulated.
7295		 */
7296		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
7297
7298		/*
7299		 * If the mapping is invalid in the guest, let the CPU retry
7300		 * it to generate a fault.
7301		 */
7302		if (gpa == UNMAPPED_GVA)
7303			return true;
7304	}
7305
7306	/*
7307	 * Do not retry the unhandleable instruction if it faults on the
7308	 * readonly host memory, otherwise it will go into an infinite loop:
7309	 * retry instruction -> write #PF -> emulation fail -> retry
7310	 * instruction -> ...
7311	 */
7312	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
7313
7314	/*
7315	 * If the instruction faulted on an error pfn, it cannot be fixed;
7316	 * report the error to userspace.
7317	 */
7318	if (is_error_noslot_pfn(pfn))
7319		return false;
7320
7321	kvm_release_pfn_clean(pfn);
7322
7323	/* The instructions are well-emulated on direct mmu. */
7324	if (vcpu->arch.mmu->direct_map) {
7325		unsigned int indirect_shadow_pages;
7326
7327		spin_lock(&vcpu->kvm->mmu_lock);
7328		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
7329		spin_unlock(&vcpu->kvm->mmu_lock);
7330
7331		if (indirect_shadow_pages)
7332			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
7333
7334		return true;
7335	}
7336
7337	/*
7338	 * If emulation was due to an access to a shadowed page table
7339	 * and it failed, try to unshadow the page and re-enter the
7340	 * guest to let the CPU execute the instruction.
7341	 */
7342	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
7343
7344	/*
7345	 * If the access faults on its page table, it cannot
7346	 * be fixed by unprotecting the shadow page, and it should
7347	 * be reported to userspace.
7348	 */
7349	return !write_fault_to_shadow_pgtable;
7350}
7351
7352static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
7353			      gpa_t cr2_or_gpa,  int emulation_type)
7354{
7355	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7356	unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
7357
7358	last_retry_eip = vcpu->arch.last_retry_eip;
7359	last_retry_addr = vcpu->arch.last_retry_addr;
7360
7361	/*
7362	 * If the emulation is caused by a #PF and the instruction does not
7363	 * write a page table, the VM-EXIT was caused by shadow-page
7364	 * protection; we can zap the shadow page and retry the
7365	 * instruction directly.
7366	 *
7367	 * Note: if the guest uses a non-page-table-modifying instruction
7368	 * on the PDE that points to the instruction, then we will unmap
7369	 * the instruction and go into an infinite loop. So, we cache the
7370	 * last retried eip and the last fault address; if we see that eip
7371	 * and address again, we can break out of the potential infinite
7372	 * loop.
7373	 */
7374	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
7375
7376	if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
7377		return false;
7378
7379	if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7380	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
7381		return false;
7382
7383	if (x86_page_table_writing_insn(ctxt))
7384		return false;
7385
7386	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
7387		return false;
7388
7389	vcpu->arch.last_retry_eip = ctxt->eip;
7390	vcpu->arch.last_retry_addr = cr2_or_gpa;
7391
7392	if (!vcpu->arch.mmu->direct_map)
7393		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
7394
7395	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
7396
7397	return true;
7398}
7399
7400static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
7401static int complete_emulated_pio(struct kvm_vcpu *vcpu);
7402
7403static void kvm_smm_changed(struct kvm_vcpu *vcpu)
7404{
7405	if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
7406		/* This is a good place to trace that we are exiting SMM.  */
7407		trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
7408
7409		/* Process a latched INIT or SMI, if any.  */
7410		kvm_make_request(KVM_REQ_EVENT, vcpu);
7411	}
7412
7413	kvm_mmu_reset_context(vcpu);
7414}
7415
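/*
 * Return the DR6 bits for every breakpoint in @db that is enabled in
 * @dr7, matches @type and is armed at @addr.
 */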
7416static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
7417				unsigned long *db)
7418{
7419	u32 dr6 = 0;
7420	int i;
7421	u32 enable, rwlen;
7422
7423	enable = dr7;
7424	rwlen = dr7 >> 16;
7425	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
7426		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
7427			dr6 |= (1 << i);
7428	return dr6;
7429}
7430
7431static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
7432{
7433	struct kvm_run *kvm_run = vcpu->run;
7434
7435	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
7436		kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
7437		kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
7438		kvm_run->debug.arch.exception = DB_VECTOR;
7439		kvm_run->exit_reason = KVM_EXIT_DEBUG;
7440		return 0;
7441	}
7442	kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
7443	return 1;
7444}
7445
7446int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
7447{
7448	unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
7449	int r;
7450
7451	r = kvm_x86_ops.skip_emulated_instruction(vcpu);
7452	if (unlikely(!r))
7453		return 0;
7454
7455	/*
7456	 * rflags is the old, "raw" value of the flags.  The new value has
7457	 * not been saved yet.
7458	 *
7459	 * This is correct even for TF set by the guest, because "the
7460	 * processor will not generate this exception after the instruction
7461	 * that sets the TF flag".
7462	 */
7463	if (unlikely(rflags & X86_EFLAGS_TF))
7464		r = kvm_vcpu_do_singlestep(vcpu);
7465	return r;
7466}
7467EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
7468
7469static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
7470{
7471	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
7472	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
7473		struct kvm_run *kvm_run = vcpu->run;
7474		unsigned long eip = kvm_get_linear_rip(vcpu);
7475		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
7476					   vcpu->arch.guest_debug_dr7,
7477					   vcpu->arch.eff_db);
7478
7479		if (dr6 != 0) {
7480			kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
7481			kvm_run->debug.arch.pc = eip;
7482			kvm_run->debug.arch.exception = DB_VECTOR;
7483			kvm_run->exit_reason = KVM_EXIT_DEBUG;
7484			*r = 0;
7485			return true;
7486		}
7487	}
7488
7489	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
7490	    !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
7491		unsigned long eip = kvm_get_linear_rip(vcpu);
7492		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
7493					   vcpu->arch.dr7,
7494					   vcpu->arch.db);
7495
7496		if (dr6 != 0) {
7497			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
7498			*r = 1;
7499			return true;
7500		}
7501	}
7502
7503	return false;
7504}
7505
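/*
 * The VMware backdoor is driven by I/O port accesses (traditionally port
 * 0x5658/0x5659 with a magic value in EAX) issued at any CPL, plus RDPMC.
 * With enable_vmware_backdoor those instructions take a #GP in hardware and
 * the vendor code re-enters the emulator with EMULTYPE_VMWARE_GP; anything
 * that is not one of the opcodes below keeps its #GP (see the check in
 * x86_emulate_instruction()).
 */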
7506static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
7507{
7508	switch (ctxt->opcode_len) {
7509	case 1:
7510		switch (ctxt->b) {
7511		case 0xe4:	/* IN */
7512		case 0xe5:
7513		case 0xec:
7514		case 0xed:
7515		case 0xe6:	/* OUT */
7516		case 0xe7:
7517		case 0xee:
7518		case 0xef:
7519		case 0x6c:	/* INS */
7520		case 0x6d:
7521		case 0x6e:	/* OUTS */
7522		case 0x6f:
7523			return true;
7524		}
7525		break;
7526	case 2:
7527		switch (ctxt->b) {
7528		case 0x33:	/* RDPMC */
7529			return true;
7530		}
7531		break;
7532	}
7533
7534	return false;
7535}
7536
7537/*
7538 * Decode an instruction for emulation.  The caller is responsible for handling
7539 * code breakpoints.  Note, manually detecting code breakpoints is unnecessary
7540 * (and wrong) when emulating on an intercepted fault-like exception[*], as
7541 * code breakpoints have higher priority and thus have already been done by
7542 * hardware.
7543 *
7544 * [*] Except #MC, which is higher priority, but KVM should never emulate in
7545 *     response to a machine check.
7546 */
7547int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
7548				    void *insn, int insn_len)
7549{
7550	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7551	int r;
7552
7553	init_emulate_ctxt(vcpu);
7554
7555	ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
7556
7557	r = x86_decode_insn(ctxt, insn, insn_len);
7558
7559	trace_kvm_emulate_insn_start(vcpu);
7560	++vcpu->stat.insn_emulation;
7561
7562	return r;
7563}
7564EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
7565
7566int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
7567			    int emulation_type, void *insn, int insn_len)
7568{
7569	int r;
7570	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7571	bool writeback = true;
7572	bool write_fault_to_spt;
7573
7574	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
7575		return 1;
7576
7577	vcpu->arch.l1tf_flush_l1d = true;
7578
7579	/*
7580	 * Clear write_fault_to_shadow_pgtable here to ensure it is
7581	 * never reused.
7582	 */
7583	write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
7584	vcpu->arch.write_fault_to_shadow_pgtable = false;
7585
7586	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
7587		kvm_clear_exception_queue(vcpu);
7588
7589		/*
7590		 * Return immediately if RIP hits a code breakpoint; such #DBs
7591		 * are fault-like and are higher priority than any faults on
7592		 * the code fetch itself.
7593		 */
7594		if (!(emulation_type & EMULTYPE_SKIP) &&
7595		    kvm_vcpu_check_code_breakpoint(vcpu, &r))
7596			return r;
7597
7598		r = x86_decode_emulated_instruction(vcpu, emulation_type,
7599						    insn, insn_len);
7600		if (r != EMULATION_OK)  {
7601			if ((emulation_type & EMULTYPE_TRAP_UD) ||
7602			    (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
7603				kvm_queue_exception(vcpu, UD_VECTOR);
7604				return 1;
7605			}
7606			if (reexecute_instruction(vcpu, cr2_or_gpa,
7607						  write_fault_to_spt,
7608						  emulation_type))
7609				return 1;
7610
7611			if (ctxt->have_exception &&
7612			    !(emulation_type & EMULTYPE_SKIP)) {
7613				/*
7614				 * #UD should result in just EMULATION_FAILED, and trap-like
7615				 * exception should not be encountered during decode.
7616				 */
7617				WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
7618					     exception_type(ctxt->exception.vector) == EXCPT_TRAP);
7619				inject_emulated_exception(vcpu);
7620				return 1;
7621			}
7622			return handle_emulation_failure(vcpu, emulation_type);
7623		}
7624	}
7625
7626	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
7627	    !is_vmware_backdoor_opcode(ctxt)) {
7628		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7629		return 1;
7630	}
7631
7632	/*
7633	 * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
7634	 * for kvm_skip_emulated_instruction().  The caller is responsible for
7635	 * updating interruptibility state and injecting single-step #DBs.
7636	 */
7637	if (emulation_type & EMULTYPE_SKIP) {
7638		kvm_rip_write(vcpu, ctxt->_eip);
7639		if (ctxt->eflags & X86_EFLAGS_RF)
7640			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
7641		return 1;
7642	}
7643
7644	if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
7645		return 1;
7646
7647	/* this is needed for the VMware backdoor interface to work since it
7648	   changes register values during the I/O operation */
7649	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
7650		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
7651		emulator_invalidate_register_cache(ctxt);
7652	}
7653
7654restart:
7655	if (emulation_type & EMULTYPE_PF) {
7656		/* Save the faulting GPA (cr2) in the address field */
7657		ctxt->exception.address = cr2_or_gpa;
7658
7659		/* With shadow page tables, cr2 contains a GVA or nGPA. */
7660		if (vcpu->arch.mmu->direct_map) {
7661			ctxt->gpa_available = true;
7662			ctxt->gpa_val = cr2_or_gpa;
7663		}
7664	} else {
7665		/* Sanitize the address out of an abundance of paranoia. */
7666		ctxt->exception.address = 0;
7667	}
7668
7669	r = x86_emulate_insn(ctxt);
7670
7671	if (r == EMULATION_INTERCEPTED)
7672		return 1;
7673
7674	if (r == EMULATION_FAILED) {
7675		if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
7676					emulation_type))
7677			return 1;
7678
7679		return handle_emulation_failure(vcpu, emulation_type);
7680	}
7681
7682	if (ctxt->have_exception) {
7683		r = 1;
7684		if (inject_emulated_exception(vcpu))
7685			return r;
7686	} else if (vcpu->arch.pio.count) {
7687		if (!vcpu->arch.pio.in) {
7688			/* FIXME: return into emulator if single-stepping.  */
7689			vcpu->arch.pio.count = 0;
7690		} else {
7691			writeback = false;
7692			vcpu->arch.complete_userspace_io = complete_emulated_pio;
7693		}
7694		r = 0;
7695	} else if (vcpu->mmio_needed) {
7696		++vcpu->stat.mmio_exits;
7697
7698		if (!vcpu->mmio_is_write)
7699			writeback = false;
7700		r = 0;
7701		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
7702	} else if (r == EMULATION_RESTART)
7703		goto restart;
7704	else
7705		r = 1;
7706
7707	if (writeback) {
7708		unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
7709		toggle_interruptibility(vcpu, ctxt->interruptibility);
7710		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
7711
7712		/*
7713		 * Note, EXCPT_DB is assumed to be fault-like as the emulator
7714		 * only supports code breakpoints and general detect #DB, both
7715		 * of which are fault-like.
7716		 */
7717		if (!ctxt->have_exception ||
7718		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
7719			kvm_rip_write(vcpu, ctxt->eip);
7720			if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
7721				r = kvm_vcpu_do_singlestep(vcpu);
7722			if (kvm_x86_ops.update_emulated_instruction)
7723				kvm_x86_ops.update_emulated_instruction(vcpu);
7724			__kvm_set_rflags(vcpu, ctxt->eflags);
7725		}
7726
7727		/*
7728		 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
7729		 * do nothing, and it will be requested again as soon as
7730		 * the shadow expires.  But we still need to check here,
7731		 * because POPF has no interrupt shadow.
7732		 */
7733		if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
7734			kvm_make_request(KVM_REQ_EVENT, vcpu);
7735	} else
7736		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
7737
7738	return r;
7739}
7740
7741int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
7742{
7743	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
7744}
7745EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
7746
7747int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
7748					void *insn, int insn_len)
7749{
7750	return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
7751}
7752EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
7753
7754static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
7755{
7756	vcpu->arch.pio.count = 0;
7757	return 1;
7758}
7759
7760static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
7761{
7762	vcpu->arch.pio.count = 0;
7763
7764	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
7765		return 1;
7766
7767	return kvm_skip_emulated_instruction(vcpu);
7768}
7769
7770static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
7771			    unsigned short port)
7772{
7773	unsigned long val = kvm_rax_read(vcpu);
7774	int ret = emulator_pio_out(vcpu, size, port, &val, 1);
7775
7776	if (ret)
7777		return ret;
7778
7779	/*
7780	 * Workaround userspace that relies on old KVM behavior of %rip being
7781	 * incremented prior to exiting to userspace to handle "OUT 0x7e".
7782	 */
7783	if (port == 0x7e &&
7784	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
7785		vcpu->arch.complete_userspace_io =
7786			complete_fast_pio_out_port_0x7e;
7787		kvm_skip_emulated_instruction(vcpu);
7788	} else {
7789		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
7790		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
7791	}
7792	return 0;
7793}
7794
7795static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
7796{
7797	unsigned long val;
7798
7799	/* We should only ever be called with arch.pio.count equal to 1 */
7800	BUG_ON(vcpu->arch.pio.count != 1);
7801
7802	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
7803		vcpu->arch.pio.count = 0;
7804		return 1;
7805	}
7806
7807	/* For size less than 4 we merge, else we zero extend */
7808	val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
7809
7810	/*
7811	 * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
7812	 * the copy and tracing
7813	 */
7814	emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
7815	kvm_rax_write(vcpu, val);
7816
7817	return kvm_skip_emulated_instruction(vcpu);
7818}
7819
7820static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
7821			   unsigned short port)
7822{
7823	unsigned long val;
7824	int ret;
7825
7826	/* For size less than 4 we merge, else we zero extend */
7827	val = (size < 4) ? kvm_rax_read(vcpu) : 0;
7828
7829	ret = emulator_pio_in(vcpu, size, port, &val, 1);
7830	if (ret) {
7831		kvm_rax_write(vcpu, val);
7832		return ret;
7833	}
7834
7835	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
7836	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
7837
7838	return 0;
7839}
7840
7841int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
7842{
7843	int ret;
7844
7845	if (in)
7846		ret = kvm_fast_pio_in(vcpu, size, port);
7847	else
7848		ret = kvm_fast_pio_out(vcpu, size, port);
7849	return ret && kvm_skip_emulated_instruction(vcpu);
7850}
7851EXPORT_SYMBOL_GPL(kvm_fast_pio);
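
/*
 * When a port access cannot be completed in the kernel, the vCPU exits to
 * userspace with KVM_EXIT_IO and the complete_fast_pio_{in,out} callbacks
 * above finish the job on the next KVM_RUN.  A VMM's exit handler looks
 * roughly like the sketch below (illustrative only; handle_in/handle_out
 * are placeholders and error handling is omitted):
 *
 *	struct kvm_run *run = ...;	// mmap()ed from the vCPU fd
 *
 *	if (run->exit_reason == KVM_EXIT_IO) {
 *		void *data = (char *)run + run->io.data_offset;
 *		size_t len = run->io.size * run->io.count;
 *
 *		if (run->io.direction == KVM_EXIT_IO_OUT)
 *			handle_out(run->io.port, data, len);
 *		else
 *			handle_in(run->io.port, data, len);
 *	}
 *	ioctl(vcpu_fd, KVM_RUN, 0);	// resume the vCPU
 */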
7852
7853static int kvmclock_cpu_down_prep(unsigned int cpu)
7854{
7855	__this_cpu_write(cpu_tsc_khz, 0);
7856	return 0;
7857}
7858
7859static void tsc_khz_changed(void *data)
7860{
7861	struct cpufreq_freqs *freq = data;
7862	unsigned long khz = 0;
7863
7864	if (data)
7865		khz = freq->new;
7866	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
7867		khz = cpufreq_quick_get(raw_smp_processor_id());
7868	if (!khz)
7869		khz = tsc_khz;
7870	__this_cpu_write(cpu_tsc_khz, khz);
7871}
7872
7873#ifdef CONFIG_X86_64
7874static void kvm_hyperv_tsc_notifier(void)
7875{
7876	struct kvm *kvm;
7877	struct kvm_vcpu *vcpu;
7878	int cpu;
7879
7880	mutex_lock(&kvm_lock);
7881	list_for_each_entry(kvm, &vm_list, vm_list)
7882		kvm_make_mclock_inprogress_request(kvm);
7883
7884	hyperv_stop_tsc_emulation();
7885
7886	/* TSC frequency always matches when on Hyper-V */
7887	for_each_present_cpu(cpu)
7888		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
7889	kvm_max_guest_tsc_khz = tsc_khz;
7890
7891	list_for_each_entry(kvm, &vm_list, vm_list) {
7892		struct kvm_arch *ka = &kvm->arch;
7893
7894		spin_lock(&ka->pvclock_gtod_sync_lock);
7895
7896		pvclock_update_vm_gtod_copy(kvm);
7897
7898		kvm_for_each_vcpu(cpu, vcpu, kvm)
7899			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
7900
7901		kvm_for_each_vcpu(cpu, vcpu, kvm)
7902			kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
7903
7904		spin_unlock(&ka->pvclock_gtod_sync_lock);
7905	}
7906	mutex_unlock(&kvm_lock);
7907}
7908#endif
7909
7910static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
7911{
7912	struct kvm *kvm;
7913	struct kvm_vcpu *vcpu;
7914	int i, send_ipi = 0;
7915
7916	/*
7917	 * We allow guests to temporarily run on slowing clocks,
7918	 * provided we notify them after, or to run on accelerating
7919	 * clocks, provided we notify them before.  Thus time never
7920	 * goes backwards.
7921	 *
7922	 * However, we have a problem.  We can't atomically update
7923	 * the frequency of a given CPU from this function; it is
7924	 * merely a notifier, which can be called from any CPU.
7925	 * Changing the TSC frequency at arbitrary points in time
7926	 * requires a recomputation of local variables related to
7927	 * the TSC for each VCPU.  We must flag these local variables
7928	 * to be updated and be sure the update takes place with the
7929	 * new frequency before any guests proceed.
7930	 *
7931	 * Unfortunately, the combination of hotplug CPU and frequency
7932	 * change creates an intractable locking scenario; the order
7933	 * of when these callouts happen is undefined with respect to
7934	 * CPU hotplug, and they can race with each other.  As such,
7935	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
7936	 * undefined; you can actually have a CPU frequency change take
7937	 * place in between the computation of X and the setting of the
7938	 * variable.  To protect against this problem, all updates of
7939	 * the per_cpu tsc_khz variable are done in an interrupt
7940	 * protected IPI, and all callers wishing to update the value
7941	 * must wait for a synchronous IPI to complete (which is trivial
7942	 * if the caller is on the CPU already).  This establishes the
7943	 * necessary total order on variable updates.
7944	 *
7945	 * Note that because a guest time update may take place
7946	 * anytime after the setting of the VCPU's request bit, the
7947	 * correct TSC value must be set before the request.  However,
7948	 * to ensure the update actually makes it to any guest which
7949	 * starts running in hardware virtualization between the set
7950	 * and the acquisition of the spinlock, we must also ping the
7951	 * CPU after setting the request bit.
7952	 *
7953	 */
7954
7955	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
7956
7957	mutex_lock(&kvm_lock);
7958	list_for_each_entry(kvm, &vm_list, vm_list) {
7959		kvm_for_each_vcpu(i, vcpu, kvm) {
7960			if (vcpu->cpu != cpu)
7961				continue;
7962			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
7963			if (vcpu->cpu != raw_smp_processor_id())
7964				send_ipi = 1;
7965		}
7966	}
7967	mutex_unlock(&kvm_lock);
7968
7969	if (freq->old < freq->new && send_ipi) {
7970		/*
7971		 * We upscale the frequency.  We must make sure the guest
7972		 * doesn't see old kvmclock values while running with
7973		 * the new frequency, otherwise we risk the guest seeing
7974		 * time go backwards.
7975		 *
7976		 * In case we update the frequency for another cpu
7977		 * (which might be in guest context) send an interrupt
7978		 * to kick the cpu out of guest context.  Next time
7979		 * guest context is entered kvmclock will be updated,
7980		 * so the guest will not see stale values.
7981		 */
7982		smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
7983	}
7984}
7985
7986static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
7987				     void *data)
7988{
7989	struct cpufreq_freqs *freq = data;
7990	int cpu;
7991
7992	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
7993		return 0;
7994	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
7995		return 0;
7996
7997	for_each_cpu(cpu, freq->policy->cpus)
7998		__kvmclock_cpufreq_notifier(freq, cpu);
7999
8000	return 0;
8001}
8002
8003static struct notifier_block kvmclock_cpufreq_notifier_block = {
8004	.notifier_call  = kvmclock_cpufreq_notifier
8005};
8006
8007static int kvmclock_cpu_online(unsigned int cpu)
8008{
8009	tsc_khz_changed(NULL);
8010	return 0;
8011}
8012
8013static void kvm_timer_init(void)
8014{
8015	max_tsc_khz = tsc_khz;
8016
8017	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
8018#ifdef CONFIG_CPU_FREQ
8019		struct cpufreq_policy *policy;
8020		int cpu;
8021
8022		cpu = get_cpu();
8023		policy = cpufreq_cpu_get(cpu);
8024		if (policy) {
8025			if (policy->cpuinfo.max_freq)
8026				max_tsc_khz = policy->cpuinfo.max_freq;
8027			cpufreq_cpu_put(policy);
8028		}
8029		put_cpu();
8030#endif
8031		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
8032					  CPUFREQ_TRANSITION_NOTIFIER);
8033	}
8034
8035	cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
8036			  kvmclock_cpu_online, kvmclock_cpu_down_prep);
8037}
8038
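/*
 * current_vcpu is set by kvm_before_interrupt() and cleared again by
 * kvm_after_interrupt(), so the perf callbacks below only see a vCPU while
 * an interrupt/NMI taken from guest context (e.g. a PMI) is being handled.
 * That is what allows perf to attribute such samples to a guest RIP and
 * guest CPL instead of to the host.
 */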
8039DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
8040EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
8041
8042int kvm_is_in_guest(void)
8043{
8044	return __this_cpu_read(current_vcpu) != NULL;
8045}
8046
8047static int kvm_is_user_mode(void)
8048{
8049	int user_mode = 3;
8050
8051	if (__this_cpu_read(current_vcpu))
8052		user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
8053
8054	return user_mode != 0;
8055}
8056
8057static unsigned long kvm_get_guest_ip(void)
8058{
8059	unsigned long ip = 0;
8060
8061	if (__this_cpu_read(current_vcpu))
8062		ip = kvm_rip_read(__this_cpu_read(current_vcpu));
8063
8064	return ip;
8065}
8066
8067static void kvm_handle_intel_pt_intr(void)
8068{
8069	struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
8070
8071	kvm_make_request(KVM_REQ_PMI, vcpu);
8072	__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8073			(unsigned long *)&vcpu->arch.pmu.global_status);
8074}
8075
8076static struct perf_guest_info_callbacks kvm_guest_cbs = {
8077	.is_in_guest		= kvm_is_in_guest,
8078	.is_user_mode		= kvm_is_user_mode,
8079	.get_guest_ip		= kvm_get_guest_ip,
8080	.handle_intel_pt_intr	= NULL,
8081};
8082
8083#ifdef CONFIG_X86_64
8084static void pvclock_gtod_update_fn(struct work_struct *work)
8085{
8086	struct kvm *kvm;
8087
8088	struct kvm_vcpu *vcpu;
8089	int i;
8090
8091	mutex_lock(&kvm_lock);
8092	list_for_each_entry(kvm, &vm_list, vm_list)
8093		kvm_for_each_vcpu(i, vcpu, kvm)
8094			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
8095	atomic_set(&kvm_guest_has_master_clock, 0);
8096	mutex_unlock(&kvm_lock);
8097}
8098
8099static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
8100
8101/*
8102 * Indirection to move queue_work() out of the tk_core.seq write held
8103 * region to prevent possible deadlocks against time accessors which
8104 * are invoked with work related locks held.
8105 */
8106static void pvclock_irq_work_fn(struct irq_work *w)
8107{
8108	queue_work(system_long_wq, &pvclock_gtod_work);
8109}
8110
8111static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
8112
8113/*
8114 * Notification about pvclock gtod data update.
8115 */
8116static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
8117			       void *priv)
8118{
8119	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
8120	struct timekeeper *tk = priv;
8121
8122	update_pvclock_gtod(tk);
8123
8124	/*
8125	 * Disable master clock if host does not trust, or does not use,
8126	 * TSC based clocksource. Delegate queue_work() to irq_work as
8127	 * this is invoked with tk_core.seq write held.
8128	 */
8129	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
8130	    atomic_read(&kvm_guest_has_master_clock) != 0)
8131		irq_work_queue(&pvclock_irq_work);
8132	return 0;
8133}
8134
8135static struct notifier_block pvclock_gtod_notifier = {
8136	.notifier_call = pvclock_gtod_notify,
8137};
8138#endif
8139
8140int kvm_arch_init(void *opaque)
8141{
8142	struct kvm_x86_init_ops *ops = opaque;
8143	int r;
8144
8145	if (kvm_x86_ops.hardware_enable) {
8146		printk(KERN_ERR "kvm: already loaded the other module\n");
8147		r = -EEXIST;
8148		goto out;
8149	}
8150
8151	if (!ops->cpu_has_kvm_support()) {
8152		pr_err_ratelimited("kvm: no hardware support\n");
8153		r = -EOPNOTSUPP;
8154		goto out;
8155	}
8156	if (ops->disabled_by_bios()) {
8157		pr_err_ratelimited("kvm: disabled by bios\n");
8158		r = -EOPNOTSUPP;
8159		goto out;
8160	}
8161
8162	/*
8163	 * KVM explicitly assumes that the guest has an FPU and
8164	 * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
8165	 * vCPU's FPU state as a fxregs_state struct.
8166	 */
8167	if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
8168		printk(KERN_ERR "kvm: inadequate fpu\n");
8169		r = -EOPNOTSUPP;
8170		goto out;
8171	}
8172
8173	r = -ENOMEM;
8174	x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
8175					  __alignof__(struct fpu), SLAB_ACCOUNT,
8176					  NULL);
8177	if (!x86_fpu_cache) {
8178		printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
8179		goto out;
8180	}
8181
8182	x86_emulator_cache = kvm_alloc_emulator_cache();
8183	if (!x86_emulator_cache) {
8184		pr_err("kvm: failed to allocate cache for x86 emulator\n");
8185		goto out_free_x86_fpu_cache;
8186	}
8187
8188	user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
8189	if (!user_return_msrs) {
8190		printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
8191		goto out_free_x86_emulator_cache;
8192	}
8193
8194	r = kvm_mmu_vendor_module_init();
8195	if (r)
8196		goto out_free_percpu;
8197
8198	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
8199			PT_DIRTY_MASK, PT64_NX_MASK, 0,
8200			PT_PRESENT_MASK, 0, sme_me_mask);
8201	kvm_timer_init();
8202
8203	if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
8204		kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
8205	perf_register_guest_info_callbacks(&kvm_guest_cbs);
8206
8207	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
8208		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
8209		supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
8210	}
8211
8212	kvm_lapic_init();
8213	if (pi_inject_timer == -1)
8214		pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
8215#ifdef CONFIG_X86_64
8216	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
8217
8218	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
8219		set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
8220#endif
8221
8222	return 0;
8223
8224out_free_percpu:
8225	free_percpu(user_return_msrs);
8226out_free_x86_emulator_cache:
8227	kmem_cache_destroy(x86_emulator_cache);
8228out_free_x86_fpu_cache:
8229	kmem_cache_destroy(x86_fpu_cache);
8230out:
8231	return r;
8232}
8233
8234void kvm_arch_exit(void)
8235{
8236#ifdef CONFIG_X86_64
8237	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
8238		clear_hv_tscchange_cb();
8239#endif
8240	kvm_lapic_exit();
8241	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
8242	kvm_guest_cbs.handle_intel_pt_intr = NULL;
8243
8244	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
8245		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
8246					    CPUFREQ_TRANSITION_NOTIFIER);
8247	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
8248#ifdef CONFIG_X86_64
8249	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
8250	irq_work_sync(&pvclock_irq_work);
8251	cancel_work_sync(&pvclock_gtod_work);
8252#endif
8253	kvm_x86_ops.hardware_enable = NULL;
8254	kvm_mmu_vendor_module_exit();
8255	free_percpu(user_return_msrs);
8256	kmem_cache_destroy(x86_emulator_cache);
8257	kmem_cache_destroy(x86_fpu_cache);
8258}
8259
8260int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
8261{
8262	++vcpu->stat.halt_exits;
8263	if (lapic_in_kernel(vcpu)) {
8264		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
8265		return 1;
8266	} else {
8267		vcpu->run->exit_reason = KVM_EXIT_HLT;
8268		return 0;
8269	}
8270}
8271EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
8272
8273int kvm_emulate_halt(struct kvm_vcpu *vcpu)
8274{
8275	int ret = kvm_skip_emulated_instruction(vcpu);
8276	/*
8277	 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
8278	 * KVM_EXIT_DEBUG here.
8279	 */
8280	return kvm_vcpu_halt(vcpu) && ret;
8281}
8282EXPORT_SYMBOL_GPL(kvm_emulate_halt);
8283
8284#ifdef CONFIG_X86_64
8285static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
8286			        unsigned long clock_type)
8287{
8288	struct kvm_clock_pairing clock_pairing;
8289	struct timespec64 ts;
8290	u64 cycle;
8291	int ret;
8292
8293	if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
8294		return -KVM_EOPNOTSUPP;
8295
8296	if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
8297		return -KVM_EOPNOTSUPP;
8298
8299	clock_pairing.sec = ts.tv_sec;
8300	clock_pairing.nsec = ts.tv_nsec;
8301	clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
8302	clock_pairing.flags = 0;
8303	memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
8304
8305	ret = 0;
8306	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
8307			    sizeof(struct kvm_clock_pairing)))
8308		ret = -KVM_EFAULT;
8309
8310	return ret;
8311}
8312#endif
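
/*
 * The guest side of KVM_HC_CLOCK_PAIRING (e.g. the kvm ptp clock driver)
 * allocates a struct kvm_clock_pairing, passes its GPA as the first
 * argument and KVM_CLOCK_PAIRING_WALLCLOCK as the second, and on success
 * reads back a host wall-clock time and the guest TSC value sampled at the
 * same instant, roughly:
 *
 *	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
 *			     KVM_CLOCK_PAIRING_WALLCLOCK);
 */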
8313
8314/*
8315 * kvm_pv_kick_cpu_op:  Kick a vcpu.
8316 *
8317 * @apicid - apicid of vcpu to be kicked.
8318 */
8319static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
8320{
8321	/*
8322	 * All other fields are unused for APIC_DM_REMRD, but may be consumed by
8323	 * common code, e.g. for tracing. Defer initialization to the compiler.
8324	 */
8325	struct kvm_lapic_irq lapic_irq = {
8326		.delivery_mode = APIC_DM_REMRD,
8327		.dest_mode = APIC_DEST_PHYSICAL,
8328		.shorthand = APIC_DEST_NOSHORT,
8329		.dest_id = apicid,
8330	};
8331
8332	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
8333}
8334
8335bool kvm_apicv_activated(struct kvm *kvm)
8336{
8337	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
8338}
8339EXPORT_SYMBOL_GPL(kvm_apicv_activated);
8340
8341void kvm_apicv_init(struct kvm *kvm, bool enable)
8342{
8343	if (enable)
8344		clear_bit(APICV_INHIBIT_REASON_DISABLE,
8345			  &kvm->arch.apicv_inhibit_reasons);
8346	else
8347		set_bit(APICV_INHIBIT_REASON_DISABLE,
8348			&kvm->arch.apicv_inhibit_reasons);
8349}
8350EXPORT_SYMBOL_GPL(kvm_apicv_init);
8351
8352static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
8353{
8354	struct kvm_vcpu *target = NULL;
8355	struct kvm_apic_map *map;
8356
8357	rcu_read_lock();
8358	map = rcu_dereference(kvm->arch.apic_map);
8359
8360	if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
8361		target = map->phys_map[dest_id]->vcpu;
8362
8363	rcu_read_unlock();
8364
8365	if (target && READ_ONCE(target->ready))
8366		kvm_vcpu_yield_to(target);
8367}
8368
8369int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
8370{
8371	unsigned long nr, a0, a1, a2, a3, ret;
8372	int op_64_bit;
8373
8374	if (kvm_hv_hypercall_enabled(vcpu->kvm))
8375		return kvm_hv_hypercall(vcpu);
8376
8377	nr = kvm_rax_read(vcpu);
8378	a0 = kvm_rbx_read(vcpu);
8379	a1 = kvm_rcx_read(vcpu);
8380	a2 = kvm_rdx_read(vcpu);
8381	a3 = kvm_rsi_read(vcpu);
8382
8383	trace_kvm_hypercall(nr, a0, a1, a2, a3);
8384
8385	op_64_bit = is_64_bit_mode(vcpu);
8386	if (!op_64_bit) {
8387		nr &= 0xFFFFFFFF;
8388		a0 &= 0xFFFFFFFF;
8389		a1 &= 0xFFFFFFFF;
8390		a2 &= 0xFFFFFFFF;
8391		a3 &= 0xFFFFFFFF;
8392	}
8393
8394	if (kvm_x86_ops.get_cpl(vcpu) != 0) {
8395		ret = -KVM_EPERM;
8396		goto out;
8397	}
8398
8399	ret = -KVM_ENOSYS;
8400
8401	switch (nr) {
8402	case KVM_HC_VAPIC_POLL_IRQ:
8403		ret = 0;
8404		break;
8405	case KVM_HC_KICK_CPU:
8406		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
8407			break;
8408
8409		kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
8410		kvm_sched_yield(vcpu->kvm, a1);
8411		ret = 0;
8412		break;
8413#ifdef CONFIG_X86_64
8414	case KVM_HC_CLOCK_PAIRING:
8415		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
8416		break;
8417#endif
8418	case KVM_HC_SEND_IPI:
8419		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
8420			break;
8421
8422		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
8423		break;
8424	case KVM_HC_SCHED_YIELD:
8425		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
8426			break;
8427
8428		kvm_sched_yield(vcpu->kvm, a0);
8429		ret = 0;
8430		break;
8431	default:
8432		ret = -KVM_ENOSYS;
8433		break;
8434	}
8435out:
8436	if (!op_64_bit)
8437		ret = (u32)ret;
8438	kvm_rax_write(vcpu, ret);
8439
8440	++vcpu->stat.hypercalls;
8441	return kvm_skip_emulated_instruction(vcpu);
8442}
8443EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
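
/*
 * For reference, the guest half of this ABI (see
 * Documentation/virt/kvm/hypercalls.rst) passes the hypercall number in RAX
 * and up to four arguments in RBX, RCX, RDX and RSI, with the return value
 * coming back in RAX.  A Linux guest issues e.g. KVM_HC_KICK_CPU roughly as
 * in the sketch below (modeled on the asm/kvm_para.h helpers; "vmcall" on
 * Intel, "vmmcall" on AMD):
 *
 *	static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
 *					  unsigned long p2)
 *	{
 *		long ret;
 *
 *		asm volatile("vmcall"
 *			     : "=a"(ret)
 *			     : "a"(nr), "b"(p1), "c"(p2)
 *			     : "memory");
 *		return ret;
 *	}
 *
 *	kvm_hypercall2(KVM_HC_KICK_CPU, 0, apicid);
 */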
8444
8445static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
8446{
8447	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8448	char instruction[3];
8449	unsigned long rip = kvm_rip_read(vcpu);
8450
8451	kvm_x86_ops.patch_hypercall(vcpu, instruction);
8452
8453	return emulator_write_emulated(ctxt, rip, instruction, 3,
8454		&ctxt->exception);
8455}
8456
8457static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
8458{
8459	return vcpu->run->request_interrupt_window &&
8460		likely(!pic_in_kernel(vcpu->kvm));
8461}
8462
8463static void post_kvm_run_save(struct kvm_vcpu *vcpu)
8464{
8465	struct kvm_run *kvm_run = vcpu->run;
8466
8467	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
8468	kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
8469	kvm_run->cr8 = kvm_get_cr8(vcpu);
8470	kvm_run->apic_base = kvm_get_apic_base(vcpu);
8471	kvm_run->ready_for_interrupt_injection =
8472		pic_in_kernel(vcpu->kvm) ||
8473		kvm_vcpu_ready_for_interrupt_injection(vcpu);
8474}
8475
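/*
 * Without APICv, the vendor code needs to know which TPR values must be
 * intercepted: max_irr >> 4 is the priority class of the highest pending
 * vector and kvm_lapic_get_cr8() returns TPR[7:4], so only TPR writes that
 * could unmask the pending class need to cause an exit.
 */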
8476static void update_cr8_intercept(struct kvm_vcpu *vcpu)
8477{
8478	int max_irr, tpr;
8479
8480	if (!kvm_x86_ops.update_cr8_intercept)
8481		return;
8482
8483	if (!lapic_in_kernel(vcpu))
8484		return;
8485
8486	if (vcpu->arch.apicv_active)
8487		return;
8488
8489	if (!vcpu->arch.apic->vapic_addr)
8490		max_irr = kvm_lapic_find_highest_irr(vcpu);
8491	else
8492		max_irr = -1;
8493
8494	if (max_irr != -1)
8495		max_irr >>= 4;
8496
8497	tpr = kvm_lapic_get_cr8(vcpu);
8498
8499	kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
8500}
8501
8502static void kvm_inject_exception(struct kvm_vcpu *vcpu)
8503{
8504	trace_kvm_inj_exception(vcpu->arch.exception.nr,
8505				vcpu->arch.exception.has_error_code,
8506				vcpu->arch.exception.error_code,
8507				vcpu->arch.exception.injected);
8508
8509	if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
8510		vcpu->arch.exception.error_code = false;
8511	kvm_x86_ops.queue_exception(vcpu);
8512}
8513
8514static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
8515{
8516	int r;
8517	bool can_inject = true;
8518
8519	/* try to reinject previous events if any */
8520
8521	if (vcpu->arch.exception.injected) {
8522		kvm_inject_exception(vcpu);
8523		can_inject = false;
8524	}
8525	/*
8526	 * Do not inject an NMI or interrupt if there is a pending
8527	 * exception.  Exceptions and interrupts are recognized at
8528	 * instruction boundaries, i.e. the start of an instruction.
8529	 * Trap-like exceptions, e.g. #DB, have higher priority than
8530	 * NMIs and interrupts, i.e. traps are recognized before an
8531	 * NMI/interrupt that's pending on the same instruction.
8532	 * Fault-like exceptions, e.g. #GP and #PF, are the lowest
8533	 * priority, but are only generated (pended) during instruction
8534	 * execution, i.e. a pending fault-like exception means the
8535	 * fault occurred on the *previous* instruction and must be
8536	 * serviced prior to recognizing any new events in order to
8537	 * fully complete the previous instruction.
8538	 */
8539	else if (!vcpu->arch.exception.pending) {
8540		if (vcpu->arch.nmi_injected) {
8541			kvm_x86_ops.set_nmi(vcpu);
8542			can_inject = false;
8543		} else if (vcpu->arch.interrupt.injected) {
8544			kvm_x86_ops.set_irq(vcpu);
8545			can_inject = false;
8546		}
8547	}
8548
8549	WARN_ON_ONCE(vcpu->arch.exception.injected &&
8550		     vcpu->arch.exception.pending);
8551
8552	/*
8553	 * Call check_nested_events() even if we reinjected a previous event
8554	 * in order for the caller to determine if it should request an
8555	 * immediate exit due to pending L1 events that require a VM-exit
8556	 * from L2 to L1.
8557	 */
8558	if (is_guest_mode(vcpu)) {
8559		r = kvm_x86_ops.nested_ops->check_events(vcpu);
8560		if (r < 0)
8561			goto busy;
8562	}
8563
8564	/* try to inject new event if pending */
8565	if (vcpu->arch.exception.pending) {
8566		/*
8567		 * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
8568		 * value pushed on the stack.  Trap-like exceptions and all #DBs
8569		 * leave RF as-is (KVM follows Intel's behavior in this regard;
8570		 * AMD states that code breakpoint #DBs explicitly set RF=0).
8571		 *
8572		 * Note, most versions of Intel's SDM and AMD's APM incorrectly
8573		 * describe the behavior of General Detect #DBs, which are
8574		 * fault-like.  They do _not_ set RF, a la code breakpoints.
8575		 */
8576		if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
8577			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
8578					     X86_EFLAGS_RF);
8579
8580		if (vcpu->arch.exception.nr == DB_VECTOR) {
8581			kvm_deliver_exception_payload(vcpu);
8582			if (vcpu->arch.dr7 & DR7_GD) {
8583				vcpu->arch.dr7 &= ~DR7_GD;
8584				kvm_update_dr7(vcpu);
8585			}
8586		}
8587
8588		kvm_inject_exception(vcpu);
8589
8590		vcpu->arch.exception.pending = false;
8591		vcpu->arch.exception.injected = true;
8592
8593		can_inject = false;
8594	}
8595
8596	/*
8597	 * Finally, inject interrupt events.  If an event cannot be injected
8598	 * due to architectural conditions (e.g. IF=0) a window-open exit
8599	 * will re-request KVM_REQ_EVENT.  Sometimes however an event is pending
8600	 * and can architecturally be injected, but we cannot do it right now:
8601	 * an interrupt could have arrived just now and we have to inject it
8602	 * as a vmexit, or there could already be an event in the queue, which is
8603	 * indicated by can_inject.  In that case we request an immediate exit
8604	 * in order to make progress and get back here for another iteration.
8605	 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
8606	 */
8607	if (vcpu->arch.smi_pending) {
8608		r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
8609		if (r < 0)
8610			goto busy;
8611		if (r) {
8612			vcpu->arch.smi_pending = false;
8613			++vcpu->arch.smi_count;
8614			enter_smm(vcpu);
8615			can_inject = false;
8616		} else
8617			kvm_x86_ops.enable_smi_window(vcpu);
8618	}
8619
8620	if (vcpu->arch.nmi_pending) {
8621		r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
8622		if (r < 0)
8623			goto busy;
8624		if (r) {
8625			--vcpu->arch.nmi_pending;
8626			vcpu->arch.nmi_injected = true;
8627			kvm_x86_ops.set_nmi(vcpu);
8628			can_inject = false;
8629			WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
8630		}
8631		if (vcpu->arch.nmi_pending)
8632			kvm_x86_ops.enable_nmi_window(vcpu);
8633	}
8634
8635	if (kvm_cpu_has_injectable_intr(vcpu)) {
8636		r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
8637		if (r < 0)
8638			goto busy;
8639		if (r) {
8640			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
8641			kvm_x86_ops.set_irq(vcpu);
8642			WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
8643		}
8644		if (kvm_cpu_has_injectable_intr(vcpu))
8645			kvm_x86_ops.enable_irq_window(vcpu);
8646	}
8647
8648	if (is_guest_mode(vcpu) &&
8649	    kvm_x86_ops.nested_ops->hv_timer_pending &&
8650	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
8651		*req_immediate_exit = true;
8652
8653	WARN_ON(vcpu->arch.exception.pending);
8654	return;
8655
8656busy:
8657	*req_immediate_exit = true;
8658	return;
8659}
8660
8661static void process_nmi(struct kvm_vcpu *vcpu)
8662{
8663	unsigned limit = 2;
8664
8665	/*
8666	 * x86 is limited to one NMI running, and one NMI pending after it.
8667	 * If an NMI is already in progress, limit further NMIs to just one.
8668	 * Otherwise, allow two (and we'll inject the first one immediately).
8669	 */
8670	if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
8671		limit = 1;
8672
8673	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
8674	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
8675	kvm_make_request(KVM_REQ_EVENT, vcpu);
8676}
8677
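/*
 * Pack a kvm_segment into the 32-bit attribute format used by the SMM
 * state-save area (consumed by enter_smm_save_seg_32/64() below): type in
 * bits 8-11, S in bit 12, DPL in bits 13-14, P in bit 15 and AVL/L/D-B/G in
 * bits 20-23, i.e. where those bits live in the high dword of a segment
 * descriptor.
 */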
8678static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
8679{
8680	u32 flags = 0;
8681	flags |= seg->g       << 23;
8682	flags |= seg->db      << 22;
8683	flags |= seg->l       << 21;
8684	flags |= seg->avl     << 20;
8685	flags |= seg->present << 15;
8686	flags |= seg->dpl     << 13;
8687	flags |= seg->s       << 12;
8688	flags |= seg->type    << 8;
8689	return flags;
8690}
8691
8692static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
8693{
8694	struct kvm_segment seg;
8695	int offset;
8696
8697	kvm_get_segment(vcpu, &seg, n);
8698	put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
8699
8700	if (n < 3)
8701		offset = 0x7f84 + n * 12;
8702	else
8703		offset = 0x7f2c + (n - 3) * 12;
8704
8705	put_smstate(u32, buf, offset + 8, seg.base);
8706	put_smstate(u32, buf, offset + 4, seg.limit);
8707	put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
8708}
8709
8710#ifdef CONFIG_X86_64
8711static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
8712{
8713	struct kvm_segment seg;
8714	int offset;
8715	u16 flags;
8716
8717	kvm_get_segment(vcpu, &seg, n);
8718	offset = 0x7e00 + n * 16;
8719
8720	flags = enter_smm_get_segment_flags(&seg) >> 8;
8721	put_smstate(u16, buf, offset, seg.selector);
8722	put_smstate(u16, buf, offset + 2, flags);
8723	put_smstate(u32, buf, offset + 4, seg.limit);
8724	put_smstate(u64, buf, offset + 8, seg.base);
8725}
8726#endif
8727
8728static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
8729{
8730	struct desc_ptr dt;
8731	struct kvm_segment seg;
8732	unsigned long val;
8733	int i;
8734
8735	put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
8736	put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
8737	put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
8738	put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
8739
8740	for (i = 0; i < 8; i++)
8741		put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
8742
8743	kvm_get_dr(vcpu, 6, &val);
8744	put_smstate(u32, buf, 0x7fcc, (u32)val);
8745	kvm_get_dr(vcpu, 7, &val);
8746	put_smstate(u32, buf, 0x7fc8, (u32)val);
8747
8748	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
8749	put_smstate(u32, buf, 0x7fc4, seg.selector);
8750	put_smstate(u32, buf, 0x7f64, seg.base);
8751	put_smstate(u32, buf, 0x7f60, seg.limit);
8752	put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
8753
8754	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
8755	put_smstate(u32, buf, 0x7fc0, seg.selector);
8756	put_smstate(u32, buf, 0x7f80, seg.base);
8757	put_smstate(u32, buf, 0x7f7c, seg.limit);
8758	put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
8759
8760	kvm_x86_ops.get_gdt(vcpu, &dt);
8761	put_smstate(u32, buf, 0x7f74, dt.address);
8762	put_smstate(u32, buf, 0x7f70, dt.size);
8763
8764	kvm_x86_ops.get_idt(vcpu, &dt);
8765	put_smstate(u32, buf, 0x7f58, dt.address);
8766	put_smstate(u32, buf, 0x7f54, dt.size);
8767
8768	for (i = 0; i < 6; i++)
8769		enter_smm_save_seg_32(vcpu, buf, i);
8770
8771	put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
8772
8773	/* revision id */
8774	put_smstate(u32, buf, 0x7efc, 0x00020000);
8775	put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
8776}
8777
8778#ifdef CONFIG_X86_64
8779static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
8780{
8781	struct desc_ptr dt;
8782	struct kvm_segment seg;
8783	unsigned long val;
8784	int i;
8785
8786	for (i = 0; i < 16; i++)
8787		put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
8788
8789	put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
8790	put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
8791
8792	kvm_get_dr(vcpu, 6, &val);
8793	put_smstate(u64, buf, 0x7f68, val);
8794	kvm_get_dr(vcpu, 7, &val);
8795	put_smstate(u64, buf, 0x7f60, val);
8796
8797	put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
8798	put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
8799	put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
8800
8801	put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
8802
8803	/* revision id */
8804	put_smstate(u32, buf, 0x7efc, 0x00020064);
8805
8806	put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
8807
8808	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
8809	put_smstate(u16, buf, 0x7e90, seg.selector);
8810	put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
8811	put_smstate(u32, buf, 0x7e94, seg.limit);
8812	put_smstate(u64, buf, 0x7e98, seg.base);
8813
8814	kvm_x86_ops.get_idt(vcpu, &dt);
8815	put_smstate(u32, buf, 0x7e84, dt.size);
8816	put_smstate(u64, buf, 0x7e88, dt.address);
8817
8818	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
8819	put_smstate(u16, buf, 0x7e70, seg.selector);
8820	put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
8821	put_smstate(u32, buf, 0x7e74, seg.limit);
8822	put_smstate(u64, buf, 0x7e78, seg.base);
8823
8824	kvm_x86_ops.get_gdt(vcpu, &dt);
8825	put_smstate(u32, buf, 0x7e64, dt.size);
8826	put_smstate(u64, buf, 0x7e68, dt.address);
8827
8828	for (i = 0; i < 6; i++)
8829		enter_smm_save_seg_64(vcpu, buf, i);
8830}
8831#endif
8832
8833static void enter_smm(struct kvm_vcpu *vcpu)
8834{
8835	struct kvm_segment cs, ds;
8836	struct desc_ptr dt;
8837	char buf[512];
8838	u32 cr0;
8839
8840	trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
8841	memset(buf, 0, 512);
8842#ifdef CONFIG_X86_64
8843	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
8844		enter_smm_save_state_64(vcpu, buf);
8845	else
8846#endif
8847		enter_smm_save_state_32(vcpu, buf);
8848
8849	/*
8850	 * Give pre_enter_smm() a chance to make ISA-specific changes to the
8851	 * vCPU state (e.g. leave guest mode) after we've saved the state into
8852	 * the SMM state-save area.
8853	 */
8854	kvm_x86_ops.pre_enter_smm(vcpu, buf);
8855
8856	vcpu->arch.hflags |= HF_SMM_MASK;
8857	kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
8858
8859	if (kvm_x86_ops.get_nmi_mask(vcpu))
8860		vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
8861	else
8862		kvm_x86_ops.set_nmi_mask(vcpu, true);
8863
8864	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
8865	kvm_rip_write(vcpu, 0x8000);
8866
8867	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
8868	kvm_x86_ops.set_cr0(vcpu, cr0);
8869	vcpu->arch.cr0 = cr0;
8870
8871	kvm_x86_ops.set_cr4(vcpu, 0);
8872
8873	/* Undocumented: IDT limit is set to zero on entry to SMM.  */
8874	dt.address = dt.size = 0;
8875	kvm_x86_ops.set_idt(vcpu, &dt);
8876
8877	__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
8878
8879	cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
8880	cs.base = vcpu->arch.smbase;
8881
8882	ds.selector = 0;
8883	ds.base = 0;
8884
8885	cs.limit    = ds.limit = 0xffffffff;
8886	cs.type     = ds.type = 0x3;
8887	cs.dpl      = ds.dpl = 0;
8888	cs.db       = ds.db = 0;
8889	cs.s        = ds.s = 1;
8890	cs.l        = ds.l = 0;
8891	cs.g        = ds.g = 1;
8892	cs.avl      = ds.avl = 0;
8893	cs.present  = ds.present = 1;
8894	cs.unusable = ds.unusable = 0;
8895	cs.padding  = ds.padding = 0;
8896
8897	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
8898	kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
8899	kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
8900	kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
8901	kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
8902	kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
8903
8904#ifdef CONFIG_X86_64
8905	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
8906		kvm_x86_ops.set_efer(vcpu, 0);
8907#endif
8908
8909	kvm_update_cpuid_runtime(vcpu);
8910	kvm_mmu_reset_context(vcpu);
8911}
8912
8913static void process_smi(struct kvm_vcpu *vcpu)
8914{
8915	vcpu->arch.smi_pending = true;
8916	kvm_make_request(KVM_REQ_EVENT, vcpu);
8917}
8918
8919void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
8920				       unsigned long *vcpu_bitmap)
8921{
8922	cpumask_var_t cpus;
8923
8924	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
8925
8926	kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
8927				    NULL, vcpu_bitmap, cpus);
8928
8929	free_cpumask_var(cpus);
8930}
8931
8932void kvm_make_scan_ioapic_request(struct kvm *kvm)
8933{
8934	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
8935}
8936
8937void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
8938{
8939	if (!lapic_in_kernel(vcpu))
8940		return;
8941
8942	vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
8943	kvm_apic_update_apicv(vcpu);
8944	kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
8945}
8946EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
8947
8948/*
8949 * NOTE: Do not hold any lock prior to calling this.
8950 *
8951 * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
8952 * locked, because it calls __x86_set_memory_region() which does
8953 * synchronize_srcu(&kvm->srcu).
8954 */
8955void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
8956{
8957	struct kvm_vcpu *except;
8958	unsigned long old, new, expected;
8959
8960	if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
8961	    !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
8962		return;
8963
8964	old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
8965	do {
8966		expected = new = old;
8967		if (activate)
8968			__clear_bit(bit, &new);
8969		else
8970			__set_bit(bit, &new);
8971		if (new == old)
8972			break;
8973		old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
8974	} while (old != expected);
8975
8976	if (!!old == !!new)
8977		return;
8978
8979	trace_kvm_apicv_update_request(activate, bit);
8980	if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
8981		kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
8982
8983	/*
8984	 * Send a request to update APICv for all other vCPUs, while
8985	 * updating the calling vCPU immediately instead of waiting
8986	 * for another #VMEXIT to handle the request.
8987	 */
8988	except = kvm_get_running_vcpu();
8989	kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
8990					 except);
8991	if (except)
8992		kvm_vcpu_update_apicv(except);
8993}
8994EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
8995
8996static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
8997{
8998	if (!kvm_apic_present(vcpu))
8999		return;
9000
9001	bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
9002
9003	if (irqchip_split(vcpu->kvm))
9004		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
9005	else {
9006		if (vcpu->arch.apicv_active)
9007			kvm_x86_ops.sync_pir_to_irr(vcpu);
9008		if (ioapic_in_kernel(vcpu->kvm))
9009			kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
9010	}
9011
9012	if (is_guest_mode(vcpu))
9013		vcpu->arch.load_eoi_exitmap_pending = true;
9014	else
9015		kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
9016}
9017
9018static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
9019{
9020	u64 eoi_exit_bitmap[4];
9021
9022	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
9023		return;
9024
9025	bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
9026		  vcpu_to_synic(vcpu)->vec_bitmap, 256);
9027	kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
9028}
9029
9030void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
9031					    unsigned long start, unsigned long end)
9032{
9033	unsigned long apic_address;
9034
9035	/*
9036	 * The physical address of the APIC access page is stored in the VMCS.
9037	 * Update it when it becomes invalid.
9038	 */
9039	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
9040	if (start <= apic_address && apic_address < end)
9041		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
9042}
9043
9044void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
9045{
9046	if (kvm_x86_ops.guest_memory_reclaimed)
9047		kvm_x86_ops.guest_memory_reclaimed(kvm);
9048}
9049
9050void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
9051{
9052	if (!lapic_in_kernel(vcpu))
9053		return;
9054
9055	if (!kvm_x86_ops.set_apic_access_page_addr)
9056		return;
9057
9058	kvm_x86_ops.set_apic_access_page_addr(vcpu);
9059}
9060
9061void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
9062{
9063	smp_send_reschedule(vcpu->cpu);
9064}
9065EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
9066
9067/*
9068 * Returns 1 to let vcpu_run() continue the guest execution loop without
9069 * exiting to the userspace.  Otherwise, the value will be returned to the
9070 * userspace.
9071 */
9072static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
9073{
9074	int r;
9075	bool req_int_win =
9076		dm_request_for_irq_injection(vcpu) &&
9077		kvm_cpu_accept_dm_intr(vcpu);
9078	fastpath_t exit_fastpath;
9079
9080	bool req_immediate_exit = false;
9081
9082	if (kvm_request_pending(vcpu)) {
9083		if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
9084			if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
9085				r = 0;
9086				goto out;
9087			}
9088		}
9089		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
9090			kvm_mmu_unload(vcpu);
9091		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
9092			__kvm_migrate_timers(vcpu);
9093		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
9094			kvm_gen_update_masterclock(vcpu->kvm);
9095		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
9096			kvm_gen_kvmclock_update(vcpu);
9097		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
9098			r = kvm_guest_time_update(vcpu);
9099			if (unlikely(r))
9100				goto out;
9101		}
9102		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
9103			kvm_mmu_sync_roots(vcpu);
9104		if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
9105			kvm_mmu_load_pgd(vcpu);
9106		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
9107			kvm_vcpu_flush_tlb_all(vcpu);
9108
9109			/* Flushing all ASIDs flushes the current ASID... */
9110			kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9111		}
9112		if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
9113			kvm_vcpu_flush_tlb_current(vcpu);
9114		if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
9115			kvm_vcpu_flush_tlb_guest(vcpu);
9116
9117		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
9118			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
9119			r = 0;
9120			goto out;
9121		}
9122		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
9123			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
9124			vcpu->mmio_needed = 0;
9125			r = 0;
9126			goto out;
9127		}
9128		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
9129			/* Page is swapped out. Do synthetic halt */
9130			vcpu->arch.apf.halted = true;
9131			r = 1;
9132			goto out;
9133		}
9134		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
9135			record_steal_time(vcpu);
9136		if (kvm_check_request(KVM_REQ_SMI, vcpu))
9137			process_smi(vcpu);
9138		if (kvm_check_request(KVM_REQ_NMI, vcpu))
9139			process_nmi(vcpu);
9140		if (kvm_check_request(KVM_REQ_PMU, vcpu))
9141			kvm_pmu_handle_event(vcpu);
9142		if (kvm_check_request(KVM_REQ_PMI, vcpu))
9143			kvm_pmu_deliver_pmi(vcpu);
9144		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
9145			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
9146			if (test_bit(vcpu->arch.pending_ioapic_eoi,
9147				     vcpu->arch.ioapic_handled_vectors)) {
9148				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
9149				vcpu->run->eoi.vector =
9150						vcpu->arch.pending_ioapic_eoi;
9151				r = 0;
9152				goto out;
9153			}
9154		}
9155		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
9156			vcpu_scan_ioapic(vcpu);
9157		if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
9158			vcpu_load_eoi_exitmap(vcpu);
9159		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
9160			kvm_vcpu_reload_apic_access_page(vcpu);
9161		if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
9162			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
9163			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
9164			r = 0;
9165			goto out;
9166		}
9167		if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
9168			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
9169			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
9170			r = 0;
9171			goto out;
9172		}
9173		if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
9174			vcpu->run->exit_reason = KVM_EXIT_HYPERV;
9175			vcpu->run->hyperv = vcpu->arch.hyperv.exit;
9176			r = 0;
9177			goto out;
9178		}
9179
9180		/*
9181		 * KVM_REQ_HV_STIMER has to be processed after
9182		 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
9183		 * depend on the guest clock being up-to-date
9184		 */
9185		if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
9186			kvm_hv_process_stimers(vcpu);
9187		if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
9188			kvm_vcpu_update_apicv(vcpu);
9189		if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
9190			kvm_check_async_pf_completion(vcpu);
9191		if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
9192			kvm_x86_ops.msr_filter_changed(vcpu);
9193	}
9194
9195	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
9196		++vcpu->stat.req_event;
9197		kvm_apic_accept_events(vcpu);
9198		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
9199			r = 1;
9200			goto out;
9201		}
9202
9203		inject_pending_event(vcpu, &req_immediate_exit);
9204		if (req_int_win)
9205			kvm_x86_ops.enable_irq_window(vcpu);
9206
9207		if (kvm_lapic_enabled(vcpu)) {
9208			update_cr8_intercept(vcpu);
9209			kvm_lapic_sync_to_vapic(vcpu);
9210		}
9211	}
9212
9213	r = kvm_mmu_reload(vcpu);
9214	if (unlikely(r)) {
9215		goto cancel_injection;
9216	}
9217
9218	preempt_disable();
9219
9220	kvm_x86_ops.prepare_guest_switch(vcpu);
9221
9222	/*
9223	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
9224	 * IPIs are then delayed until after guest entry, which ensures that
9225	 * they result in virtual interrupt delivery.
9226	 */
9227	local_irq_disable();
9228	vcpu->mode = IN_GUEST_MODE;
9229
9230	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
9231
9232	/*
9233	 * 1) We should set ->mode before checking ->requests.  Please see
9234	 * the comment in kvm_vcpu_exiting_guest_mode().
9235	 *
9236	 * 2) For APICv, we should set ->mode before checking PID.ON. This
9237	 * pairs with the memory barrier implicit in pi_test_and_set_on
9238	 * (see vmx_deliver_posted_interrupt).
9239	 *
9240	 * 3) This also orders the write to mode from any reads to the page
9241	 * tables done while the VCPU is running.  Please see the comment
9242	 * in kvm_flush_remote_tlbs.
9243	 */
9244	smp_mb__after_srcu_read_unlock();
9245
9246	/*
9247	 * This handles the case where a posted interrupt was
9248	 * notified with kvm_vcpu_kick.
9249	 */
9250	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
9251		kvm_x86_ops.sync_pir_to_irr(vcpu);
9252
9253	if (kvm_vcpu_exit_request(vcpu)) {
9254		vcpu->mode = OUTSIDE_GUEST_MODE;
9255		smp_wmb();
9256		local_irq_enable();
9257		preempt_enable();
9258		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
9259		r = 1;
9260		goto cancel_injection;
9261	}
9262
9263	if (req_immediate_exit) {
9264		kvm_make_request(KVM_REQ_EVENT, vcpu);
9265		kvm_x86_ops.request_immediate_exit(vcpu);
9266	}
9267
9268	trace_kvm_entry(vcpu);
9269
9270	fpregs_assert_state_consistent();
9271	if (test_thread_flag(TIF_NEED_FPU_LOAD))
9272		switch_fpu_return();
9273
9274	if (unlikely(vcpu->arch.switch_db_regs)) {
9275		set_debugreg(0, 7);
9276		set_debugreg(vcpu->arch.eff_db[0], 0);
9277		set_debugreg(vcpu->arch.eff_db[1], 1);
9278		set_debugreg(vcpu->arch.eff_db[2], 2);
9279		set_debugreg(vcpu->arch.eff_db[3], 3);
9280		set_debugreg(vcpu->arch.dr6, 6);
9281		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
9282	} else if (unlikely(hw_breakpoint_active())) {
9283		set_debugreg(0, 7);
9284	}
9285
9286	exit_fastpath = kvm_x86_ops.run(vcpu);
9287
9288	/*
9289	 * Do this here before restoring debug registers on the host.  And
9290	 * since we do this before handling the vmexit, a DR access vmexit
9291	 * can (a) read the correct value of the debug registers, (b) set
9292	 * KVM_DEBUGREG_WONT_EXIT again.
9293	 */
9294	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
9295		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
9296		kvm_x86_ops.sync_dirty_debug_regs(vcpu);
9297		kvm_update_dr0123(vcpu);
9298		kvm_update_dr7(vcpu);
9299		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
9300	}
9301
9302	/*
9303	 * If the guest has used debug registers, at least dr7
9304	 * will be disabled while returning to the host.
9305	 * If we don't have active breakpoints in the host, we don't
9306	 * care about the messed up debug address registers. But if
9307	 * we have some of them active, restore the old state.
9308	 */
9309	if (hw_breakpoint_active())
9310		hw_breakpoint_restore();
9311
9312	vcpu->arch.last_vmentry_cpu = vcpu->cpu;
9313	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
9314
9315	vcpu->mode = OUTSIDE_GUEST_MODE;
9316	smp_wmb();
9317
9318	kvm_x86_ops.handle_exit_irqoff(vcpu);
9319
9320	/*
9321	 * Consume any pending interrupts, including the possible source of
9322	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
9323	 * An instruction is required after local_irq_enable() to fully unblock
9324	 * interrupts on processors that implement an interrupt shadow, the
9325	 * interrupts on processors that implement an interrupt shadow; the
9326	 */
9327	kvm_before_interrupt(vcpu);
9328	local_irq_enable();
9329	++vcpu->stat.exits;
9330	local_irq_disable();
9331	kvm_after_interrupt(vcpu);
9332
9333	/*
9334	 * Wait until after servicing IRQs to account guest time so that any
9335	 * ticks that occurred while running the guest are properly accounted
9336	 * to the guest.  Waiting until IRQs are enabled degrades the accuracy
9337	 * of accounting via context tracking, but the loss of accuracy is
9338	 * acceptable for all known use cases.
9339	 */
9340	vtime_account_guest_exit();
9341
9342	if (lapic_in_kernel(vcpu)) {
9343		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
9344		if (delta != S64_MIN) {
9345			trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
9346			vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
9347		}
9348	}
9349
9350	local_irq_enable();
9351	preempt_enable();
9352
9353	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
9354
9355	/*
9356	 * Profile KVM exit RIPs:
9357	 */
9358	if (unlikely(prof_on == KVM_PROFILING)) {
9359		unsigned long rip = kvm_rip_read(vcpu);
9360		profile_hit(KVM_PROFILING, (void *)rip);
9361	}
9362
9363	if (unlikely(vcpu->arch.tsc_always_catchup))
9364		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
9365
9366	if (vcpu->arch.apic_attention)
9367		kvm_lapic_sync_from_vapic(vcpu);
9368
9369	r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
9370	return r;
9371
9372cancel_injection:
9373	if (req_immediate_exit)
9374		kvm_make_request(KVM_REQ_EVENT, vcpu);
9375	kvm_x86_ops.cancel_injection(vcpu);
9376	if (unlikely(vcpu->arch.apic_attention))
9377		kvm_lapic_sync_from_vapic(vcpu);
9378out:
9379	return r;
9380}
9381
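/*
 * Park a vCPU that is not currently runnable.  The SRCU read lock is
 * dropped across kvm_vcpu_block() and the vendor pre_block/post_block
 * hooks bracket the wait.  Returns 1 to keep the vcpu_run() loop going,
 * or -EINTR if the vCPU wakes up in an unexpected mp_state.
 */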
9382static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
9383{
9384	if (!kvm_arch_vcpu_runnable(vcpu) &&
9385	    (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
9386		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
9387		kvm_vcpu_block(vcpu);
9388		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
9389
9390		if (kvm_x86_ops.post_block)
9391			kvm_x86_ops.post_block(vcpu);
9392
9393		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
9394			return 1;
9395	}
9396
9397	kvm_apic_accept_events(vcpu);
9398	switch(vcpu->arch.mp_state) {
9399	case KVM_MP_STATE_HALTED:
9400		vcpu->arch.pv.pv_unhalted = false;
9401		vcpu->arch.mp_state =
9402			KVM_MP_STATE_RUNNABLE;
9403		fallthrough;
9404	case KVM_MP_STATE_RUNNABLE:
9405		vcpu->arch.apf.halted = false;
9406		break;
9407	case KVM_MP_STATE_INIT_RECEIVED:
9408		break;
9409	default:
9410		return -EINTR;
9411	}
9412	return 1;
9413}
9414
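/*
 * A vCPU counts as running when it is RUNNABLE and not halted on an
 * async page fault.  For nested guests, give the vendor code a chance
 * to process pending events first.
 */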
9415static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
9416{
9417	if (is_guest_mode(vcpu))
9418		kvm_x86_ops.nested_ops->check_events(vcpu);
9419
9420	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
9421		!vcpu->arch.apf.halted);
9422}
9423
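/*
 * The main run loop: enter the guest while the vCPU is runnable,
 * otherwise block.  Each iteration also injects pending timer
 * interrupts, exits to userspace when it asked for an interrupt window,
 * and handles deferred xfer-to-guest-mode work (e.g. pending signals or
 * a needed reschedule).
 */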
9424static int vcpu_run(struct kvm_vcpu *vcpu)
9425{
9426	int r;
9427	struct kvm *kvm = vcpu->kvm;
9428
9429	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
9430	vcpu->arch.l1tf_flush_l1d = true;
9431
9432	for (;;) {
9433		/*
9434		 * If another guest vCPU requests a PV TLB flush in the middle
9435		 * of instruction emulation, the rest of the emulation could
9436		 * use a stale page translation. Assume that any code after
9437		 * this point can start executing an instruction.
9438		 */
9439		vcpu->arch.at_instruction_boundary = false;
9440		if (kvm_vcpu_running(vcpu)) {
9441			r = vcpu_enter_guest(vcpu);
9442		} else {
9443			r = vcpu_block(kvm, vcpu);
9444		}
9445
9446		if (r <= 0)
9447			break;
9448
9449		kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
9450		if (kvm_cpu_has_pending_timer(vcpu))
9451			kvm_inject_pending_timer_irqs(vcpu);
9452
9453		if (dm_request_for_irq_injection(vcpu) &&
9454			kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
9455			r = 0;
9456			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
9457			++vcpu->stat.request_irq_exits;
9458			break;
9459		}
9460
9461		if (__xfer_to_guest_mode_work_pending()) {
9462			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
9463			r = xfer_to_guest_mode_handle_work(vcpu);
9464			if (r)
9465				return r;
9466			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
9467		}
9468	}
9469
9470	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
9471
9472	return r;
9473}
9474
9475static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
9476{
9477	int r;
9478
9479	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
9480	r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
9481	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
9482	return r;
9483}
9484
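/*
 * Completion callback run after userspace has serviced a PIO exit:
 * re-enter the emulator without re-decoding the instruction.
 */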
9485static int complete_emulated_pio(struct kvm_vcpu *vcpu)
9486{
9487	BUG_ON(!vcpu->arch.pio.count);
9488
9489	return complete_emulated_io(vcpu);
9490}
9491
9492/*
9493 * Implements the following, as a state machine:
9494 *
9495 * read:
9496 *   for each fragment
9497 *     for each mmio piece in the fragment
9498 *       write gpa, len
9499 *       exit
9500 *       copy data
9501 *   execute insn
9502 *
9503 * write:
9504 *   for each fragment
9505 *     for each mmio piece in the fragment
9506 *       write gpa, len
9507 *       copy data
9508 *       exit
9509 */
9510static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
9511{
9512	struct kvm_run *run = vcpu->run;
9513	struct kvm_mmio_fragment *frag;
9514	unsigned len;
9515
9516	BUG_ON(!vcpu->mmio_needed);
9517
9518	/* Complete previous fragment */
9519	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
9520	len = min(8u, frag->len);
9521	if (!vcpu->mmio_is_write)
9522		memcpy(frag->data, run->mmio.data, len);
9523
9524	if (frag->len <= 8) {
9525		/* Switch to the next fragment. */
9526		frag++;
9527		vcpu->mmio_cur_fragment++;
9528	} else {
9529		/* Go forward to the next mmio piece. */
9530		frag->data += len;
9531		frag->gpa += len;
9532		frag->len -= len;
9533	}
9534
9535	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
9536		vcpu->mmio_needed = 0;
9537
9538		/* FIXME: return into emulator if single-stepping.  */
9539		if (vcpu->mmio_is_write)
9540			return 1;
9541		vcpu->mmio_read_completed = 1;
9542		return complete_emulated_io(vcpu);
9543	}
9544
9545	run->exit_reason = KVM_EXIT_MMIO;
9546	run->mmio.phys_addr = frag->gpa;
9547	if (vcpu->mmio_is_write)
9548		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
9549	run->mmio.len = min(8u, frag->len);
9550	run->mmio.is_write = vcpu->mmio_is_write;
9551	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
9552	return 0;
9553}
9554
9555static void kvm_save_current_fpu(struct fpu *fpu)
9556{
9557	/*
9558	 * If the target FPU state is not resident in the CPU registers, just
9559	 * memcpy() from current, else save CPU state directly to the target.
9560	 */
9561	if (test_thread_flag(TIF_NEED_FPU_LOAD))
9562		memcpy(&fpu->state, &current->thread.fpu.state,
9563		       fpu_kernel_xstate_size);
9564	else
9565		copy_fpregs_to_fpstate(fpu);
9566}
9567
9568/* Swap (qemu) user FPU context for the guest FPU context. */
9569static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
9570{
9571	fpregs_lock();
9572
9573	kvm_save_current_fpu(vcpu->arch.user_fpu);
9574
9575	/* PKRU is separately restored in kvm_x86_ops.run.  */
9576	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
9577				~XFEATURE_MASK_PKRU);
9578
9579	fpregs_mark_activate();
9580	fpregs_unlock();
9581
9582	trace_kvm_fpu(1);
9583}
9584
9585/* When vcpu_run ends, restore user space FPU context. */
9586static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
9587{
9588	fpregs_lock();
9589
9590	kvm_save_current_fpu(vcpu->arch.guest_fpu);
9591
9592	copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
9593
9594	fpregs_mark_activate();
9595	fpregs_unlock();
9596
9597	++vcpu->stat.fpu_reload;
9598	trace_kvm_fpu(0);
9599}
9600
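/*
 * Top-level handler for the KVM_RUN ioctl: load the guest FPU and
 * sigset, wait for INIT/SIPI if the vCPU is still uninitialized, sync
 * any dirty registers from the shared run structure, complete deferred
 * userspace I/O, and finally enter vcpu_run().
 */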
9601int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
9602{
9603	struct kvm_run *kvm_run = vcpu->run;
9604	int r;
9605
9606	vcpu_load(vcpu);
9607	kvm_sigset_activate(vcpu);
9608	kvm_load_guest_fpu(vcpu);
9609
9610	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
9611		if (kvm_run->immediate_exit) {
9612			r = -EINTR;
9613			goto out;
9614		}
9615		kvm_vcpu_block(vcpu);
9616		kvm_apic_accept_events(vcpu);
9617		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
9618		r = -EAGAIN;
9619		if (signal_pending(current)) {
9620			r = -EINTR;
9621			kvm_run->exit_reason = KVM_EXIT_INTR;
9622			++vcpu->stat.signal_exits;
9623		}
9624		goto out;
9625	}
9626
9627	if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
9628		r = -EINVAL;
9629		goto out;
9630	}
9631
9632	if (kvm_run->kvm_dirty_regs) {
9633		r = sync_regs(vcpu);
9634		if (r != 0)
9635			goto out;
9636	}
9637
9638	/* re-sync apic's tpr */
9639	if (!lapic_in_kernel(vcpu)) {
9640		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
9641			r = -EINVAL;
9642			goto out;
9643		}
9644	}
9645
9646	if (unlikely(vcpu->arch.complete_userspace_io)) {
9647		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
9648		vcpu->arch.complete_userspace_io = NULL;
9649		r = cui(vcpu);
9650		if (r <= 0)
9651			goto out;
9652	} else
9653		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
9654
9655	if (kvm_run->immediate_exit)
9656		r = -EINTR;
9657	else
9658		r = vcpu_run(vcpu);
9659
9660out:
9661	kvm_put_guest_fpu(vcpu);
9662	if (kvm_run->kvm_valid_regs)
9663		store_regs(vcpu);
9664	post_kvm_run_save(vcpu);
9665	kvm_sigset_deactivate(vcpu);
9666
9667	vcpu_put(vcpu);
9668	return r;
9669}
9670
9671static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
9672{
9673	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
9674		/*
9675		 * We are here if userspace calls get_regs() in the middle of
9676		 * instruction emulation.  Register state needs to be copied
9677		 * back from the emulation context to the vcpu.  Userspace shouldn't
9678		 * normally do that, but some badly designed PV devices (the vmware
9679		 * backdoor interface) need this to work.
9680		 */
9681		emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
9682		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9683	}
9684	regs->rax = kvm_rax_read(vcpu);
9685	regs->rbx = kvm_rbx_read(vcpu);
9686	regs->rcx = kvm_rcx_read(vcpu);
9687	regs->rdx = kvm_rdx_read(vcpu);
9688	regs->rsi = kvm_rsi_read(vcpu);
9689	regs->rdi = kvm_rdi_read(vcpu);
9690	regs->rsp = kvm_rsp_read(vcpu);
9691	regs->rbp = kvm_rbp_read(vcpu);
9692#ifdef CONFIG_X86_64
9693	regs->r8 = kvm_r8_read(vcpu);
9694	regs->r9 = kvm_r9_read(vcpu);
9695	regs->r10 = kvm_r10_read(vcpu);
9696	regs->r11 = kvm_r11_read(vcpu);
9697	regs->r12 = kvm_r12_read(vcpu);
9698	regs->r13 = kvm_r13_read(vcpu);
9699	regs->r14 = kvm_r14_read(vcpu);
9700	regs->r15 = kvm_r15_read(vcpu);
9701#endif
9702
9703	regs->rip = kvm_rip_read(vcpu);
9704	regs->rflags = kvm_get_rflags(vcpu);
9705}
9706
9707int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
9708{
9709	vcpu_load(vcpu);
9710	__get_regs(vcpu, regs);
9711	vcpu_put(vcpu);
9712	return 0;
9713}
9714
9715static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
9716{
9717	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
9718	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9719
9720	kvm_rax_write(vcpu, regs->rax);
9721	kvm_rbx_write(vcpu, regs->rbx);
9722	kvm_rcx_write(vcpu, regs->rcx);
9723	kvm_rdx_write(vcpu, regs->rdx);
9724	kvm_rsi_write(vcpu, regs->rsi);
9725	kvm_rdi_write(vcpu, regs->rdi);
9726	kvm_rsp_write(vcpu, regs->rsp);
9727	kvm_rbp_write(vcpu, regs->rbp);
9728#ifdef CONFIG_X86_64
9729	kvm_r8_write(vcpu, regs->r8);
9730	kvm_r9_write(vcpu, regs->r9);
9731	kvm_r10_write(vcpu, regs->r10);
9732	kvm_r11_write(vcpu, regs->r11);
9733	kvm_r12_write(vcpu, regs->r12);
9734	kvm_r13_write(vcpu, regs->r13);
9735	kvm_r14_write(vcpu, regs->r14);
9736	kvm_r15_write(vcpu, regs->r15);
9737#endif
9738
9739	kvm_rip_write(vcpu, regs->rip);
9740	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
9741
9742	vcpu->arch.exception.pending = false;
9743
9744	kvm_make_request(KVM_REQ_EVENT, vcpu);
9745}
9746
9747int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
9748{
9749	vcpu_load(vcpu);
9750	__set_regs(vcpu, regs);
9751	vcpu_put(vcpu);
9752	return 0;
9753}
9754
9755void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
9756{
9757	struct kvm_segment cs;
9758
9759	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
9760	*db = cs.db;
9761	*l = cs.l;
9762}
9763EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
9764
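/*
 * Fill @sregs with the current segment registers, descriptor tables,
 * control registers, EFER, APIC base and any injected hardware
 * interrupt (reported via the interrupt bitmap).
 */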
9765static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
9766{
9767	struct desc_ptr dt;
9768
9769	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
9770	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
9771	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
9772	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
9773	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
9774	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
9775
9776	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
9777	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
9778
9779	kvm_x86_ops.get_idt(vcpu, &dt);
9780	sregs->idt.limit = dt.size;
9781	sregs->idt.base = dt.address;
9782	kvm_x86_ops.get_gdt(vcpu, &dt);
9783	sregs->gdt.limit = dt.size;
9784	sregs->gdt.base = dt.address;
9785
9786	sregs->cr0 = kvm_read_cr0(vcpu);
9787	sregs->cr2 = vcpu->arch.cr2;
9788	sregs->cr3 = kvm_read_cr3(vcpu);
9789	sregs->cr4 = kvm_read_cr4(vcpu);
9790	sregs->cr8 = kvm_get_cr8(vcpu);
9791	sregs->efer = vcpu->arch.efer;
9792	sregs->apic_base = kvm_get_apic_base(vcpu);
9793
9794	memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
9795
9796	if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
9797		set_bit(vcpu->arch.interrupt.nr,
9798			(unsigned long *)sregs->interrupt_bitmap);
9799}
9800
9801int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
9802				  struct kvm_sregs *sregs)
9803{
9804	vcpu_load(vcpu);
9805	__get_sregs(vcpu, sregs);
9806	vcpu_put(vcpu);
9807	return 0;
9808}
9809
9810int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
9811				    struct kvm_mp_state *mp_state)
9812{
9813	vcpu_load(vcpu);
9814	if (kvm_mpx_supported())
9815		kvm_load_guest_fpu(vcpu);
9816
9817	kvm_apic_accept_events(vcpu);
9818	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
9819					vcpu->arch.pv.pv_unhalted)
9820		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
9821	else
9822		mp_state->mp_state = vcpu->arch.mp_state;
9823
9824	if (kvm_mpx_supported())
9825		kvm_put_guest_fpu(vcpu);
9826	vcpu_put(vcpu);
9827	return 0;
9828}
9829
9830int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
9831				    struct kvm_mp_state *mp_state)
9832{
9833	int ret = -EINVAL;
9834
9835	vcpu_load(vcpu);
9836
9837	if (!lapic_in_kernel(vcpu) &&
9838	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
9839		goto out;
9840
9841	/*
9842	 * KVM_MP_STATE_INIT_RECEIVED means the processor is in
9843	 * INIT state; latched init should be reported using
9844	 * KVM_SET_VCPU_EVENTS, so reject it here.
9845	 */
9846	if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
9847	    (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
9848	     mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
9849		goto out;
9850
9851	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
9852		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
9853		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
9854	} else
9855		vcpu->arch.mp_state = mp_state->mp_state;
9856	kvm_make_request(KVM_REQ_EVENT, vcpu);
9857
9858	ret = 0;
9859out:
9860	vcpu_put(vcpu);
9861	return ret;
9862}
9863
9864int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
9865		    int reason, bool has_error_code, u32 error_code)
9866{
9867	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9868	int ret;
9869
9870	init_emulate_ctxt(vcpu);
9871
9872	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
9873				   has_error_code, error_code);
9874	if (ret) {
9875		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9876		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
9877		vcpu->run->internal.ndata = 0;
9878		return 0;
9879	}
9880
9881	kvm_rip_write(vcpu, ctxt->eip);
9882	kvm_set_rflags(vcpu, ctxt->eflags);
9883	return 1;
9884}
9885EXPORT_SYMBOL_GPL(kvm_task_switch);
9886
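/*
 * Sanity-check incoming special registers: long mode (EFER.LME with
 * CR0.PG) requires CR4.PAE and EFER.LMA and a CR3 without reserved
 * bits; outside long mode, EFER.LMA and a 64-bit code segment are
 * rejected.  CR4 itself is validated by kvm_valid_cr4().
 */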
9887static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
9888{
9889	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
9890		/*
9891		 * When EFER.LME and CR0.PG are set, the processor is in
9892		 * 64-bit mode (though maybe in a 32-bit code segment).
9893		 * CR4.PAE and EFER.LMA must be set.
9894		 */
9895		if (!(sregs->cr4 & X86_CR4_PAE)
9896		    || !(sregs->efer & EFER_LMA))
9897			return -EINVAL;
9898		if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
9899			return -EINVAL;
9900	} else {
9901		/*
9902		 * Not in 64-bit mode: EFER.LMA is clear and the code
9903		 * segment cannot be 64-bit.
9904		 */
9905		if (sregs->efer & EFER_LMA || sregs->cs.l)
9906			return -EINVAL;
9907	}
9908
9909	return kvm_valid_cr4(vcpu, sregs->cr4);
9910}
9911
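/*
 * Apply a full set of special registers.  CR0/CR3/CR4/EFER changes may
 * force an MMU reset, CR4 bits that affect OSXSAVE/PKE trigger a
 * runtime CPUID update, and PAE paging reloads the PDPTRs from the new
 * CR3.
 */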
9912static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
9913{
9914	struct msr_data apic_base_msr;
9915	int mmu_reset_needed = 0;
9916	int cpuid_update_needed = 0;
9917	int pending_vec, max_bits, idx;
9918	struct desc_ptr dt;
9919	int ret = -EINVAL;
9920
9921	if (kvm_valid_sregs(vcpu, sregs))
9922		goto out;
9923
9924	apic_base_msr.data = sregs->apic_base;
9925	apic_base_msr.host_initiated = true;
9926	if (kvm_set_apic_base(vcpu, &apic_base_msr))
9927		goto out;
9928
9929	dt.size = sregs->idt.limit;
9930	dt.address = sregs->idt.base;
9931	kvm_x86_ops.set_idt(vcpu, &dt);
9932	dt.size = sregs->gdt.limit;
9933	dt.address = sregs->gdt.base;
9934	kvm_x86_ops.set_gdt(vcpu, &dt);
9935
9936	vcpu->arch.cr2 = sregs->cr2;
9937	mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
9938	vcpu->arch.cr3 = sregs->cr3;
9939	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
9940
9941	kvm_set_cr8(vcpu, sregs->cr8);
9942
9943	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
9944	kvm_x86_ops.set_efer(vcpu, sregs->efer);
9945
9946	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
9947	kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
9948	vcpu->arch.cr0 = sregs->cr0;
9949
9950	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
9951	cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
9952				(X86_CR4_OSXSAVE | X86_CR4_PKE));
9953	kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
9954	if (cpuid_update_needed)
9955		kvm_update_cpuid_runtime(vcpu);
9956
9957	idx = srcu_read_lock(&vcpu->kvm->srcu);
9958	if (is_pae_paging(vcpu)) {
9959		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
9960		mmu_reset_needed = 1;
9961	}
9962	srcu_read_unlock(&vcpu->kvm->srcu, idx);
9963
9964	if (mmu_reset_needed)
9965		kvm_mmu_reset_context(vcpu);
9966
9967	max_bits = KVM_NR_INTERRUPTS;
9968	pending_vec = find_first_bit(
9969		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
9970	if (pending_vec < max_bits) {
9971		kvm_queue_interrupt(vcpu, pending_vec, false);
9972		pr_debug("Set back pending irq %d\n", pending_vec);
9973	}
9974
9975	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
9976	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
9977	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
9978	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
9979	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
9980	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
9981
9982	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
9983	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
9984
9985	update_cr8_intercept(vcpu);
9986
9987	/* Older userspace won't unhalt the vcpu on reset. */
9988	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
9989	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
9990	    !is_protmode(vcpu))
9991		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
9992
9993	kvm_make_request(KVM_REQ_EVENT, vcpu);
9994
9995	ret = 0;
9996out:
9997	return ret;
9998}
9999
10000int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
10001				  struct kvm_sregs *sregs)
10002{
10003	int ret;
10004
10005	vcpu_load(vcpu);
10006	ret = __set_sregs(vcpu, sregs);
10007	vcpu_put(vcpu);
10008	return ret;
10009}
10010
10011int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
10012					struct kvm_guest_debug *dbg)
10013{
10014	unsigned long rflags;
10015	int i, r;
10016
10017	vcpu_load(vcpu);
10018
10019	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
10020		r = -EBUSY;
10021		if (vcpu->arch.exception.pending)
10022			goto out;
10023		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
10024			kvm_queue_exception(vcpu, DB_VECTOR);
10025		else
10026			kvm_queue_exception(vcpu, BP_VECTOR);
10027	}
10028
10029	/*
10030	 * Read rflags as long as potentially injected trace flags are still
10031	 * filtered out.
10032	 */
10033	rflags = kvm_get_rflags(vcpu);
10034
10035	vcpu->guest_debug = dbg->control;
10036	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
10037		vcpu->guest_debug = 0;
10038
10039	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
10040		for (i = 0; i < KVM_NR_DB_REGS; ++i)
10041			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
10042		vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
10043	} else {
10044		for (i = 0; i < KVM_NR_DB_REGS; i++)
10045			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
10046	}
10047	kvm_update_dr7(vcpu);
10048
10049	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
10050		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
10051			get_segment_base(vcpu, VCPU_SREG_CS);
10052
10053	/*
10054	 * Trigger an rflags update that will inject or remove the trace
10055	 * flags.
10056	 */
10057	kvm_set_rflags(vcpu, rflags);
10058
10059	kvm_x86_ops.update_exception_bitmap(vcpu);
10060
10061	r = 0;
10062
10063out:
10064	vcpu_put(vcpu);
10065	return r;
10066}
10067
10068/*
10069 * Translate a guest virtual address to a guest physical address.
10070 */
10071int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
10072				    struct kvm_translation *tr)
10073{
10074	unsigned long vaddr = tr->linear_address;
10075	gpa_t gpa;
10076	int idx;
10077
10078	vcpu_load(vcpu);
10079
10080	idx = srcu_read_lock(&vcpu->kvm->srcu);
10081	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
10082	srcu_read_unlock(&vcpu->kvm->srcu, idx);
10083	tr->physical_address = gpa;
10084	tr->valid = gpa != UNMAPPED_GVA;
10085	tr->writeable = 1;
10086	tr->usermode = 0;
10087
10088	vcpu_put(vcpu);
10089	return 0;
10090}
10091
10092int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
10093{
10094	struct fxregs_state *fxsave;
10095
10096	vcpu_load(vcpu);
10097
10098	fxsave = &vcpu->arch.guest_fpu->state.fxsave;
10099	memcpy(fpu->fpr, fxsave->st_space, 128);
10100	fpu->fcw = fxsave->cwd;
10101	fpu->fsw = fxsave->swd;
10102	fpu->ftwx = fxsave->twd;
10103	fpu->last_opcode = fxsave->fop;
10104	fpu->last_ip = fxsave->rip;
10105	fpu->last_dp = fxsave->rdp;
10106	memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
10107
10108	vcpu_put(vcpu);
10109	return 0;
10110}
10111
10112int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
10113{
10114	struct fxregs_state *fxsave;
10115
10116	vcpu_load(vcpu);
10117
10118	fxsave = &vcpu->arch.guest_fpu->state.fxsave;
10119
10120	memcpy(fxsave->st_space, fpu->fpr, 128);
10121	fxsave->cwd = fpu->fcw;
10122	fxsave->swd = fpu->fsw;
10123	fxsave->twd = fpu->ftwx;
10124	fxsave->fop = fpu->last_opcode;
10125	fxsave->rip = fpu->last_ip;
10126	fxsave->rdp = fpu->last_dp;
10127	memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
10128
10129	vcpu_put(vcpu);
10130	return 0;
10131}
10132
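/*
 * store_regs() copies register, special-register and event state into
 * the shared kvm_run area for the fields requested via KVM_SYNC_X86_*;
 * sync_regs() pulls dirty fields back in before the next guest entry.
 */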
10133static void store_regs(struct kvm_vcpu *vcpu)
10134{
10135	BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
10136
10137	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
10138		__get_regs(vcpu, &vcpu->run->s.regs.regs);
10139
10140	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
10141		__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
10142
10143	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
10144		kvm_vcpu_ioctl_x86_get_vcpu_events(
10145				vcpu, &vcpu->run->s.regs.events);
10146}
10147
10148static int sync_regs(struct kvm_vcpu *vcpu)
10149{
10150	if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
10151		return -EINVAL;
10152
10153	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
10154		__set_regs(vcpu, &vcpu->run->s.regs.regs);
10155		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
10156	}
10157	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
10158		if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
10159			return -EINVAL;
10160		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
10161	}
10162	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
10163		if (kvm_vcpu_ioctl_x86_set_vcpu_events(
10164				vcpu, &vcpu->run->s.regs.events))
10165			return -EINVAL;
10166		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
10167	}
10168
10169	return 0;
10170}
10171
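/*
 * Initialize the guest FPU state: reset the xsave area (compacted
 * format when XSAVES is available), enable only x87 state in the guest
 * xcr0, and set CR0.ET as on real hardware.
 */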
10172static void fx_init(struct kvm_vcpu *vcpu)
10173{
10174	fpstate_init(&vcpu->arch.guest_fpu->state);
10175	if (boot_cpu_has(X86_FEATURE_XSAVES))
10176		vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
10177			host_xcr0 | XSTATE_COMPACTION_ENABLED;
10178
10179	/*
10180	 * Ensure guest xcr0 is valid for loading
10181	 */
10182	vcpu->arch.xcr0 = XFEATURE_MASK_FP;
10183
10184	vcpu->arch.cr0 |= X86_CR0_ET;
10185}
10186
10187int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
10188{
10189	if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
10190		pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
10191			     "guest TSC will not be reliable\n");
10192
10193	return 0;
10194}
10195
10196int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
10197{
10198	struct page *page;
10199	int r;
10200
10201	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
10202		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10203	else
10204		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
10205
10206	kvm_set_tsc_khz(vcpu, max_tsc_khz);
10207
10208	r = kvm_mmu_create(vcpu);
10209	if (r < 0)
10210		return r;
10211
10212	if (irqchip_in_kernel(vcpu->kvm)) {
10213		r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
10214		if (r < 0)
10215			goto fail_mmu_destroy;
10216		if (kvm_apicv_activated(vcpu->kvm))
10217			vcpu->arch.apicv_active = true;
10218	} else
10219		static_key_slow_inc(&kvm_no_apic_vcpu);
10220
10221	r = -ENOMEM;
10222
10223	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
10224	if (!page)
10225		goto fail_free_lapic;
10226	vcpu->arch.pio_data = page_address(page);
10227
10228	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
10229				       GFP_KERNEL_ACCOUNT);
10230	if (!vcpu->arch.mce_banks)
10231		goto fail_free_pio_data;
10232	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
10233
10234	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
10235				GFP_KERNEL_ACCOUNT))
10236		goto fail_free_mce_banks;
10237
10238	if (!alloc_emulate_ctxt(vcpu))
10239		goto free_wbinvd_dirty_mask;
10240
10241	vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
10242						GFP_KERNEL_ACCOUNT);
10243	if (!vcpu->arch.user_fpu) {
10244		pr_err("kvm: failed to allocate userspace's fpu\n");
10245		goto free_emulate_ctxt;
10246	}
10247
10248	vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
10249						 GFP_KERNEL_ACCOUNT);
10250	if (!vcpu->arch.guest_fpu) {
10251		pr_err("kvm: failed to allocate vcpu's fpu\n");
10252		goto free_user_fpu;
10253	}
10254	fx_init(vcpu);
10255
10256	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
10257	vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
10258
10259	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
10260
10261	kvm_async_pf_hash_reset(vcpu);
10262	kvm_pmu_init(vcpu);
10263
10264	vcpu->arch.pending_external_vector = -1;
10265	vcpu->arch.preempted_in_kernel = false;
10266
10267	kvm_hv_vcpu_init(vcpu);
10268
10269	r = kvm_x86_ops.vcpu_create(vcpu);
10270	if (r)
10271		goto free_guest_fpu;
10272
10273	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
10274	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
10275	kvm_vcpu_mtrr_init(vcpu);
10276	vcpu_load(vcpu);
10277	kvm_vcpu_reset(vcpu, false);
10278	kvm_init_mmu(vcpu, false);
10279	vcpu_put(vcpu);
10280	return 0;
10281
10282free_guest_fpu:
10283	kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10284free_user_fpu:
10285	kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10286free_emulate_ctxt:
10287	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10288free_wbinvd_dirty_mask:
10289	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10290fail_free_mce_banks:
10291	kfree(vcpu->arch.mce_banks);
10292fail_free_pio_data:
10293	free_page((unsigned long)vcpu->arch.pio_data);
10294fail_free_lapic:
10295	kvm_free_lapic(vcpu);
10296fail_mmu_destroy:
10297	kvm_mmu_destroy(vcpu);
10298	return r;
10299}
10300
10301void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
10302{
10303	struct kvm *kvm = vcpu->kvm;
10304
10305	kvm_hv_vcpu_postcreate(vcpu);
10306
10307	if (mutex_lock_killable(&vcpu->mutex))
10308		return;
10309	vcpu_load(vcpu);
10310	kvm_synchronize_tsc(vcpu, 0);
10311	vcpu_put(vcpu);
10312
10313	/* poll control enabled by default */
10314	vcpu->arch.msr_kvm_poll_control = 1;
10315
10316	mutex_unlock(&vcpu->mutex);
10317
10318	if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
10319		schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
10320						KVMCLOCK_SYNC_PERIOD);
10321}
10322
10323void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
10324{
10325	int idx;
10326
10327	kvmclock_reset(vcpu);
10328
10329	kvm_x86_ops.vcpu_free(vcpu);
10330
10331	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10332	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10333	kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10334	kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10335
10336	kvm_hv_vcpu_uninit(vcpu);
10337	kvm_pmu_destroy(vcpu);
10338	kfree(vcpu->arch.mce_banks);
10339	kvm_free_lapic(vcpu);
10340	idx = srcu_read_lock(&vcpu->kvm->srcu);
10341	kvm_mmu_destroy(vcpu);
10342	srcu_read_unlock(&vcpu->kvm->srcu, idx);
10343	free_page((unsigned long)vcpu->arch.pio_data);
10344	kvfree(vcpu->arch.cpuid_entries);
10345	if (!lapic_in_kernel(vcpu))
10346		static_key_slow_dec(&kvm_no_apic_vcpu);
10347}
10348
10349void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
10350{
10351	kvm_lapic_reset(vcpu, init_event);
10352
10353	vcpu->arch.hflags = 0;
10354
10355	vcpu->arch.smi_pending = 0;
10356	vcpu->arch.smi_count = 0;
10357	atomic_set(&vcpu->arch.nmi_queued, 0);
10358	vcpu->arch.nmi_pending = 0;
10359	vcpu->arch.nmi_injected = false;
10360	kvm_clear_interrupt_queue(vcpu);
10361	kvm_clear_exception_queue(vcpu);
10362
10363	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
10364	kvm_update_dr0123(vcpu);
10365	vcpu->arch.dr6 = DR6_INIT;
10366	vcpu->arch.dr7 = DR7_FIXED_1;
10367	kvm_update_dr7(vcpu);
10368
10369	vcpu->arch.cr2 = 0;
10370
10371	kvm_make_request(KVM_REQ_EVENT, vcpu);
10372	vcpu->arch.apf.msr_en_val = 0;
10373	vcpu->arch.apf.msr_int_val = 0;
10374	vcpu->arch.st.msr_val = 0;
10375
10376	kvmclock_reset(vcpu);
10377
10378	kvm_clear_async_pf_completion_queue(vcpu);
10379	kvm_async_pf_hash_reset(vcpu);
10380	vcpu->arch.apf.halted = false;
10381
10382	if (kvm_mpx_supported()) {
10383		void *mpx_state_buffer;
10384
10385		/*
10386		 * Avoid having the INIT path from kvm_apic_has_events() run with
10387		 * the FPU loaded, as that would not let userspace fix the state.
10388		 */
10389		if (init_event)
10390			kvm_put_guest_fpu(vcpu);
10391		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10392					XFEATURE_BNDREGS);
10393		if (mpx_state_buffer)
10394			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
10395		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10396					XFEATURE_BNDCSR);
10397		if (mpx_state_buffer)
10398			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
10399		if (init_event)
10400			kvm_load_guest_fpu(vcpu);
10401	}
10402
10403	if (!init_event) {
10404		kvm_pmu_reset(vcpu);
10405		vcpu->arch.smbase = 0x30000;
10406
10407		vcpu->arch.msr_misc_features_enables = 0;
10408
10409		vcpu->arch.xcr0 = XFEATURE_MASK_FP;
10410	}
10411
10412	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
10413	vcpu->arch.regs_avail = ~0;
10414	vcpu->arch.regs_dirty = ~0;
10415
10416	vcpu->arch.ia32_xss = 0;
10417
10418	kvm_x86_ops.vcpu_reset(vcpu, init_event);
10419}
10420
10421void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
10422{
10423	struct kvm_segment cs;
10424
10425	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
10426	cs.selector = vector << 8;
10427	cs.base = vector << 12;
10428	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
10429	kvm_rip_write(vcpu, 0);
10430}
10431
10432int kvm_arch_hardware_enable(void)
10433{
10434	struct kvm *kvm;
10435	struct kvm_vcpu *vcpu;
10436	int i;
10437	int ret;
10438	u64 local_tsc;
10439	u64 max_tsc = 0;
10440	bool stable, backwards_tsc = false;
10441
10442	kvm_user_return_msr_cpu_online();
10443	ret = kvm_x86_ops.hardware_enable();
10444	if (ret != 0)
10445		return ret;
10446
10447	local_tsc = rdtsc();
10448	stable = !kvm_check_tsc_unstable();
10449	list_for_each_entry(kvm, &vm_list, vm_list) {
10450		kvm_for_each_vcpu(i, vcpu, kvm) {
10451			if (!stable && vcpu->cpu == smp_processor_id())
10452				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
10453			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
10454				backwards_tsc = true;
10455				if (vcpu->arch.last_host_tsc > max_tsc)
10456					max_tsc = vcpu->arch.last_host_tsc;
10457			}
10458		}
10459	}
10460
10461	/*
10462	 * Sometimes, even reliable TSCs go backwards.  This happens on
10463	 * platforms that reset TSC during suspend or hibernate actions, but
10464	 * maintain synchronization.  We must compensate.  Fortunately, we can
10465	 * detect that condition here, which happens early in CPU bringup,
10466	 * before any KVM threads can be running.  Unfortunately, we can't
10467	 * bring the TSCs fully up to date with real time, as we aren't yet far
10468	 * enough into CPU bringup that we know how much real time has actually
10469	 * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
10470	 * variables that haven't been updated yet.
10471	 *
10472	 * So we simply find the maximum observed TSC above, then record the
10473	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
10474	 * the adjustment will be applied.  Note that we accumulate
10475	 * adjustments, in case multiple suspend cycles happen before some VCPU
10476	 * gets a chance to run again.  In the event that no KVM threads get a
10477	 * chance to run, we will miss the entire elapsed period, as we'll have
10478	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
10479	 * lose cycle time.  This isn't too big a deal, since the loss will be
10480	 * uniform across all VCPUs (not to mention the scenario is extremely
10481	 * unlikely). It is possible that a second hibernate recovery happens
10482	 * much faster than a first, causing the observed TSC here to be
10483	 * smaller; this would require additional padding adjustment, which is
10484	 * why we set last_host_tsc to the local tsc observed here.
10485	 *
10486	 * N.B. - this code below runs only on platforms with reliable TSC,
10487	 * as that is the only way backwards_tsc is set above.  Also note
10488	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
10489	 * have the same delta_cyc adjustment applied if backwards_tsc
10490	 * is detected.  Note further, this adjustment is only done once,
10491	 * as we reset last_host_tsc on all VCPUs to stop this from being
10492	 * called multiple times (one for each physical CPU bringup).
10493	 *
10494	 * Platforms with unreliable TSCs don't have to deal with this, they
10495	 * will be compensated by the logic in vcpu_load, which sets the TSC to
10496	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
10497	 * guarantee that they stay in perfect synchronization.
10498	 */
10499	if (backwards_tsc) {
10500		u64 delta_cyc = max_tsc - local_tsc;
10501		list_for_each_entry(kvm, &vm_list, vm_list) {
10502			kvm->arch.backwards_tsc_observed = true;
10503			kvm_for_each_vcpu(i, vcpu, kvm) {
10504				vcpu->arch.tsc_offset_adjustment += delta_cyc;
10505				vcpu->arch.last_host_tsc = local_tsc;
10506				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
10507			}
10508
10509			/*
10510			 * We have to disable TSC offset matching: if you were
10511			 * booting a VM while issuing an S4 host suspend,
10512			 * you may have a problem.  Solving this issue is
10513			 * left as an exercise to the reader.
10514			 */
10515			kvm->arch.last_tsc_nsec = 0;
10516			kvm->arch.last_tsc_write = 0;
10517		}
10518
10519	}
10520	return 0;
10521}
10522
10523void kvm_arch_hardware_disable(void)
10524{
10525	kvm_x86_ops.hardware_disable();
10526	drop_user_return_notifiers();
10527}
10528
10529int kvm_arch_hardware_setup(void *opaque)
10530{
10531	struct kvm_x86_init_ops *ops = opaque;
10532	int r;
10533
10534	rdmsrl_safe(MSR_EFER, &host_efer);
10535
10536	if (boot_cpu_has(X86_FEATURE_XSAVES))
10537		rdmsrl(MSR_IA32_XSS, host_xss);
10538
10539	r = ops->hardware_setup();
10540	if (r != 0)
10541		return r;
10542
10543	memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
10544
10545	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
10546		supported_xss = 0;
10547
10548#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
10549	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
10550#undef __kvm_cpu_cap_has
10551
10552	if (kvm_has_tsc_control) {
10553		/*
10554		 * Make sure the user can only configure tsc_khz values that
10555		 * fit into a signed integer.
10556		 * A min value is not calculated because it will always
10557		 * be 1 on all machines.
10558		 */
10559		u64 max = min(0x7fffffffULL,
10560			      __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
10561		kvm_max_guest_tsc_khz = max;
10562
10563		kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
10564	}
10565
10566	kvm_init_msr_list();
10567	return 0;
10568}
10569
10570void kvm_arch_hardware_unsetup(void)
10571{
10572	kvm_x86_ops.hardware_unsetup();
10573}
10574
10575int kvm_arch_check_processor_compat(void *opaque)
10576{
10577	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
10578	struct kvm_x86_init_ops *ops = opaque;
10579
10580	WARN_ON(!irqs_disabled());
10581
10582	if (__cr4_reserved_bits(cpu_has, c) !=
10583	    __cr4_reserved_bits(cpu_has, &boot_cpu_data))
10584		return -EIO;
10585
10586	return ops->check_processor_compatibility();
10587}
10588
10589bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
10590{
10591	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
10592}
10593EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
10594
10595bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
10596{
10597	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
10598}
10599
10600struct static_key kvm_no_apic_vcpu __read_mostly;
10601EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
10602
10603void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
10604{
10605	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
10606
10607	vcpu->arch.l1tf_flush_l1d = true;
10608	if (pmu->version && unlikely(pmu->event_count)) {
10609		pmu->need_cleanup = true;
10610		kvm_make_request(KVM_REQ_PMU, vcpu);
10611	}
10612	kvm_x86_ops.sched_in(vcpu, cpu);
10613}
10614
10615void kvm_arch_free_vm(struct kvm *kvm)
10616{
10617	kfree(kvm->arch.hyperv.hv_pa_pg);
10618	vfree(kvm);
10619}
10620
10621
10622int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
10623{
10624	int ret;
10625
10626	if (type)
10627		return -EINVAL;
10628
10629	ret = kvm_page_track_init(kvm);
10630	if (ret)
10631		return ret;
10632
10633	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
10634	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
10635	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
10636	INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
10637	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
10638	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
10639
10640	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
10641	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
10642	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
10643	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
10644		&kvm->arch.irq_sources_bitmap);
10645
10646	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
10647	mutex_init(&kvm->arch.apic_map_lock);
10648	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
10649
10650	kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
10651	pvclock_update_vm_gtod_copy(kvm);
10652
10653	kvm->arch.guest_can_read_msr_platform_info = true;
10654
10655	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
10656	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
10657
10658	kvm_hv_init_vm(kvm);
10659	kvm_mmu_init_vm(kvm);
10660
10661	return kvm_x86_ops.vm_init(kvm);
10662}
10663
10664int kvm_arch_post_init_vm(struct kvm *kvm)
10665{
10666	return kvm_mmu_post_init_vm(kvm);
10667}
10668
10669static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
10670{
10671	vcpu_load(vcpu);
10672	kvm_mmu_unload(vcpu);
10673	vcpu_put(vcpu);
10674}
10675
10676static void kvm_free_vcpus(struct kvm *kvm)
10677{
10678	unsigned int i;
10679	struct kvm_vcpu *vcpu;
10680
10681	/*
10682	 * Unpin any mmu pages first.
10683	 */
10684	kvm_for_each_vcpu(i, vcpu, kvm) {
10685		kvm_clear_async_pf_completion_queue(vcpu);
10686		kvm_unload_vcpu_mmu(vcpu);
10687	}
10688	kvm_for_each_vcpu(i, vcpu, kvm)
10689		kvm_vcpu_destroy(vcpu);
10690
10691	mutex_lock(&kvm->lock);
10692	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
10693		kvm->vcpus[i] = NULL;
10694
10695	atomic_set(&kvm->online_vcpus, 0);
10696	mutex_unlock(&kvm->lock);
10697}
10698
10699void kvm_arch_sync_events(struct kvm *kvm)
10700{
10701	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
10702	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
10703	kvm_free_pit(kvm);
10704}
10705
10706int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
10707{
10708	int i, r;
10709	unsigned long hva, old_npages;
10710	struct kvm_memslots *slots = kvm_memslots(kvm);
10711	struct kvm_memory_slot *slot;
10712
10713	/* Called with kvm->slots_lock held.  */
10714	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
10715		return -EINVAL;
10716
10717	slot = id_to_memslot(slots, id);
10718	if (size) {
10719		if (slot && slot->npages)
10720			return -EEXIST;
10721
10722		/*
10723		 * MAP_SHARED to prevent internal slot pages from being moved
10724		 * by fork()/COW.
10725		 */
10726		hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
10727			      MAP_SHARED | MAP_ANONYMOUS, 0);
10728		if (IS_ERR((void *)hva))
10729			return PTR_ERR((void *)hva);
10730	} else {
10731		if (!slot || !slot->npages)
10732			return 0;
10733
10734		old_npages = slot->npages;
10735		hva = 0;
10736	}
10737
10738	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
10739		struct kvm_userspace_memory_region m;
10740
10741		m.slot = id | (i << 16);
10742		m.flags = 0;
10743		m.guest_phys_addr = gpa;
10744		m.userspace_addr = hva;
10745		m.memory_size = size;
10746		r = __kvm_set_memory_region(kvm, &m);
10747		if (r < 0)
10748			return r;
10749	}
10750
10751	if (!size)
10752		vm_munmap(hva, old_npages * PAGE_SIZE);
10753
10754	return 0;
10755}
10756EXPORT_SYMBOL_GPL(__x86_set_memory_region);
10757
10758void kvm_arch_pre_destroy_vm(struct kvm *kvm)
10759{
10760	kvm_mmu_pre_destroy_vm(kvm);
10761}
10762
10763void kvm_arch_destroy_vm(struct kvm *kvm)
10764{
10765	if (current->mm == kvm->mm) {
10766		/*
10767		 * Free memory regions allocated on behalf of userspace,
10768		 * unless the memory map has changed due to process exit
10769		 * or fd copying.
10770		 */
10771		mutex_lock(&kvm->slots_lock);
10772		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
10773					0, 0);
10774		__x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
10775					0, 0);
10776		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10777		mutex_unlock(&kvm->slots_lock);
10778	}
10779	if (kvm_x86_ops.vm_destroy)
10780		kvm_x86_ops.vm_destroy(kvm);
10781	kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
10782	kvm_pic_destroy(kvm);
10783	kvm_ioapic_destroy(kvm);
10784	kvm_free_vcpus(kvm);
10785	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
10786	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
10787	kvm_mmu_uninit_vm(kvm);
10788	kvm_page_track_cleanup(kvm);
10789	kvm_hv_destroy_vm(kvm);
10790}
10791
10792void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
10793{
10794	int i;
10795
10796	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
10797		kvfree(slot->arch.rmap[i]);
10798		slot->arch.rmap[i] = NULL;
10799
10800		if (i == 0)
10801			continue;
10802
10803		kvfree(slot->arch.lpage_info[i - 1]);
10804		slot->arch.lpage_info[i - 1] = NULL;
10805	}
10806
10807	kvm_page_track_free_memslot(slot);
10808}
10809
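/*
 * Allocate the per-memslot metadata: one rmap array per page size plus
 * the lpage_info arrays that mark where large pages must be disallowed
 * (unaligned slot edges, or gfn/hva misalignment within the slot).
 */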
10810static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
10811				      unsigned long npages)
10812{
10813	int i;
10814
10815	/*
10816	 * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
10817	 * old arrays will be freed by __kvm_set_memory_region() if installing
10818	 * the new memslot is successful.
10819	 */
10820	memset(&slot->arch, 0, sizeof(slot->arch));
10821
10822	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
10823		struct kvm_lpage_info *linfo;
10824		unsigned long ugfn;
10825		int lpages;
10826		int level = i + 1;
10827
10828		lpages = gfn_to_index(slot->base_gfn + npages - 1,
10829				      slot->base_gfn, level) + 1;
10830
10831		slot->arch.rmap[i] =
10832			__vcalloc(lpages, sizeof(*slot->arch.rmap[i]),
10833				 GFP_KERNEL_ACCOUNT);
10834		if (!slot->arch.rmap[i])
10835			goto out_free;
10836		if (i == 0)
10837			continue;
10838
10839		linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
10840		if (!linfo)
10841			goto out_free;
10842
10843		slot->arch.lpage_info[i - 1] = linfo;
10844
10845		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
10846			linfo[0].disallow_lpage = 1;
10847		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
10848			linfo[lpages - 1].disallow_lpage = 1;
10849		ugfn = slot->userspace_addr >> PAGE_SHIFT;
10850		/*
10851		 * If the gfn and userspace address are not aligned wrt each
10852		 * other, disable large page support for this slot.
10853		 */
10854		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
10855			unsigned long j;
10856
10857			for (j = 0; j < lpages; ++j)
10858				linfo[j].disallow_lpage = 1;
10859		}
10860	}
10861
10862	if (kvm_page_track_create_memslot(slot, npages))
10863		goto out_free;
10864
10865	return 0;
10866
10867out_free:
10868	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
10869		kvfree(slot->arch.rmap[i]);
10870		slot->arch.rmap[i] = NULL;
10871		if (i == 0)
10872			continue;
10873
10874		kvfree(slot->arch.lpage_info[i - 1]);
10875		slot->arch.lpage_info[i - 1] = NULL;
10876	}
10877	return -ENOMEM;
10878}
10879
10880void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
10881{
10882	struct kvm_vcpu *vcpu;
10883	int i;
10884
10885	/*
10886	 * memslots->generation has been incremented.
10887	 * mmio generation may have reached its maximum value.
10888	 */
10889	kvm_mmu_invalidate_mmio_sptes(kvm, gen);
10890
10891	/* Force re-initialization of steal_time cache */
10892	kvm_for_each_vcpu(i, vcpu, kvm)
10893		kvm_vcpu_kick(vcpu);
10894}
10895
10896int kvm_arch_prepare_memory_region(struct kvm *kvm,
10897				struct kvm_memory_slot *memslot,
10898				const struct kvm_userspace_memory_region *mem,
10899				enum kvm_mr_change change)
10900{
10901	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
10902		return kvm_alloc_memslot_metadata(memslot,
10903						  mem->memory_size >> PAGE_SHIFT);
10904	return 0;
10905}
10906
10907static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
10908				     struct kvm_memory_slot *old,
10909				     struct kvm_memory_slot *new,
10910				     enum kvm_mr_change change)
10911{
10912	/*
10913	 * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
10914	 * See comments below.
10915	 */
10916	if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
10917		return;
10918
10919	/*
10920	 * Dirty logging tracks sptes in 4k granularity, meaning that large
10921	 * sptes have to be split.  If live migration is successful, the guest
10922	 * in the source machine will be destroyed and large sptes will be
10923	 * created in the destination. However, if the guest continues to run
10924	 * in the source machine (for example if live migration fails), small
10925	 * sptes will remain around and cause bad performance.
10926	 *
10927	 * Scan sptes if dirty logging has been stopped, dropping those
10928	 * which can be collapsed into a single large-page spte.  Later
10929	 * page faults will create the large-page sptes.
10930	 *
10931	 * There is no need to do this in any of the following cases:
10932	 * CREATE:      No dirty mappings will already exist.
10933	 * MOVE/DELETE: The old mappings will already have been cleaned up by
10934	 *		kvm_arch_flush_shadow_memslot()
10935	 */
10936	if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
10937	    !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
10938		kvm_mmu_zap_collapsible_sptes(kvm, new);
10939
10940	/*
10941	 * Enable or disable dirty logging for the slot.
10942	 *
10943	 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
10944	 * slot have been zapped so no dirty logging updates are needed for
10945	 * the old slot.
10946	 * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
10947	 * any mappings that might be created in it will consume the
10948	 * properties of the new slot and do not need to be updated here.
10949	 *
10950	 * When PML is enabled, the kvm_x86_ops dirty logging hooks are
10951	 * called to enable/disable dirty logging.
10952	 *
10953	 * When disabling dirty logging with PML enabled, the D-bit is set
10954	 * for sptes in the slot in order to prevent unnecessary GPA
10955	 * logging in the PML buffer (and potential PML buffer full VMEXIT).
10956	 * This guarantees leaving PML enabled for the guest's lifetime
10957	 * won't have any additional overhead from PML when the guest is
10958	 * running with dirty logging disabled.
10959	 *
10960	 * When enabling dirty logging, large sptes are write-protected
10961	 * so they can be split on first write.  New large sptes cannot
10962	 * be created for this slot until the end of the logging.
10963	 * See the comments in fast_page_fault().
10964	 * For small sptes, nothing is done if the dirty log is in the
10965	 * initial-all-set state.  Otherwise, depending on whether pml
10966	 * is enabled the D-bit or the W-bit will be cleared.
10967	 */
10968	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
10969		if (kvm_x86_ops.slot_enable_log_dirty) {
10970			kvm_x86_ops.slot_enable_log_dirty(kvm, new);
10971		} else {
10972			int level =
10973				kvm_dirty_log_manual_protect_and_init_set(kvm) ?
10974				PG_LEVEL_2M : PG_LEVEL_4K;
10975
10976			/*
10977			 * If we're using initial-all-set, we don't need
10978			 * to write protect any small page because
10979			 * they're reported as dirty already.  However
10980			 * we still need to write-protect huge pages
10981			 * so that the page split can happen lazily on
10982			 * the first write to the huge page.
10983			 */
10984			kvm_mmu_slot_remove_write_access(kvm, new, level);
10985		}
10986	} else {
10987		if (kvm_x86_ops.slot_disable_log_dirty)
10988			kvm_x86_ops.slot_disable_log_dirty(kvm, new);
10989	}
10990}
10991
10992void kvm_arch_commit_memory_region(struct kvm *kvm,
10993				const struct kvm_userspace_memory_region *mem,
10994				struct kvm_memory_slot *old,
10995				const struct kvm_memory_slot *new,
10996				enum kvm_mr_change change)
10997{
10998	if (!kvm->arch.n_requested_mmu_pages)
10999		kvm_mmu_change_mmu_pages(kvm,
11000				kvm_mmu_calculate_default_mmu_pages(kvm));
11001
11002	/*
11003	 * FIXME: const-ify all uses of struct kvm_memory_slot.
11004	 */
11005	kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
11006
11007	/* Free the arrays associated with the old memslot. */
11008	if (change == KVM_MR_MOVE)
11009		kvm_arch_free_memslot(kvm, old);
11010}
11011
11012void kvm_arch_flush_shadow_all(struct kvm *kvm)
11013{
11014	kvm_mmu_zap_all(kvm);
11015}
11016
11017void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
11018				   struct kvm_memory_slot *slot)
11019{
11020	kvm_page_track_flush_slot(kvm, slot);
11021}
11022
11023static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
11024{
11025	return (is_guest_mode(vcpu) &&
11026			kvm_x86_ops.guest_apic_has_interrupt &&
11027			kvm_x86_ops.guest_apic_has_interrupt(vcpu));
11028}
11029
11030static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
11031{
11032	if (!list_empty_careful(&vcpu->async_pf.done))
11033		return true;
11034
11035	if (kvm_apic_has_events(vcpu))
11036		return true;
11037
11038	if (vcpu->arch.pv.pv_unhalted)
11039		return true;
11040
11041	if (vcpu->arch.exception.pending)
11042		return true;
11043
11044	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
11045	    (vcpu->arch.nmi_pending &&
11046	     kvm_x86_ops.nmi_allowed(vcpu, false)))
11047		return true;
11048
11049	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
11050	    (vcpu->arch.smi_pending &&
11051	     kvm_x86_ops.smi_allowed(vcpu, false)))
11052		return true;
11053
11054	if (kvm_arch_interrupt_allowed(vcpu) &&
11055	    (kvm_cpu_has_interrupt(vcpu) ||
11056	    kvm_guest_apic_has_interrupt(vcpu)))
11057		return true;
11058
11059	if (kvm_hv_has_stimer_pending(vcpu))
11060		return true;
11061
11062	if (is_guest_mode(vcpu) &&
11063	    kvm_x86_ops.nested_ops->hv_timer_pending &&
11064	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
11065		return true;
11066
11067	return false;
11068}
11069
11070int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
11071{
11072	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
11073}
11074
11075bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
11076{
11077	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
11078		return true;
11079
11080	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
11081		kvm_test_request(KVM_REQ_SMI, vcpu) ||
11082		 kvm_test_request(KVM_REQ_EVENT, vcpu))
11083		return true;
11084
11085	if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
11086		return true;
11087
11088	return false;
11089}
11090
11091bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
11092{
11093	return vcpu->arch.preempted_in_kernel;
11094}
11095
11096int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
11097{
11098	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
11099}
11100
11101int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
11102{
11103	return kvm_x86_ops.interrupt_allowed(vcpu, false);
11104}
11105
11106unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
11107{
11108	if (is_64_bit_mode(vcpu))
11109		return kvm_rip_read(vcpu);
11110	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
11111		     kvm_rip_read(vcpu));
11112}
11113EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
11114
11115bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
11116{
11117	return kvm_get_linear_rip(vcpu) == linear_rip;
11118}
11119EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
11120
11121unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
11122{
11123	unsigned long rflags;
11124
11125	rflags = kvm_x86_ops.get_rflags(vcpu);
11126	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
11127		rflags &= ~X86_EFLAGS_TF;
11128	return rflags;
11129}
11130EXPORT_SYMBOL_GPL(kvm_get_rflags);
11131
11132static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
11133{
11134	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
11135	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
11136		rflags |= X86_EFLAGS_TF;
11137	kvm_x86_ops.set_rflags(vcpu, rflags);
11138}
11139
11140void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
11141{
11142	__kvm_set_rflags(vcpu, rflags);
11143	kvm_make_request(KVM_REQ_EVENT, vcpu);
11144}
11145EXPORT_SYMBOL_GPL(kvm_set_rflags);
11146
11147void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
11148{
11149	int r;
11150
11151	if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
11152	      work->wakeup_all)
11153		return;
11154
11155	r = kvm_mmu_reload(vcpu);
11156	if (unlikely(r))
11157		return;
11158
11159	if (!vcpu->arch.mmu->direct_map &&
11160	      work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
11161		return;
11162
11163	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
11164}
11165
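/*
 * Outstanding async page fault GFNs are tracked in a small open-addressed
 * hash table (vcpu->arch.apf.gfns) using linear probing; ~0 marks an empty
 * slot and the table size (ASYNC_PF_PER_VCPU) must be a power of two.
 */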
11166static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
11167{
11168	BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
11169
11170	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
11171}
11172
11173static inline u32 kvm_async_pf_next_probe(u32 key)
11174{
11175	return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
11176}
11177
11178static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
11179{
11180	u32 key = kvm_async_pf_hash_fn(gfn);
11181
11182	while (vcpu->arch.apf.gfns[key] != ~0)
11183		key = kvm_async_pf_next_probe(key);
11184
11185	vcpu->arch.apf.gfns[key] = gfn;
11186}
11187
11188static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
11189{
11190	int i;
11191	u32 key = kvm_async_pf_hash_fn(gfn);
11192
11193	for (i = 0; i < ASYNC_PF_PER_VCPU &&
11194		     (vcpu->arch.apf.gfns[key] != gfn &&
11195		      vcpu->arch.apf.gfns[key] != ~0); i++)
11196		key = kvm_async_pf_next_probe(key);
11197
11198	return key;
11199}
11200
11201bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
11202{
11203	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
11204}
11205
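/*
 * Remove a GFN from the open-addressed table without using tombstones:
 * after emptying slot i, scan forward through the probe chain and move
 * back the first entry whose home bucket does not lie cyclically in
 * (i, j], then continue from the newly freed slot (the standard deletion
 * algorithm for linear probing).
 */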
11206static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
11207{
11208	u32 i, j, k;
11209
11210	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
11211
11212	if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
11213		return;
11214
11215	while (true) {
11216		vcpu->arch.apf.gfns[i] = ~0;
11217		do {
11218			j = kvm_async_pf_next_probe(j);
11219			if (vcpu->arch.apf.gfns[j] == ~0)
11220				return;
11221			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
11222			/*
11223			 * k lies cyclically in ]i,j]
11224			 * |    i.k.j |
11225			 * |....j i.k.| or  |.k..j i...|
11226			 */
11227		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
11228		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
11229		i = j;
11230	}
11231}
11232
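/*
 * The helpers below access the shared struct kvm_vcpu_pv_apf_data through
 * the vcpu->arch.apf.data gfn_to_hva cache: 'notpresent' writes the
 * page-not-present reason at the start of the structure, 'ready' writes
 * the token field, and apf_pageready_slot_free() reports whether the token
 * field is currently zero, i.e. the previous 'page ready' notification has
 * been consumed by the guest.
 */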
11233static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
11234{
11235	u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
11236
11237	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
11238				      sizeof(reason));
11239}
11240
11241static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
11242{
11243	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
11244
11245	return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11246					     &token, offset, sizeof(token));
11247}
11248
11249static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
11250{
11251	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
11252	u32 val;
11253
11254	if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11255					 &val, offset, sizeof(val)))
11256		return false;
11257
11258	return !val;
11259}
11260
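/*
 * A 'page not present' async #PF can be injected only if the guest has
 * enabled the mechanism, is not running a nested guest (unless it asked
 * for delivery as a nested vmexit), and is not in CPL 0 when
 * send_user_only is set.
 */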
11261static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
11262{
11263	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
11264		return false;
11265
11266	if (!kvm_pv_async_pf_enabled(vcpu) ||
11267	    (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
11268		return false;
11269
11270	return true;
11271}
11272
11273bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
11274{
11275	if (unlikely(!lapic_in_kernel(vcpu) ||
11276		     kvm_event_needs_reinjection(vcpu) ||
11277		     vcpu->arch.exception.pending))
11278		return false;
11279
11280	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
11281		return false;
11282
11283	/*
11284	 * If interrupts are off we cannot even use an artificial
11285	 * halt state.
11286	 */
11287	return kvm_arch_interrupt_allowed(vcpu);
11288}
11289
11290bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
11291				     struct kvm_async_pf *work)
11292{
11293	struct x86_exception fault;
11294
11295	trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
11296	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
11297
11298	if (kvm_can_deliver_async_pf(vcpu) &&
11299	    !apf_put_user_notpresent(vcpu)) {
11300		fault.vector = PF_VECTOR;
11301		fault.error_code_valid = true;
11302		fault.error_code = 0;
11303		fault.nested_page_fault = false;
11304		fault.address = work->arch.token;
11305		fault.async_page_fault = true;
11306		kvm_inject_page_fault(vcpu, &fault);
11307		return true;
11308	} else {
11309		/*
11310		 * It is not possible to deliver a paravirtualized asynchronous
11311		 * page fault, but putting the guest in an artificial halt state
11312		 * can be beneficial nevertheless: if an interrupt arrives, we
11313		 * can deliver it promptly and perhaps the guest will schedule
11314		 * another process.  When the instruction that triggered a page
11315		 * fault is retried, hopefully the page will be ready in the host.
11316		 */
11317		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
11318		return false;
11319	}
11320}
11321
11322void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
11323				 struct kvm_async_pf *work)
11324{
11325	struct kvm_lapic_irq irq = {
11326		.delivery_mode = APIC_DM_FIXED,
11327		.vector = vcpu->arch.apf.vec
11328	};
11329
11330	if (work->wakeup_all)
11331		work->arch.token = ~0; /* broadcast wakeup */
11332	else
11333		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
11334	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
11335
11336	if ((work->wakeup_all || work->notpresent_injected) &&
11337	    kvm_pv_async_pf_enabled(vcpu) &&
11338	    !apf_put_user_ready(vcpu, work->arch.token)) {
11339		vcpu->arch.apf.pageready_pending = true;
11340		kvm_apic_set_irq(vcpu, &irq, NULL);
11341	}
11342
11343	vcpu->arch.apf.halted = false;
11344	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
11345}
11346
11347void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
11348{
11349	kvm_make_request(KVM_REQ_APF_READY, vcpu);
11350	if (!vcpu->arch.apf.pageready_pending)
11351		kvm_vcpu_kick(vcpu);
11352}
11353
11354bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
11355{
11356	if (!kvm_pv_async_pf_enabled(vcpu))
11357		return true;
11358	else
11359		return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
11360}
11361
11362void kvm_arch_start_assignment(struct kvm *kvm)
11363{
11364	atomic_inc(&kvm->arch.assigned_device_count);
11365}
11366EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
11367
11368void kvm_arch_end_assignment(struct kvm *kvm)
11369{
11370	atomic_dec(&kvm->arch.assigned_device_count);
11371}
11372EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
11373
11374bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
11375{
11376	return arch_atomic_read(&kvm->arch.assigned_device_count);
11377}
11378EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
11379
11380void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
11381{
11382	atomic_inc(&kvm->arch.noncoherent_dma_count);
11383}
11384EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
11385
11386void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
11387{
11388	atomic_dec(&kvm->arch.noncoherent_dma_count);
11389}
11390EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
11391
11392bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
11393{
11394	return atomic_read(&kvm->arch.noncoherent_dma_count);
11395}
11396EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
11397
11398bool kvm_arch_has_irq_bypass(void)
11399{
11400	return true;
11401}
11402
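/*
 * IRQ bypass: attach a producer to the KVM irqfd consumer by programming a
 * posted-interrupt IRTE for the producer's host IRQ.  The assigned-device
 * count is held for the lifetime of the connection and dropped again if
 * the IRTE update fails.
 */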
11403int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
11404				      struct irq_bypass_producer *prod)
11405{
11406	struct kvm_kernel_irqfd *irqfd =
11407		container_of(cons, struct kvm_kernel_irqfd, consumer);
11408	int ret;
11409
11410	irqfd->producer = prod;
11411	kvm_arch_start_assignment(irqfd->kvm);
11412	ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
11413					 prod->irq, irqfd->gsi, 1);
11414
11415	if (ret)
11416		kvm_arch_end_assignment(irqfd->kvm);
11417
11418	return ret;
11419}
11420
11421void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
11422				      struct irq_bypass_producer *prod)
11423{
11424	int ret;
11425	struct kvm_kernel_irqfd *irqfd =
11426		container_of(cons, struct kvm_kernel_irqfd, consumer);
11427
11428	WARN_ON(irqfd->producer != prod);
11429	irqfd->producer = NULL;
11430
11431	/*
11432	 * When the producer of a consumer is unregistered, we change back to
11433	 * remapped mode, so we can re-use the current implementation
11434	 * when the irq is masked/disabled or the consumer side (KVM
11435	 * in this case) doesn't want to receive the interrupts.
11436	 */
11437	ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
11438	if (ret)
11439		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
11440		       " fails: %d\n", irqfd->consumer.token, ret);
11441
11442	kvm_arch_end_assignment(irqfd->kvm);
11443}
11444
11445int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
11446				   uint32_t guest_irq, bool set)
11447{
11448	return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
11449}
11450
11451bool kvm_vector_hashing_enabled(void)
11452{
11453	return vector_hashing;
11454}
11455
11456bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
11457{
11458	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
11459}
11460EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
11461
11462
11463int kvm_spec_ctrl_test_value(u64 value)
11464{
11465	/*
11466	 * Test whether setting IA32_SPEC_CTRL to the given value
11467	 * is allowed by the host processor.
11468	 */
11469
11470	u64 saved_value;
11471	unsigned long flags;
11472	int ret = 0;
11473
11474	local_irq_save(flags);
11475
11476	if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
11477		ret = 1;
11478	else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
11479		ret = 1;
11480	else
11481		wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
11482
11483	local_irq_restore(flags);
11484
11485	return ret;
11486}
11487EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
11488
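/*
 * Inject a #PF for a failed access to @gva: if the original fault was a
 * non-present fault, or the guest page tables still map @gva, inject
 * @error_code unmodified; otherwise inject the fault generated by the
 * page-table walk.
 */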
11489void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
11490{
11491	struct x86_exception fault;
11492	u32 access = error_code &
11493		(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
11494
11495	if (!(error_code & PFERR_PRESENT_MASK) ||
11496	    vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
11497		/*
11498		 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
11499		 * tables probably do not match the TLB.  Just proceed
11500		 * with the error code that the processor gave.
11501		 */
11502		fault.vector = PF_VECTOR;
11503		fault.error_code_valid = true;
11504		fault.error_code = error_code;
11505		fault.nested_page_fault = false;
11506		fault.address = gva;
11507	}
11508	vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
11509}
11510EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
11511
11512/*
11513 * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
11514 * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
11515 * indicates whether exit to userspace is needed.
11516 */
11517int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
11518			      struct x86_exception *e)
11519{
11520	if (r == X86EMUL_PROPAGATE_FAULT) {
11521		kvm_inject_emulated_page_fault(vcpu, e);
11522		return 1;
11523	}
11524
11525	/*
11526	 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
11527	 * while handling a VMX instruction KVM could've handled the request
11528	 * correctly by exiting to userspace and performing I/O but there
11529	 * doesn't seem to be a real use-case behind such requests, just return
11530	 * KVM_EXIT_INTERNAL_ERROR for now.
11531	 */
11532	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11533	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11534	vcpu->run->internal.ndata = 0;
11535
11536	return 0;
11537}
11538EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
11539
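/*
 * Common INVPCID emulation for VMX and SVM: read the 16-byte descriptor
 * (PCID, linear address) from guest memory, validate it against CR4.PCIDE,
 * and then either invalidate a single address, flush the roots that use a
 * single PCID, or request a full MMU reload for the two global types.
 */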
11540int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
11541{
11542	bool pcid_enabled;
11543	struct x86_exception e;
11544	unsigned i;
11545	unsigned long roots_to_free = 0;
11546	struct {
11547		u64 pcid;
11548		u64 gla;
11549	} operand;
11550	int r;
11551
11552	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
11553	if (r != X86EMUL_CONTINUE)
11554		return kvm_handle_memory_failure(vcpu, r, &e);
11555
11556	if (operand.pcid >> 12 != 0) {
11557		kvm_inject_gp(vcpu, 0);
11558		return 1;
11559	}
11560
11561	pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
11562
11563	switch (type) {
11564	case INVPCID_TYPE_INDIV_ADDR:
11565		if ((!pcid_enabled && (operand.pcid != 0)) ||
11566		    is_noncanonical_address(operand.gla, vcpu)) {
11567			kvm_inject_gp(vcpu, 0);
11568			return 1;
11569		}
11570		kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
11571		return kvm_skip_emulated_instruction(vcpu);
11572
11573	case INVPCID_TYPE_SINGLE_CTXT:
11574		if (!pcid_enabled && (operand.pcid != 0)) {
11575			kvm_inject_gp(vcpu, 0);
11576			return 1;
11577		}
11578
11579		if (kvm_get_active_pcid(vcpu) == operand.pcid) {
11580			kvm_mmu_sync_roots(vcpu);
11581			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
11582		}
11583
11584		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
11585			if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
11586			    == operand.pcid)
11587				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
11588
11589		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
11590		/*
11591		 * If neither the current cr3 nor any of the prev_roots use the
11592		 * given PCID, then nothing needs to be done here because a
11593		 * resync will happen anyway before switching to any other CR3.
11594		 */
11595
11596		return kvm_skip_emulated_instruction(vcpu);
11597
11598	case INVPCID_TYPE_ALL_NON_GLOBAL:
11599		/*
11600		 * Currently, KVM doesn't mark global entries in the shadow
11601		 * page tables, so a non-global flush just degenerates to a
11602		 * global flush. If needed, we could optimize this later by
11603		 * keeping track of global entries in shadow page tables.
11604		 */
11605
11606		fallthrough;
11607	case INVPCID_TYPE_ALL_INCL_GLOBAL:
11608		kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
11609		return kvm_skip_emulated_instruction(vcpu);
11610
11611	default:
11612		BUG(); /* The caller has already checked that type <= 3 */
11613	}
11614}
11615EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
11616
11617EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
11618EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
11619EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
11620EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
11621EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
11622EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
11623EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
11624EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
11625EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
11626EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
11627EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
11628EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
11629EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
11630EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
11631EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
11632EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
11633EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
11634EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
11635EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
11636EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
11637EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
11638EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
11639
11640static int __init kvm_x86_init(void)
11641{
11642	kvm_mmu_x86_module_init();
11643	return 0;
11644}
11645module_init(kvm_x86_init);
11646
11647static void __exit kvm_x86_exit(void)
11648{
11649	/*
11650	 * If module_init() is implemented, module_exit() must also be
11651	 * implemented to allow module unload.
11652	 */
11653}
11654module_exit(kvm_x86_exit);
11655